diff options
author | Johannes Schauer Marin Rodrigues <josch@debian.org> | 2021-12-07 16:28:44 +0100 |
---|---|---|
committer | Johannes Schauer Marin Rodrigues <josch@debian.org> | 2021-12-07 16:28:44 +0100 |
commit | 32d4b6a638456caf50a5f99f2a0b57d60d418c5f (patch) | |
tree | 3ef4ebddb9d6bb6bbb46ac10663b1a714063901e /searx/search | |
parent | 7a1db4de351875bebb4a8e7ffbe6710ad5b518c5 (diff) |
New upstream version 1.0.0+dfsg1
Diffstat (limited to 'searx/search')
-rw-r--r-- | searx/search/__init__.py | 213 | ||||
-rw-r--r-- | searx/search/checker/__init__.py | 4 | ||||
-rw-r--r-- | searx/search/checker/__main__.py | 95 | ||||
-rw-r--r-- | searx/search/checker/background.py | 125 | ||||
-rw-r--r-- | searx/search/checker/impl.py | 416 | ||||
-rw-r--r-- | searx/search/models.py | 71 | ||||
-rw-r--r-- | searx/search/processors/__init__.py | 41 | ||||
-rw-r--r-- | searx/search/processors/abstract.py | 52 | ||||
-rw-r--r-- | searx/search/processors/offline.py | 51 | ||||
-rw-r--r-- | searx/search/processors/online.py | 265 | ||||
-rw-r--r-- | searx/search/processors/online_currency.py | 69 | ||||
-rw-r--r-- | searx/search/processors/online_dictionary.py | 55 |
12 files changed, 1457 insertions, 0 deletions
diff --git a/searx/search/__init__.py b/searx/search/__init__.py new file mode 100644 index 0000000..f777e85 --- /dev/null +++ b/searx/search/__init__.py @@ -0,0 +1,213 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + +import typing +import gc +import threading +from time import time +from uuid import uuid4 +from _thread import start_new_thread + +from searx import settings +from searx.answerers import ask +from searx.external_bang import get_bang_url +from searx.results import ResultContainer +from searx import logger +from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery +from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker + + +logger = logger.getChild('search') + +max_request_timeout = settings.get('outgoing', {}).get('max_request_timeout' or None) +if max_request_timeout is None: + logger.info('max_request_timeout={0}'.format(max_request_timeout)) +else: + if isinstance(max_request_timeout, float): + logger.info('max_request_timeout={0} second(s)'.format(max_request_timeout)) + else: + logger.critical('outgoing.max_request_timeout if defined has to be float') + import sys + sys.exit(1) + + +def initialize(settings_engines=None, enable_checker=False): + settings_engines = settings_engines or 
settings['engines'] + initialize_processors(settings_engines) + if enable_checker: + initialize_checker() + + +class Search: + """Search information container""" + + __slots__ = "search_query", "result_container", "start_time", "actual_timeout" + + def __init__(self, search_query): + # init vars + super().__init__() + self.search_query = search_query + self.result_container = ResultContainer() + self.start_time = None + self.actual_timeout = None + + def search_external_bang(self): + """ + Check if there is a external bang. + If yes, update self.result_container and return True + """ + if self.search_query.external_bang: + self.result_container.redirect_url = get_bang_url(self.search_query) + + # This means there was a valid bang and the + # rest of the search does not need to be continued + if isinstance(self.result_container.redirect_url, str): + return True + return False + + def search_answerers(self): + """ + Check if an answer return a result. + If yes, update self.result_container and return True + """ + answerers_results = ask(self.search_query) + + if answerers_results: + for results in answerers_results: + self.result_container.extend('answer', results) + return True + return False + + # do search-request + def _get_requests(self): + # init vars + requests = [] + + # max of all selected engine timeout + default_timeout = 0 + + # start search-reqest for all selected engines + for engineref in self.search_query.engineref_list: + processor = processors[engineref.name] + + # set default request parameters + request_params = processor.get_params(self.search_query, engineref.category) + if request_params is None: + continue + + with threading.RLock(): + processor.engine.stats['sent_search_count'] += 1 + + # append request to list + requests.append((engineref.name, self.search_query.query, request_params)) + + # update default_timeout + default_timeout = max(default_timeout, processor.engine.timeout) + + # adjust timeout + actual_timeout = default_timeout + 
query_timeout = self.search_query.timeout_limit + + if max_request_timeout is None and query_timeout is None: + # No max, no user query: default_timeout + pass + elif max_request_timeout is None and query_timeout is not None: + # No max, but user query: From user query except if above default + actual_timeout = min(default_timeout, query_timeout) + elif max_request_timeout is not None and query_timeout is None: + # Max, no user query: Default except if above max + actual_timeout = min(default_timeout, max_request_timeout) + elif max_request_timeout is not None and query_timeout is not None: + # Max & user query: From user query except if above max + actual_timeout = min(query_timeout, max_request_timeout) + + logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})" + .format(actual_timeout, default_timeout, query_timeout, max_request_timeout)) + + return requests, actual_timeout + + def search_multiple_requests(self, requests): + search_id = uuid4().__str__() + + for engine_name, query, request_params in requests: + th = threading.Thread( + target=processors[engine_name].search, + args=(query, request_params, self.result_container, self.start_time, self.actual_timeout), + name=search_id, + ) + th._timeout = False + th._engine_name = engine_name + th.start() + + for th in threading.enumerate(): + if th.name == search_id: + remaining_time = max(0.0, self.actual_timeout - (time() - self.start_time)) + th.join(remaining_time) + if th.is_alive(): + th._timeout = True + self.result_container.add_unresponsive_engine(th._engine_name, 'timeout') + logger.warning('engine timeout: {0}'.format(th._engine_name)) + + def search_standard(self): + """ + Update self.result_container, self.actual_timeout + """ + requests, self.actual_timeout = self._get_requests() + + # send all search-request + if requests: + self.search_multiple_requests(requests) + start_new_thread(gc.collect, tuple()) + + # return results, suggestions, answers and 
infoboxes + return True + + # do search-request + def search(self): + self.start_time = time() + + if not self.search_external_bang(): + if not self.search_answerers(): + self.search_standard() + + return self.result_container + + +class SearchWithPlugins(Search): + """Similar to the Search class but call the plugins.""" + + __slots__ = 'ordered_plugin_list', 'request' + + def __init__(self, search_query, ordered_plugin_list, request): + super().__init__(search_query) + self.ordered_plugin_list = ordered_plugin_list + self.request = request + + def search(self): + if plugins.call(self.ordered_plugin_list, 'pre_search', self.request, self): + super().search() + + plugins.call(self.ordered_plugin_list, 'post_search', self.request, self) + + results = self.result_container.get_ordered_results() + + for result in results: + plugins.call(self.ordered_plugin_list, 'on_result', self.request, self, result) + + return self.result_container diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 0000000..85b9178 --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 0000000..0d7d1b8 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import sys +import io +import os +import argparse +import logging + +import searx.search +import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts + + +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: + 
# iterator of processors
def iter_processor(engine_name_list):
    """Yield ``(name, processor)`` pairs for the engines to check.

    Each entry of *engine_name_list* is first looked up in
    ``engine_shortcuts`` so that either an engine name or its shortcut can
    be given.  Names without a registered processor are reported on
    ``stdout`` and skipped.  An empty list yields every registered
    processor.
    """
    if len(engine_name_list) > 0:
        for name in engine_name_list:
            name = engine_shortcuts.get(name, name)
            processor = processors.get(name)
            if processor is not None:
                yield name, processor
            else:
                # terminate the line like every other status message, otherwise
                # the next "Checking" line is appended to this one
                stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}\n')
    else:
        for name, processor in searx.search.processors.items():
            yield name, processor
".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + log = map(lambda l: l if isinstance(l, str) else repr(l), log) + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') + + +# call by setup.py +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) + args = parser.parse_args() + run(args.engine_name_list, args.verbose) + + +if __name__ == '__main__': + main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 0000000..c3292d9 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,125 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str(CHECKER_RESULT) + if 
serialized_result is not None: + return json.loads(serialized_result) + + +def _set_result(result, include_timestamp=True): + if include_timestamp: + result['timestamp'] = int(time.time() / 3600) * 3600 + storage.set_str(CHECKER_RESULT, json.dumps(result)) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = { + 'status': 'ok', + 'engines': {} + } + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result['engines'][name] = {'success': True} + else: + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} + + _set_result(result) + logger.info('Check done') + except Exception: + _set_result({'status': 'error'}) + logger.exception('Error while running the checker') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + if schedule(every[0], _run_with_delay): + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + if hasattr(signal, 'SIGUSR1'): + # Windows doesn't support SIGUSR1 + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # disabled by default + _set_result({'status': 'disabled'}, include_timestamp=False) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is 
# Lower-case tag names used to build the opening/closing tag patterns of the
# "no HTML in the result" check.
HTML_TAGS = [
    'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
    'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
    # 'kbd' (keyboard input element) was misspelled 'kdb' and therefore never matched
    'code', 'data', 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
    'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
    'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
    'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
    'frame', 'frameset'
]
tag + '>' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + +_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _is_url_image(image_url): + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + retry = 2 + + while retry > 0: + a = time() + try: + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0' + }) + if r.headers["content-type"].startswith('image/'): + return True + return False + except requests.exceptions.Timeout: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except requests.exceptions.RequestException: + logger.exception('Exception for %s', image_url) + return False + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ + -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, 
class TestResults:
    """Collect the errors, detailed logs and detected languages of a check.

    Iterating over an instance yields ``(test_name, error)`` pairs, the
    errors of each test being sorted.
    """

    __slots__ = 'errors', 'logs', 'languages'

    def __init__(self):
        # test name -> distinct error messages, in insertion order
        self.errors: typing.Dict[str, typing.List[str]] = {}
        # test name -> distinct (message, *args) tuples, in insertion order
        self.logs: typing.Dict[str, typing.List[typing.Any]] = {}
        # every language code added through add_language()
        self.languages: typing.Set[str] = set()

    def add_error(self, test, message, *args):
        """Record *message* for *test*; *args* are kept only in the logs.

        Duplicates are dropped: the same message is stored once per test in
        ``errors`` and the same ``(message, *args)`` tuple once in ``logs``.
        """
        # message to self.errors
        errors_for_test = self.errors.setdefault(test, [])
        if message not in errors_for_test:
            errors_for_test.append(message)
        # (message, *args) to self.logs
        entry = (message, *args)
        logs_for_test = self.logs.setdefault(test, [])
        if entry not in logs_for_test:
            logs_for_test.append(entry)

    def add_language(self, language):
        """Remember that *language* has been seen."""
        self.languages.add(language)

    @property
    def succesfull(self):
        """True when no error has been recorded (historical, misspelled name)."""
        return not self.errors

    @property
    def successful(self):
        """Correctly spelled alias of :attr:`succesfull`."""
        return self.succesfull

    def __iter__(self):
        for test_name, errors in self.errors.items():
            for error in sorted(errors):
                yield (test_name, error)
def _check_result(self, result):
    """Run the generic checks on one result and record every problem found.

    Checks performed:

    * no HTML tag in ``title`` and ``content``
    * ``url`` is present
    * for the ``images.html`` template: ``thumbnail_src`` (when set),
      otherwise ``img_src``, must point to an image
    * for the ``videos.html`` template: ``thumbnail`` must point to an image

    The title and content are also fed to the language detection.
    """
    if not _check_no_html(result.get('title', '')):
        self._record_error('HTML in title', repr(result.get('title', '')))
    if not _check_no_html(result.get('content', '')):
        self._record_error('HTML in content', repr(result.get('content', '')))
    if result.get('url') is None:
        self._record_error('url is None')

    self._add_language(result.get('title', ''))
    self._add_language(result.get('content', ''))

    template = result.get('template', 'default.html')
    # templates without any template specific check
    if template in ('default.html', 'code.html', 'torrent.html', 'map.html'):
        return
    if template == 'images.html':
        thumbnail_src = result.get('thumbnail_src')
        if thumbnail_src is not None:
            if not _is_url_image(thumbnail_src):
                self._record_error('thumbnail_src URL is invalid', thumbnail_src)
        elif not _is_url_image(result.get('img_src')):
            self._record_error('img_src URL is invalid', result.get('img_src'))
    if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
        # report the value that was actually checked (was: img_src)
        self._record_error('thumbnail URL is invalid', result.get('thumbnail'))
self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + """Check the ResultContainer has at least one infobox""" + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + def has_answer(self): + """Check the ResultContainer has at least one answer""" + if len(self.result_container.answers) == 0: + self._record_error('No answer') + + def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. 
def one_title_contains(self, title: str):
    """Check that at least one result title contains `title`.

    The comparison is case insensitive.  Results without a title (missing
    key or ``None``) are treated as having an empty title instead of
    aborting the test with an exception.  An error is recorded when no
    title matches.
    """
    title = title.lower()
    for result in self.result_container.get_ordered_results():
        if title in (result.get('title') or '').lower():
            return
    self._record_error(('{!r} not found in the title'.format(title)))
diff.items()]) + self.test_results.add_error(self.test_name, + 'results are identitical for {} and {} ({})' + .format(diff1_str, diff2_str, common_str)) + + +class Checker: + + __slots__ = 'processor', 'tests', 'test_results' + + def __init__(self, processor: EngineProcessor): + self.processor = processor + self.tests = self.processor.get_tests() + self.test_results = TestResults() + + @property + def engineref_list(self): + engine_name = self.processor.engine_name + engine_category = self.processor.engine.categories[0] + return [EngineRef(engine_name, engine_category)] + + @staticmethod + def search_query_matrix_iterator(engineref_list, matrix): + p = [] + for name, values in matrix.items(): + if isinstance(values, (tuple, list)): + l = [(name, value) for value in values] + else: + l = [(name, values)] + p.append(l) + + for kwargs in itertools.product(*p): + kwargs = {k: v for k, v in kwargs} + query = kwargs['query'] + params = dict(kwargs) + del params['query'] + yield SearchQuery(query, engineref_list, **params) + + def call_test(self, obj, test_description): + if isinstance(test_description, (tuple, list)): + method, args = test_description[0], test_description[1:] + else: + method = test_description + args = () + if isinstance(method, str) and hasattr(obj, method): + getattr(obj, method)(*args) + elif isinstance(method, types.FunctionType): + method(*args) + else: + self.test_results.add_error(obj.test_name, + 'method {!r} ({}) not found for {}' + .format(method, method.__class__.__name__, obj.__class__.__name__)) + + def call_tests(self, obj, test_descriptions): + for test_description in test_descriptions: + self.call_test(obj, test_description) + + def search(self, search_query: SearchQuery) -> ResultContainer: + result_container = ResultContainer() + engineref_category = search_query.engineref_list[0].category + params = self.processor.get_params(search_query, engineref_category) + if params is not None: + with threading.RLock(): + 
class EngineRef:
    """Name of an engine paired with a category.

    Instances compare equal, and hash identically, when both the name and
    the category are equal.
    """

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        try:
            return self.name == other.name and self.category == other.category
        except AttributeError:
            # `other` is not EngineRef-like (None, str, ...): let Python fall
            # back to its default handling instead of raising
            return NotImplemented

    def __hash__(self):
        return hash((self.name, self.category))
etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang', 'engine_data' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None, + engine_data: typing.Optional[typing.Dict[str, str]]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + self.engine_data = engine_data or {} + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) diff --git a/searx/search/processors/__init__.py b/searx/search/processors/__init__.py new file mode 100644 index 0000000..4cae3cd --- /dev/null +++ b/searx/search/processors/__init__.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from .online import OnlineProcessor +from .offline import OfflineProcessor +from 
.online_dictionary import OnlineDictionaryProcessor +from .online_currency import OnlineCurrencyProcessor +from .abstract import EngineProcessor +from searx import logger +import searx.engines as engines + + +__all__ = ['EngineProcessor', 'OfflineProcessor', 'OnlineProcessor', + 'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', 'processors'] +logger = logger.getChild('search.processors') +processors = {} + + +def get_processor_class(engine_type): + for c in [OnlineProcessor, OfflineProcessor, OnlineDictionaryProcessor, OnlineCurrencyProcessor]: + if c.engine_type == engine_type: + return c + return None + + +def get_processor(engine, engine_name): + engine_type = getattr(engine, 'engine_type', 'online') + processor_class = get_processor_class(engine_type) + if processor_class: + return processor_class(engine, engine_name) + else: + return None + + +def initialize(engine_list): + engines.initialize_engines(engine_list) + for engine_name, engine in engines.engines.items(): + processor = get_processor(engine, engine_name) + if processor is None: + logger.error('Error get processor for engine %s', engine_name) + else: + processors[engine_name] = processor diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py new file mode 100644 index 0000000..26dab06 --- /dev/null +++ b/searx/search/processors/abstract.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from abc import abstractmethod, ABC +from searx import logger + + +logger = logger.getChild('searx.search.processor') + + +class EngineProcessor(ABC): + + def __init__(self, engine, engine_name): + self.engine = engine + self.engine_name = engine_name + + def get_params(self, search_query, engine_category): + # if paging is not supported, skip + if search_query.pageno > 1 and not self.engine.paging: + return None + + # if time_range is not supported, skip + if search_query.time_range and not self.engine.time_range_support: + return None + + params = {} + 
params['category'] = engine_category + params['pageno'] = search_query.pageno + params['safesearch'] = search_query.safesearch + params['time_range'] = search_query.time_range + params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) + + if hasattr(self.engine, 'language') and self.engine.language: + params['language'] = self.engine.language + else: + params['language'] = search_query.lang + return params + + @abstractmethod + def search(self, query, params, result_container, start_time, timeout_limit): + pass + + def get_tests(self): + tests = getattr(self.engine, 'tests', None) + if tests is None: + tests = getattr(self.engine, 'additional_tests', {}) + tests.update(self.get_default_tests()) + return tests + else: + return tests + + def get_default_tests(self): + return {} diff --git a/searx/search/processors/offline.py b/searx/search/processors/offline.py new file mode 100644 index 0000000..ede8eb5 --- /dev/null +++ b/searx/search/processors/offline.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import threading +from time import time +from searx import logger +from searx.metrology.error_recorder import record_exception, record_error +from searx.search.processors.abstract import EngineProcessor + + +logger = logger.getChild('search.processor.offline') + + +class OfflineProcessor(EngineProcessor): + + engine_type = 'offline' + + def _record_stats_on_error(self, result_container, start_time): + engine_time = time() - start_time + result_container.add_timing(self.engine_name, engine_time, engine_time) + + with threading.RLock(): + self.engine.stats['errors'] += 1 + + def _search_basic(self, query, params): + return self.engine.search(query, params) + + def search(self, query, params, result_container, start_time, timeout_limit): + try: + search_results = self._search_basic(query, params) + + if search_results: + result_container.extend(self.engine_name, search_results) + + engine_time = time() - start_time + 
result_container.add_timing(self.engine_name, engine_time, engine_time) + with threading.RLock(): + self.engine.stats['engine_time'] += engine_time + self.engine.stats['engine_time_count'] += 1 + + except ValueError as e: + record_exception(self.engine_name, e) + self._record_stats_on_error(result_container, start_time) + logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e)) + except Exception as e: + record_exception(self.engine_name, e) + self._record_stats_on_error(result_container, start_time) + result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash', str(e)) + logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e)) + else: + if getattr(threading.current_thread(), '_timeout', False): + record_error(self.engine_name, 'Timeout') diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py new file mode 100644 index 0000000..1fc6444 --- /dev/null +++ b/searx/search/processors/online.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from urllib.parse import urlparse +from time import time +import threading + +import requests.exceptions + +import searx.poolrequests as poolrequests +from searx.engines import settings +from searx import logger +from searx.utils import gen_useragent +from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException, + SearxEngineTooManyRequestsException,) +from searx.metrology.error_recorder import record_exception, record_error + +from searx.search.processors.abstract import EngineProcessor + + +logger = logger.getChild('search.processor.online') + + +def default_request_params(): + return { + 'method': 'GET', + 'headers': {}, + 'data': {}, + 'url': '', + 'cookies': {}, + 'verify': True, + 'auth': None + } + + +class OnlineProcessor(EngineProcessor): + + engine_type = 'online' + + def get_params(self, search_query, engine_category): + params = super().get_params(search_query, engine_category) + if 
params is None: + return None + + # skip suspended engines + if self.engine.suspend_end_time >= time(): + logger.debug('Engine currently suspended: %s', self.engine_name) + return None + + # add default params + params.update(default_request_params()) + + # add an user agent + params['headers']['User-Agent'] = gen_useragent() + + return params + + def _send_http_request(self, params): + # create dictionary which contain all + # informations about the request + request_args = dict( + headers=params['headers'], + cookies=params['cookies'], + verify=params['verify'], + auth=params['auth'] + ) + + # setting engine based proxies + if hasattr(self.engine, 'proxies'): + request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies) + + # max_redirects + max_redirects = params.get('max_redirects') + if max_redirects: + request_args['max_redirects'] = max_redirects + + # allow_redirects + if 'allow_redirects' in params: + request_args['allow_redirects'] = params['allow_redirects'] + + # soft_max_redirects + soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0) + + # raise_for_status + request_args['raise_for_httperror'] = params.get('raise_for_httperror', True) + + # specific type of request (GET or POST) + if params['method'] == 'GET': + req = poolrequests.get + else: + req = poolrequests.post + + request_args['data'] = params['data'] + + # send the request + response = req(params['url'], **request_args) + + # check soft limit of the redirect count + if len(response.history) > soft_max_redirects: + # unexpected redirect : record an error + # but the engine might still return valid results. 
+ status_code = str(response.status_code or '') + reason = response.reason or '' + hostname = str(urlparse(response.url or '').netloc) + record_error(self.engine_name, + '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects), + (status_code, reason, hostname)) + + return response + + def _search_basic(self, query, params): + # update request parameters dependent on + # search-engine (contained in engines folder) + self.engine.request(query, params) + + # ignoring empty urls + if params['url'] is None: + return None + + if not params['url']: + return None + + # send request + response = self._send_http_request(params) + + # parse the response + response.search_params = params + return self.engine.response(response) + + def search(self, query, params, result_container, start_time, timeout_limit): + # set timeout for all HTTP requests + poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time) + # reset the HTTP total time + poolrequests.reset_time_for_thread() + # enable HTTP only if explicitly enabled + poolrequests.set_enable_http_protocol(self.engine.enable_http) + + # suppose everything will be alright + requests_exception = False + suspended_time = None + + try: + # send requests and parse the results + search_results = self._search_basic(query, params) + + # check if the engine accepted the request + if search_results is not None: + # yes, so add results + result_container.extend(self.engine_name, search_results) + + # update engine time when there is no exception + engine_time = time() - start_time + page_load_time = poolrequests.get_time_for_thread() + result_container.add_timing(self.engine_name, engine_time, page_load_time) + with threading.RLock(): + self.engine.stats['engine_time'] += engine_time + self.engine.stats['engine_time_count'] += 1 + # update stats with the total HTTP time + self.engine.stats['page_load_time'] += page_load_time + self.engine.stats['page_load_count'] += 1 + except Exception as e: + 
record_exception(self.engine_name, e) + + # Timing + engine_time = time() - start_time + page_load_time = poolrequests.get_time_for_thread() + result_container.add_timing(self.engine_name, engine_time, page_load_time) + + # Record the errors + with threading.RLock(): + self.engine.stats['errors'] += 1 + + if (issubclass(e.__class__, requests.exceptions.Timeout)): + result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout') + # requests timeout (connect or read) + logger.error("engine {0} : HTTP requests timeout" + "(search duration : {1} s, timeout: {2} s) : {3}" + .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__)) + requests_exception = True + elif (issubclass(e.__class__, requests.exceptions.RequestException)): + result_container.add_unresponsive_engine(self.engine_name, 'HTTP error') + # other requests exception + logger.exception("engine {0} : requests exception" + "(search duration : {1} s, timeout: {2} s) : {3}" + .format(self.engine_name, engine_time, timeout_limit, e)) + requests_exception = True + elif (issubclass(e.__class__, SearxEngineCaptchaException)): + result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required') + logger.exception('engine {0} : CAPTCHA'.format(self.engine_name)) + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)): + result_container.add_unresponsive_engine(self.engine_name, 'too many requests') + logger.exception('engine {0} : Too many requests'.format(self.engine_name)) + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineAccessDeniedException)): + result_container.add_unresponsive_engine(self.engine_name, 'blocked') + logger.exception('engine {0} : Searx is blocked'.format(self.engine_name)) + suspended_time = e.suspended_time # pylint: disable=no-member + else: + result_container.add_unresponsive_engine(self.engine_name, 'unexpected 
crash') + # others errors + logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e)) + else: + if getattr(threading.current_thread(), '_timeout', False): + record_error(self.engine_name, 'Timeout') + + # suspend the engine if there is an HTTP error + # or suspended_time is defined + with threading.RLock(): + if requests_exception or suspended_time: + # update continuous_errors / suspend_end_time + self.engine.continuous_errors += 1 + if suspended_time is None: + suspended_time = min(settings['search']['max_ban_time_on_fail'], + self.engine.continuous_errors * settings['search']['ban_time_on_fail']) + self.engine.suspend_end_time = time() + suspended_time + else: + # reset the suspend variables + self.engine.continuous_errors = 0 + self.engine.suspend_end_time = 0 + + def get_default_tests(self): + tests = {} + + tests['simple'] = { + 'matrix': {'query': ('life', 'computer')}, + 'result_container': ['not_empty'], + } + + if getattr(self.engine, 'paging', False): + tests['paging'] = { + 'matrix': {'query': 'time', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + if 'general' in self.engine.categories: + # avoid documentation about HTML tags (<time> and <input type="time">) + tests['paging']['matrix']['query'] = 'news' + + if getattr(self.engine, 'time_range', False): + tests['time_range'] = { + 'matrix': {'query': 'news', + 'time_range': (None, 'day')}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'supported_languages', []): + tests['lang_fr'] = { + 'matrix': {'query': 'paris', 'lang': 'fr'}, + 'result_container': ['not_empty', ('has_language', 'fr')], + } + tests['lang_en'] = { + 'matrix': {'query': 'paris', 'lang': 'en'}, + 'result_container': ['not_empty', ('has_language', 'en')], + } + + if getattr(self.engine, 'safesearch', False): + tests['safesearch'] = { + 'matrix': {'query': 'porn', + 'safesearch': (0, 2)}, + 'test': ['unique_results'] + } 
+ + return tests diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py new file mode 100644 index 0000000..0dc3f3b --- /dev/null +++ b/searx/search/processors/online_currency.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import unicodedata +import re + +from searx.data import CURRENCIES +from .online import OnlineProcessor + + +parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) + + +def normalize_name(name): + name = name.lower().replace('-', ' ').rstrip('s') + name = re.sub(' +', ' ', name) + return unicodedata.normalize('NFKD', name).lower() + + +def name_to_iso4217(name): + global CURRENCIES + name = normalize_name(name) + currency = CURRENCIES['names'].get(name, [name]) + if isinstance(currency, str): + return currency + return currency[0] + + +def iso4217_to_name(iso4217, language): + global CURRENCIES + return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217) + + +class OnlineCurrencyProcessor(OnlineProcessor): + + engine_type = 'online_currency' + + def get_params(self, search_query, engine_category): + params = super().get_params(search_query, engine_category) + if params is None: + return None + + m = parser_re.match(search_query.query) + if not m: + return None + + amount_str, from_currency, to_currency = m.groups() + try: + amount = float(amount_str) + except ValueError: + return None + from_currency = name_to_iso4217(from_currency.strip()) + to_currency = name_to_iso4217(to_currency.strip()) + + params['amount'] = amount + params['from'] = from_currency + params['to'] = to_currency + params['from_name'] = iso4217_to_name(from_currency, 'en') + params['to_name'] = iso4217_to_name(to_currency, 'en') + return params + + def get_default_tests(self): + tests = {} + + tests['currency'] = { + 'matrix': {'query': '1337 usd in rmb'}, + 'result_container': ['has_answer'], + } + + return tests diff --git 
a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py new file mode 100644 index 0000000..987c710 --- /dev/null +++ b/searx/search/processors/online_dictionary.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import re + +from searx.utils import is_valid_lang +from .online import OnlineProcessor + + +parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) + + +class OnlineDictionaryProcessor(OnlineProcessor): + + engine_type = 'online_dictionnary' + + def get_params(self, search_query, engine_category): + params = super().get_params(search_query, engine_category) + if params is None: + return None + + m = parser_re.match(search_query.query) + if not m: + return None + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return None + + params['from_lang'] = from_lang + params['to_lang'] = to_lang + params['query'] = query + + return params + + def get_default_tests(self): + tests = {} + + if getattr(self.engine, 'paging', False): + tests['translation_paging'] = { + 'matrix': {'query': 'en-es house', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + 'test': ['unique_results'] + } + else: + tests['translation'] = { + 'matrix': {'query': 'en-es house'}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + } + + return tests |