first commit

Author: Iyas Altawil
Date: 2025-06-26 15:38:10 +03:30
Commit: e928faf6d2
899 changed files with 403713 additions and 0 deletions

searx/search/__init__.py Normal file

@@ -0,0 +1,208 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, too-few-public-methods
# the public namespace has not been finalized yet ..
# __all__ = ["EngineRef", "SearchQuery"]
import threading
from timeit import default_timer
from uuid import uuid4
from flask import copy_current_request_context
from searx import logger
from searx import settings
import searx.answerers
import searx.plugins
from searx.engines import load_engines
from searx.extended_types import SXNG_Request
from searx.external_bang import get_bang_url
from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
from searx.network import initialize as initialize_network, check_network_configuration
from searx.results import ResultContainer
from searx.search.checker import initialize as initialize_checker
from searx.search.processors import PROCESSORS, initialize as initialize_processors
from .models import EngineRef, SearchQuery
logger = logger.getChild('search')
def initialize(settings_engines=None, enable_checker=False, check_network=False, enable_metrics=True):
settings_engines = settings_engines or settings['engines']
load_engines(settings_engines)
initialize_network(settings_engines, settings['outgoing'])
if check_network:
check_network_configuration()
initialize_metrics([engine['name'] for engine in settings_engines], enable_metrics)
initialize_processors(settings_engines)
if enable_checker:
initialize_checker()
class Search:
"""Search information container"""
__slots__ = "search_query", "result_container", "start_time", "actual_timeout"
def __init__(self, search_query: SearchQuery):
"""Initialize the Search"""
# init vars
super().__init__()
self.search_query = search_query
self.result_container = ResultContainer()
self.start_time = None
self.actual_timeout = None
def search_external_bang(self):
"""
        Check if there is an external bang.
If yes, update self.result_container and return True
"""
if self.search_query.external_bang:
self.result_container.redirect_url = get_bang_url(self.search_query)
# This means there was a valid bang and the
# rest of the search does not need to be continued
if isinstance(self.result_container.redirect_url, str):
return True
return False
def search_answerers(self):
results = searx.answerers.STORAGE.ask(self.search_query.query)
self.result_container.extend(None, results)
return bool(results)
# do search-request
def _get_requests(self):
# init vars
requests = []
# max of all selected engine timeout
default_timeout = 0
# start search-request for all selected engines
for engineref in self.search_query.engineref_list:
processor = PROCESSORS[engineref.name]
            # stop the request now if the engine is suspended
if processor.extend_container_if_suspended(self.result_container):
continue
# set default request parameters
request_params = processor.get_params(self.search_query, engineref.category)
if request_params is None:
continue
counter_inc('engine', engineref.name, 'search', 'count', 'sent')
# append request to list
requests.append((engineref.name, self.search_query.query, request_params))
# update default_timeout
default_timeout = max(default_timeout, processor.engine.timeout)
# adjust timeout
max_request_timeout = settings['outgoing']['max_request_timeout']
actual_timeout = default_timeout
query_timeout = self.search_query.timeout_limit
if max_request_timeout is None and query_timeout is None:
# No max, no user query: default_timeout
pass
elif max_request_timeout is None and query_timeout is not None:
# No max, but user query: From user query except if above default
actual_timeout = min(default_timeout, query_timeout)
elif max_request_timeout is not None and query_timeout is None:
# Max, no user query: Default except if above max
actual_timeout = min(default_timeout, max_request_timeout)
elif max_request_timeout is not None and query_timeout is not None:
# Max & user query: From user query except if above max
actual_timeout = min(query_timeout, max_request_timeout)
logger.debug(
"actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})".format(
actual_timeout, default_timeout, query_timeout, max_request_timeout
)
)
return requests, actual_timeout
def search_multiple_requests(self, requests):
# pylint: disable=protected-access
search_id = str(uuid4())
for engine_name, query, request_params in requests:
_search = copy_current_request_context(PROCESSORS[engine_name].search)
th = threading.Thread( # pylint: disable=invalid-name
target=_search,
args=(query, request_params, self.result_container, self.start_time, self.actual_timeout),
name=search_id,
)
th._timeout = False
th._engine_name = engine_name
th.start()
for th in threading.enumerate(): # pylint: disable=invalid-name
if th.name == search_id:
remaining_time = max(0.0, self.actual_timeout - (default_timer() - self.start_time))
th.join(remaining_time)
if th.is_alive():
th._timeout = True
self.result_container.add_unresponsive_engine(th._engine_name, 'timeout')
PROCESSORS[th._engine_name].logger.error('engine timeout')
def search_standard(self):
"""
Update self.result_container, self.actual_timeout
"""
requests, self.actual_timeout = self._get_requests()
# send all search-request
if requests:
self.search_multiple_requests(requests)
# return results, suggestions, answers and infoboxes
return True
# do search-request
def search(self) -> ResultContainer:
self.start_time = default_timer()
if not self.search_external_bang():
if not self.search_answerers():
self.search_standard()
return self.result_container
class SearchWithPlugins(Search):
"""Inherit from the Search class, add calls to the plugins."""
__slots__ = 'user_plugins', 'request'
def __init__(self, search_query: SearchQuery, request: SXNG_Request, user_plugins: list[str]):
super().__init__(search_query)
self.user_plugins = user_plugins
self.result_container.on_result = self._on_result
# pylint: disable=line-too-long
# get the "real" request to use it outside the Flask context.
# see
# * https://github.com/pallets/flask/blob/d01d26e5210e3ee4cbbdef12f05c886e08e92852/src/flask/globals.py#L55
# * https://github.com/pallets/werkzeug/blob/3c5d3c9bd0d9ce64590f0af8997a38f3823b368d/src/werkzeug/local.py#L548-L559
# * https://werkzeug.palletsprojects.com/en/2.0.x/local/#werkzeug.local.LocalProxy._get_current_object
# pylint: enable=line-too-long
self.request = request._get_current_object()
def _on_result(self, result):
return searx.plugins.STORAGE.on_result(self.request, self, result)
def search(self) -> ResultContainer:
if searx.plugins.STORAGE.pre_search(self.request, self):
super().search()
searx.plugins.STORAGE.post_search(self.request, self)
self.result_container.close()
return self.result_container
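
The classes above are normally driven by the web application from inside a Flask request context, since Search.search() wraps each engine call with copy_current_request_context. A minimal sketch of the call sequence, assuming an installed SearXNG whose settings.yml defines an engine named "duckduckgo" (the engine name is illustrative):

# minimal sketch; "duckduckgo" is a placeholder for any configured engine
import searx.search
import searx.webapp  # provides the Flask app used for the request context
from searx.search import EngineRef, Search, SearchQuery

searx.search.initialize()  # load engines, network, metrics and processors
with searx.webapp.app.test_request_context('/search'):
    sq = SearchQuery('test', [EngineRef('duckduckgo', 'general')])
    container = Search(sq).search()  # ResultContainer with merged results
print(len(container.get_ordered_results()), 'results')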

searx/search/checker/__init__.py Normal file

@@ -0,0 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from .impl import Checker
from .background import initialize, get_result
__all__ = ('Checker', 'initialize', 'get_result')

searx/search/checker/__main__.py Normal file

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
import sys
import io
import os
import argparse
import logging
import searx.search
import searx.search.checker
from searx.search import PROCESSORS
from searx.engines import engine_shortcuts
# configure logging
root = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
for h in root.handlers:
root.removeHandler(h)
root.addHandler(handler)
# color only for a valid terminal
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
RESET_SEQ = "\033[0m"
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8))
else:
RESET_SEQ = ""
COLOR_SEQ = ""
BOLD_SEQ = ""
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
# equivalent of 'python -u' (unbuffered stdout, stderr)
stdout = io.TextIOWrapper(
# pylint: disable=consider-using-with
open(sys.stdout.fileno(), 'wb', 0),
write_through=True,
)
stderr = io.TextIOWrapper(
# pylint: disable=consider-using-with
open(sys.stderr.fileno(), 'wb', 0),
write_through=True,
)
# iterator of processors
def iter_processor(engine_name_list):
if len(engine_name_list) > 0:
for name in engine_name_list:
name = engine_shortcuts.get(name, name)
processor = PROCESSORS.get(name)
if processor is not None:
yield name, processor
else:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}\n')
else:
for name, processor in searx.search.PROCESSORS.items():
yield name, processor
# actual check & display
def run(engine_name_list, verbose):
searx.search.initialize()
name_checker_list = []
for name, processor in iter_processor(engine_name_list):
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
if not sys.stdout.isatty():
stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
checker = searx.search.checker.Checker(processor)
checker.run()
name_checker_list.append((name, checker))
stdout.write(f'\n== {BOLD_SEQ}Results{RESET_SEQ} ' + '=' * 70 + '\n')
for name, checker in name_checker_list:
if checker.test_results.successful:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n')
if verbose:
stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n')
else:
            stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Error{RESET_SEQ}')
if not verbose:
errors = [test_name + ': ' + error for test_name, error in checker.test_results]
stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n')
else:
stdout.write('\n')
stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n')
for test_name, logs in checker.test_results.logs.items():
for log in logs:
log = map(lambda l: l if isinstance(l, str) else repr(l), log)
stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n')
# called by setup.py
def main():
parser = argparse.ArgumentParser(description='Check searx engines.')
parser.add_argument(
'engine_name_list',
metavar='engine name',
type=str,
nargs='*',
        help='engine names or shortcuts; an empty list checks all engines.',
)
parser.add_argument(
'--verbose',
'-v',
action='store_true',
dest='verbose',
help='Display details about the test results',
default=False,
)
args = parser.parse_args()
run(args.engine_name_list, args.verbose)
if __name__ == '__main__':
main()
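
The same checks can be run programmatically; a hedged sketch, where the engine names are placeholders that depend on the local settings.yml:

# hypothetical programmatic use of the CLI above
from searx.search.checker.__main__ import run

run(['ddg'], verbose=True)  # check one engine by name or shortcut
run([], verbose=False)      # an empty list checks every engine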

searx/search/checker/background.py Normal file

@@ -0,0 +1,168 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, cyclic-import
import json
import time
import threading
import os
import signal
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
import redis.exceptions
from searx import logger, settings, sxng_debug
from searx.redisdb import client as get_redis_client
from searx.exceptions import SearxSettingsException
from searx.search.processors import PROCESSORS
from searx.search.checker import Checker
from searx.search.checker.scheduler import scheduler_function
REDIS_RESULT_KEY = 'SearXNG_checker_result'
REDIS_LOCK_KEY = 'SearXNG_checker_lock'
CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther']
class CheckerOk(TypedDict):
"""Checking the engines succeeded"""
status: Literal['ok']
engines: Dict[str, 'EngineResult']
timestamp: int
class CheckerErr(TypedDict):
"""Checking the engines failed"""
status: Literal['error']
timestamp: int
class CheckerOther(TypedDict):
"""The status is unknown or disabled"""
status: Literal['unknown', 'disabled']
EngineResult = Union['EngineOk', 'EngineErr']
class EngineOk(TypedDict):
"""Checking the engine succeeded"""
success: Literal[True]
class EngineErr(TypedDict):
"""Checking the engine failed"""
success: Literal[False]
errors: Dict[str, List[str]]
def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]:
if isinstance(every, int):
return (every, every)
if (
not isinstance(every, (tuple, list))
or len(every) != 2 # type: ignore
or not isinstance(every[0], int)
or not isinstance(every[1], int)
):
raise SearxSettingsException(error_msg, None)
return (every[0], every[1])
def get_result() -> CheckerResult:
client = get_redis_client()
if client is None:
# without Redis, the checker is disabled
return {'status': 'disabled'}
serialized_result: Optional[bytes] = client.get(REDIS_RESULT_KEY)
if serialized_result is None:
# the Redis key does not exist
return {'status': 'unknown'}
return json.loads(serialized_result)
def _set_result(result: CheckerResult):
client = get_redis_client()
if client is None:
# without Redis, the function does nothing
return
client.set(REDIS_RESULT_KEY, json.dumps(result))
def _timestamp():
return int(time.time() / 3600) * 3600
def run():
try:
# use a Redis lock to make sure there is no checker running at the same time
# (this should not happen, this is a safety measure)
with get_redis_client().lock(REDIS_LOCK_KEY, blocking_timeout=60, timeout=3600):
logger.info('Starting checker')
result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()}
for name, processor in PROCESSORS.items():
logger.debug('Checking %s engine', name)
checker = Checker(processor)
checker.run()
if checker.test_results.successful:
result['engines'][name] = {'success': True}
else:
result['engines'][name] = {'success': False, 'errors': checker.test_results.errors}
_set_result(result)
logger.info('Check done')
except redis.exceptions.LockError:
_set_result({'status': 'error', 'timestamp': _timestamp()})
        logger.exception('Checker lock error: another checker instance may be running')
except Exception: # pylint: disable=broad-except
_set_result({'status': 'error', 'timestamp': _timestamp()})
logger.exception('Error while running the checker')
def _signal_handler(_signum: int, _frame: Any):
t = threading.Thread(target=run)
t.daemon = True
t.start()
def initialize():
if hasattr(signal, 'SIGUSR1'):
# Windows doesn't support SIGUSR1
logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
signal.signal(signal.SIGUSR1, _signal_handler)
    # special case when debug is activated
if sxng_debug and settings['checker']['off_when_debug']:
logger.info('debug mode: checker is disabled')
return
# check value of checker.scheduling.every now
scheduling = settings['checker']['scheduling']
if scheduling is None or not scheduling:
logger.info('Checker scheduler is disabled')
return
# make sure there is a Redis connection
if get_redis_client() is None:
logger.error('The checker requires Redis')
return
# start the background scheduler
    every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not an int or a list')
start_after_range = _get_interval(
        scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not an int or a list'
)
t = threading.Thread(
target=scheduler_function,
args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run),
name='checker_scheduler',
)
t.daemon = True
t.start()
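
A sketch of how a worker interacts with this module, assuming Redis is configured (otherwise get_result() reports 'disabled') and a POSIX platform where initialize() registered the SIGUSR1 handler in this process:

import os
import signal

from searx.search.checker.background import get_result

os.kill(os.getpid(), signal.SIGUSR1)  # starts run() in a daemon thread
result = get_result()  # status: 'disabled', 'unknown', 'ok' or 'error'
if result['status'] == 'ok':
    failing = [name for name, r in result['engines'].items() if not r['success']]
    print('failing engines:', failing)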

searx/search/checker/impl.py Normal file

@@ -0,0 +1,442 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, invalid-name
import gc
import typing
import types
import functools
import itertools
from time import time
from timeit import default_timer
from urllib.parse import urlparse
import re
import httpx
from searx import network, logger
from searx.utils import gen_useragent, detect_language
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
from searx.metrics import counter_inc
logger = logger.getChild('searx.search.checker')
HTML_TAGS = [
# fmt: off
'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
    'code', 'data', 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
'frame', 'frameset'
# fmt: on
]
def get_check_no_html():
rep = ['<' + tag + r'[^\>]*>' for tag in HTML_TAGS]
rep += ['</' + tag + '>' for tag in HTML_TAGS]
pattern = re.compile('|'.join(rep))
def f(text):
return pattern.search(text.lower()) is None
return f
_check_no_html = get_check_no_html()
def _is_url(url):
try:
result = urlparse(url)
except ValueError:
return False
if result.scheme not in ('http', 'https'):
return False
return True
@functools.lru_cache(maxsize=8192)
def _download_and_check_if_image(image_url: str) -> bool:
"""Download an URL and check if the Content-Type starts with "image/"
This function should not be called directly: use _is_url_image
otherwise the cache of functools.lru_cache contains data: URL which might be huge.
"""
retry = 2
while retry > 0:
a = time()
try:
# use "image_proxy" (avoid HTTP/2)
network.set_context_network_name('image_proxy')
r, stream = network.stream(
'GET',
image_url,
timeout=10.0,
allow_redirects=True,
headers={
'User-Agent': gen_useragent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-GPC': '1',
'Cache-Control': 'max-age=0',
},
)
r.close()
if r.status_code == 200:
is_image = r.headers.get('content-type', '').startswith('image/')
else:
is_image = False
del r
del stream
return is_image
except httpx.TimeoutException:
logger.error('Timeout for %s: %i', image_url, int(time() - a))
retry -= 1
except httpx.HTTPError:
logger.exception('Exception for %s', image_url)
return False
return False
def _is_url_image(image_url) -> bool:
"""Normalize image_url"""
if not isinstance(image_url, str):
return False
if image_url.startswith('//'):
image_url = 'https:' + image_url
if image_url.startswith('data:'):
return image_url.startswith('data:image/')
if not _is_url(image_url):
return False
return _download_and_check_if_image(image_url)
def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
return {
'query': search_query.query,
'lang': search_query.lang,
'pageno': search_query.pageno,
'safesearch': search_query.safesearch,
'time_range': search_query.time_range,
}
def _search_query_diff(
sq1: SearchQuery, sq2: SearchQuery
) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
param1 = _search_query_to_dict(sq1)
param2 = _search_query_to_dict(sq2)
common = {}
diff = {}
for k, value1 in param1.items():
value2 = param2[k]
if value1 == value2:
common[k] = value1
else:
diff[k] = (value1, value2)
return (common, diff)
class TestResults: # pylint: disable=missing-class-docstring
__slots__ = 'errors', 'logs', 'languages'
def __init__(self):
self.errors: typing.Dict[str, typing.List[str]] = {}
self.logs: typing.Dict[str, typing.List[typing.Any]] = {}
self.languages: typing.Set[str] = set()
def add_error(self, test, message, *args):
# message to self.errors
errors_for_test = self.errors.setdefault(test, [])
if message not in errors_for_test:
errors_for_test.append(message)
# (message, *args) to self.logs
logs_for_test = self.logs.setdefault(test, [])
if (message, *args) not in logs_for_test:
logs_for_test.append((message, *args))
def add_language(self, language):
self.languages.add(language)
@property
def successful(self):
return len(self.errors) == 0
def __iter__(self):
for test_name, errors in self.errors.items():
for error in sorted(errors):
yield (test_name, error)
class ResultContainerTests: # pylint: disable=missing-class-docstring
__slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'
def __init__(
self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer
):
self.test_name = test_name
self.search_query = search_query
self.result_container = result_container
self.languages: typing.Set[str] = set()
self.test_results = test_results
self.stop_test = False
@property
def result_urls(self):
results = self.result_container.get_ordered_results()
return [result['url'] for result in results if 'url' in result]
def _record_error(self, message: str, *args) -> None:
sq = _search_query_to_dict(self.search_query)
sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()])
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
    def _add_language(self, text: str) -> None:
langStr = detect_language(text)
if langStr:
self.languages.add(langStr)
self.test_results.add_language(langStr)
def _check_result(self, result):
if not _check_no_html(result.get('title', '')):
self._record_error('HTML in title', repr(result.get('title', '')))
if not _check_no_html(result.get('content', '')):
self._record_error('HTML in content', repr(result.get('content', '')))
if result.get('url') is None:
self._record_error('url is None')
self._add_language(result.get('title', ''))
self._add_language(result.get('content', ''))
template = result.get('template', 'default.html')
if template == 'default.html':
return
if template == 'code.html':
return
if template == 'torrent.html':
return
if template == 'map.html':
return
if template == 'images.html':
thumbnail_src = result.get('thumbnail_src')
if thumbnail_src is not None:
if not _is_url_image(thumbnail_src):
self._record_error('thumbnail_src URL is invalid', thumbnail_src)
elif not _is_url_image(result.get('img_src')):
self._record_error('img_src URL is invalid', result.get('img_src'))
if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
            self._record_error('thumbnail URL is invalid', result.get('thumbnail'))
def _check_results(self, results: list):
for result in results:
self._check_result(result)
def _check_answers(self, answers):
for answer in answers:
if not _check_no_html(answer):
self._record_error('HTML in answer', answer)
def _check_infoboxes(self, infoboxes):
for infobox in infoboxes:
if not _check_no_html(infobox.get('content', '')):
self._record_error('HTML in infobox content', infobox.get('content', ''))
self._add_language(infobox.get('content', ''))
for attribute in infobox.get('attributes', {}):
if not _check_no_html(attribute.get('value', '')):
self._record_error('HTML in infobox attribute value', attribute.get('value', ''))
def check_basic(self):
if len(self.result_container.unresponsive_engines) > 0:
for message in self.result_container.unresponsive_engines:
self._record_error(message[1] + ' ' + (message[2] or ''))
self.stop_test = True
return
results = self.result_container.get_ordered_results()
if len(results) > 0:
self._check_results(results)
if len(self.result_container.answers) > 0:
self._check_answers(self.result_container.answers)
if len(self.result_container.infoboxes) > 0:
self._check_infoboxes(self.result_container.infoboxes)
def has_infobox(self):
"""Check the ResultContainer has at least one infobox"""
if len(self.result_container.infoboxes) == 0:
self._record_error('No infobox')
def has_answer(self):
"""Check the ResultContainer has at least one answer"""
if len(self.result_container.answers) == 0:
self._record_error('No answer')
def has_language(self, lang):
"""Check at least one title or content of the results is written in the `lang`.
Detected using pycld3, may be not accurate"""
if lang not in self.languages:
self._record_error(lang + ' not found')
def not_empty(self):
"""Check the ResultContainer has at least one answer or infobox or result"""
result_types = set()
results = self.result_container.get_ordered_results()
if len(results) > 0:
result_types.add('results')
if len(self.result_container.answers) > 0:
result_types.add('answers')
if len(self.result_container.infoboxes) > 0:
result_types.add('infoboxes')
if len(result_types) == 0:
self._record_error('No result')
def one_title_contains(self, title: str):
"""Check one of the title contains `title` (case insensitive comparison)"""
title = title.lower()
for result in self.result_container.get_ordered_results():
if title in result['title'].lower():
return
self._record_error(('{!r} not found in the title'.format(title)))
class CheckerTests: # pylint: disable=missing-class-docstring, too-few-public-methods
__slots__ = 'test_results', 'test_name', 'result_container_tests_list'
def __init__(
self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests]
):
self.test_results = test_results
self.test_name = test_name
self.result_container_tests_list = result_container_tests_list
def unique_results(self):
"""Check the results of each ResultContainer is unique"""
urls_list = [rct.result_urls for rct in self.result_container_tests_list]
if len(urls_list[0]) > 0:
# results on the first page
for i, urls_i in enumerate(urls_list):
for j, urls_j in enumerate(urls_list):
if i < j and urls_i == urls_j:
common, diff = _search_query_diff(
self.result_container_tests_list[i].search_query,
self.result_container_tests_list[j].search_query,
)
common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
self.test_results.add_error(
self.test_name,
'results are identical for {} and {} ({})'.format(diff1_str, diff2_str, common_str),
)
class Checker: # pylint: disable=missing-class-docstring
__slots__ = 'processor', 'tests', 'test_results'
def __init__(self, processor: EngineProcessor):
self.processor = processor
self.tests = self.processor.get_tests()
self.test_results = TestResults()
@property
def engineref_list(self):
engine_name = self.processor.engine_name
engine_category = self.processor.engine.categories[0]
return [EngineRef(engine_name, engine_category)]
@staticmethod
def search_query_matrix_iterator(engineref_list, matrix):
p = []
for name, values in matrix.items():
if isinstance(values, (tuple, list)):
l = [(name, value) for value in values]
else:
l = [(name, values)]
p.append(l)
for kwargs in itertools.product(*p):
kwargs = dict(kwargs)
query = kwargs['query']
params = dict(kwargs)
del params['query']
yield SearchQuery(query, engineref_list, **params)
def call_test(self, obj, test_description):
if isinstance(test_description, (tuple, list)):
method, args = test_description[0], test_description[1:]
else:
method = test_description
args = ()
if isinstance(method, str) and hasattr(obj, method):
getattr(obj, method)(*args)
elif isinstance(method, types.FunctionType):
method(*args)
else:
self.test_results.add_error(
obj.test_name,
'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__),
)
def call_tests(self, obj, test_descriptions):
for test_description in test_descriptions:
self.call_test(obj, test_description)
def search(self, search_query: SearchQuery) -> ResultContainer:
result_container = ResultContainer()
engineref_category = search_query.engineref_list[0].category
params = self.processor.get_params(search_query, engineref_category)
if params is not None:
counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent')
self.processor.search(search_query.query, params, result_container, default_timer(), 5)
return result_container
def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
result_container = self.search(search_query)
result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
result_container_check.check_basic()
return result_container_check
def run_test(self, test_name):
test_parameters = self.tests[test_name]
search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
stop_test = False
if 'result_container' in test_parameters:
for rct in rct_list:
stop_test = stop_test or rct.stop_test
if not rct.stop_test:
self.call_tests(rct, test_parameters['result_container'])
if not stop_test:
if 'test' in test_parameters:
checker_tests = CheckerTests(self.test_results, test_name, rct_list)
self.call_tests(checker_tests, test_parameters['test'])
def run(self):
for test_name in self.tests:
self.run_test(test_name)
# clear cache
_download_and_check_if_image.cache_clear()
# force a garbage collector
gc.collect()
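
A standalone sketch of how search_query_matrix_iterator expands a test matrix into SearchQuery objects; the engineref list is left empty here for brevity:

from searx.search.checker.impl import Checker

matrix = {'query': ('time', 'news'), 'pageno': (1, 2)}
for sq in Checker.search_query_matrix_iterator([], matrix):
    print(sq.query, sq.pageno)
# prints the four combinations: time 1, time 2, news 1, news 2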

searx/search/checker/scheduler.lua Normal file

@@ -0,0 +1,36 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- This script is not a string in scheduler.py, so editors can provide syntax highlighting.
-- The Redis KEY is defined here and not in Python on purpose:
-- only this LUA script can read and update this key to avoid lock and concurrency issues.
local redis_key = 'SearXNG_checker_next_call_ts'
local now = redis.call('TIME')[1]
local start_after_from = ARGV[1]
local start_after_to = ARGV[2]
local every_from = ARGV[3]
local every_to = ARGV[4]
local next_call_ts = redis.call('GET', redis_key)
if (next_call_ts == false or next_call_ts == nil) then
-- the scheduler has never run on this Redis instance, so:
-- 1/ the scheduler does not run now
-- 2/ the next call is a random time between start_after_from and start_after_to
local initial_delay = math.random(start_after_from, start_after_to)
redis.call('SET', redis_key, now + initial_delay)
return { false, initial_delay }
end
-- next_call_ts is defined
-- --> if now is lower than next_call_ts then we don't run the embedded checker
-- --> if now is higher then we update next_call_ts and ask to run the embedded checker now.
local call_now = tonumber(next_call_ts) <= tonumber(now)
if call_now then
-- the checker runs now, define the timestamp of the next call:
-- this is a random delay between every_from and every_to
local periodic_delay = math.random(every_from, every_to)
next_call_ts = redis.call('INCRBY', redis_key, periodic_delay)
end
return { call_now, next_call_ts - now }

searx/search/checker/scheduler.py Normal file

@@ -0,0 +1,58 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
"""Lame scheduler which use Redis as a source of truth:
* the Redis key SearXNG_checker_next_call_ts contains the next time the embedded checker should run.
* to avoid lock, a unique Redis script reads and updates the Redis key SearXNG_checker_next_call_ts.
* this Redis script returns a list of two elements:
* the first one is a boolean. If True, the embedded checker must run now in this worker.
* the second element is the delay in second to wait before the next call to the Redis script.
This scheduler is not generic on purpose: if more feature are required, a dedicate scheduler must be used
(= a better scheduler should not use the web workers)
"""
import logging
import time
from pathlib import Path
from typing import Callable
from searx.redisdb import client as get_redis_client
from searx.redislib import lua_script_storage
logger = logging.getLogger('searx.search.checker')
SCHEDULER_LUA = Path(__file__).parent / "scheduler.lua"
def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable):
"""Run the checker periodically. The function never returns.
Parameters:
    * start_after_from and start_after_to: when to call "callback" for the first time on the Redis instance
    * every_from and every_to: after the first call, how often to call "callback"
    It is safe:
    * to call this function in multiple workers
    * to kill workers at any time, as long as at least one worker remains
"""
scheduler_now_script = SCHEDULER_LUA.open().read()
while True:
# ask the Redis script what to do
# the script says
# * if the checker must run now.
        # * how long to wait before calling the script again (it can be called earlier, but not later).
script = lua_script_storage(get_redis_client(), scheduler_now_script)
call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to])
# does the worker run the checker now?
if call_now:
# run the checker
try:
callback()
except Exception: # pylint: disable=broad-except
logger.exception("Error calling the embedded checker")
        # display the wait time before the next call
logger.info("Next call to the checker in %s seconds", wait_time)
# wait until the next call
time.sleep(wait_time)

searx/search/models.py Normal file

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
import typing
import babel
class EngineRef:
"""Reference by names to an engine and category"""
__slots__ = 'name', 'category'
def __init__(self, name: str, category: str):
self.name = name
self.category = category
def __repr__(self):
return "EngineRef({!r}, {!r})".format(self.name, self.category)
def __eq__(self, other):
return self.name == other.name and self.category == other.category
def __hash__(self):
return hash((self.name, self.category))
class SearchQuery:
"""container for all the search parameters (query, language, etc...)"""
__slots__ = (
'query',
'engineref_list',
'lang',
'locale',
'safesearch',
'pageno',
'time_range',
'timeout_limit',
'external_bang',
'engine_data',
'redirect_to_first_result',
)
def __init__(
self,
query: str,
engineref_list: typing.List[EngineRef],
lang: str = 'all',
safesearch: int = 0,
pageno: int = 1,
time_range: typing.Optional[str] = None,
timeout_limit: typing.Optional[float] = None,
external_bang: typing.Optional[str] = None,
engine_data: typing.Optional[typing.Dict[str, str]] = None,
redirect_to_first_result: typing.Optional[bool] = None,
): # pylint:disable=too-many-arguments
self.query = query
self.engineref_list = engineref_list
self.lang = lang
self.safesearch = safesearch
self.pageno = pageno
self.time_range = time_range
self.timeout_limit = timeout_limit
self.external_bang = external_bang
self.engine_data = engine_data or {}
self.redirect_to_first_result = redirect_to_first_result
self.locale = None
if self.lang:
try:
self.locale = babel.Locale.parse(self.lang, sep='-')
except babel.core.UnknownLocaleError:
pass
@property
def categories(self):
return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
def __repr__(self):
return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".format(
self.query,
self.engineref_list,
self.lang,
self.safesearch,
self.pageno,
self.time_range,
self.timeout_limit,
self.external_bang,
self.redirect_to_first_result,
)
def __eq__(self, other):
return (
self.query == other.query
and self.engineref_list == other.engineref_list
and self.lang == other.lang
and self.safesearch == other.safesearch
and self.pageno == other.pageno
and self.time_range == other.time_range
and self.timeout_limit == other.timeout_limit
and self.external_bang == other.external_bang
and self.redirect_to_first_result == other.redirect_to_first_result
)
def __hash__(self):
return hash(
(
self.query,
tuple(self.engineref_list),
self.lang,
self.safesearch,
self.pageno,
self.time_range,
self.timeout_limit,
self.external_bang,
self.redirect_to_first_result,
)
)
def __copy__(self):
return SearchQuery(
self.query,
self.engineref_list,
self.lang,
self.safesearch,
self.pageno,
self.time_range,
self.timeout_limit,
self.external_bang,
self.engine_data,
self.redirect_to_first_result,
)
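
A short sketch of the value semantics defined above: because EngineRef and SearchQuery implement __eq__ and __hash__, queries can be compared and used as dict keys (the engine name "wikipedia" is illustrative):

from searx.search.models import EngineRef, SearchQuery

a = SearchQuery('paris', [EngineRef('wikipedia', 'general')], lang='fr-FR')
b = SearchQuery('paris', [EngineRef('wikipedia', 'general')], lang='fr-FR')
assert a == b and hash(a) == hash(b)
print(a.locale)      # babel Locale parsed from 'fr-FR'
print(a.categories)  # ['general']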

searx/search/processors/__init__.py Normal file

@@ -0,0 +1,82 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implement request processors used by engine-types.
"""
__all__ = [
'EngineProcessor',
'OfflineProcessor',
'OnlineProcessor',
'OnlineDictionaryProcessor',
'OnlineCurrencyProcessor',
'OnlineUrlSearchProcessor',
'PROCESSORS',
]
import threading
from typing import Dict
from searx import logger
from searx import engines
from .online import OnlineProcessor
from .offline import OfflineProcessor
from .online_dictionary import OnlineDictionaryProcessor
from .online_currency import OnlineCurrencyProcessor
from .online_url_search import OnlineUrlSearchProcessor
from .abstract import EngineProcessor
logger = logger.getChild('search.processors')
PROCESSORS: Dict[str, EngineProcessor] = {}
"""Cache request processors, stored by *engine-name* (:py:func:`initialize`)
:meta hide-value:
"""
def get_processor_class(engine_type):
"""Return processor class according to the ``engine_type``"""
for c in [
OnlineProcessor,
OfflineProcessor,
OnlineDictionaryProcessor,
OnlineCurrencyProcessor,
OnlineUrlSearchProcessor,
]:
if c.engine_type == engine_type:
return c
return None
def get_processor(engine, engine_name):
"""Return processor instance that fits to ``engine.engine.type``)"""
engine_type = getattr(engine, 'engine_type', 'online')
processor_class = get_processor_class(engine_type)
if processor_class:
return processor_class(engine, engine_name)
return None
def initialize_processor(processor):
"""Initialize one processor
Call the init function of the engine
"""
if processor.has_initialize_function:
t = threading.Thread(target=processor.initialize, daemon=True)
t.start()
def initialize(engine_list):
"""Initialize all engines and store a processor for each engine in :py:obj:`PROCESSORS`."""
for engine_data in engine_list:
engine_name = engine_data['name']
engine = engines.engines.get(engine_name)
if engine:
            processor = get_processor(engine, engine_name)
            if processor is None:
                engine.logger.error('Failed to get processor for engine %s', engine_name)
            else:
                initialize_processor(processor)
                PROCESSORS[engine_name] = processor
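
A sketch of the engine_type dispatch implemented above:

from searx.search.processors import get_processor_class

print(get_processor_class('online_currency').__name__)  # OnlineCurrencyProcessor
print(get_processor_class('no-such-type'))              # None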

searx/search/processors/abstract.py Normal file

@@ -0,0 +1,195 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Abstract base classes for engine request processors.
"""
import threading
from abc import abstractmethod, ABC
from timeit import default_timer
from typing import Dict, Union
from searx import settings, logger
from searx.engines import engines
from searx.network import get_time_for_thread, get_network
from searx.metrics import histogram_observe, counter_inc, count_exception, count_error
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineResponseException
from searx.utils import get_engine_from_settings
logger = logger.getChild('searx.search.processor')
SUSPENDED_STATUS: Dict[Union[int, str], 'SuspendedStatus'] = {}
class SuspendedStatus:
"""Class to handle suspend state."""
__slots__ = 'suspend_end_time', 'suspend_reason', 'continuous_errors', 'lock'
def __init__(self):
self.lock = threading.Lock()
self.continuous_errors = 0
self.suspend_end_time = 0
self.suspend_reason = None
@property
def is_suspended(self):
return self.suspend_end_time >= default_timer()
def suspend(self, suspended_time, suspend_reason):
with self.lock:
# update continuous_errors / suspend_end_time
self.continuous_errors += 1
if suspended_time is None:
suspended_time = min(
settings['search']['max_ban_time_on_fail'],
self.continuous_errors * settings['search']['ban_time_on_fail'],
)
self.suspend_end_time = default_timer() + suspended_time
self.suspend_reason = suspend_reason
logger.debug('Suspend for %i seconds', suspended_time)
def resume(self):
with self.lock:
# reset the suspend variables
self.continuous_errors = 0
self.suspend_end_time = 0
self.suspend_reason = None
class EngineProcessor(ABC):
"""Base classes used for all types of request processors."""
__slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger'
def __init__(self, engine, engine_name: str):
self.engine = engine
self.engine_name = engine_name
self.logger = engines[engine_name].logger
key = get_network(self.engine_name)
key = id(key) if key else self.engine_name
self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus())
def initialize(self):
try:
self.engine.init(get_engine_from_settings(self.engine_name))
except SearxEngineResponseException as exc:
            self.logger.warning('Failed to initialize // %s', exc)
        except Exception:  # pylint: disable=broad-except
            self.logger.exception('Failed to initialize')
else:
self.logger.debug('Initialized')
@property
def has_initialize_function(self):
return hasattr(self.engine, 'init')
def handle_exception(self, result_container, exception_or_message, suspend=False):
# update result_container
if isinstance(exception_or_message, BaseException):
exception_class = exception_or_message.__class__
module_name = getattr(exception_class, '__module__', 'builtins')
module_name = '' if module_name == 'builtins' else module_name + '.'
error_message = module_name + exception_class.__qualname__
else:
error_message = exception_or_message
result_container.add_unresponsive_engine(self.engine_name, error_message)
# metrics
counter_inc('engine', self.engine_name, 'search', 'count', 'error')
if isinstance(exception_or_message, BaseException):
count_exception(self.engine_name, exception_or_message)
else:
count_error(self.engine_name, exception_or_message)
# suspend the engine ?
if suspend:
suspended_time = None
if isinstance(exception_or_message, SearxEngineAccessDeniedException):
suspended_time = exception_or_message.suspended_time
self.suspended_status.suspend(suspended_time, error_message) # pylint: disable=no-member
def _extend_container_basic(self, result_container, start_time, search_results):
# update result_container
result_container.extend(self.engine_name, search_results)
engine_time = default_timer() - start_time
page_load_time = get_time_for_thread()
result_container.add_timing(self.engine_name, engine_time, page_load_time)
# metrics
counter_inc('engine', self.engine_name, 'search', 'count', 'successful')
histogram_observe(engine_time, 'engine', self.engine_name, 'time', 'total')
if page_load_time is not None:
histogram_observe(page_load_time, 'engine', self.engine_name, 'time', 'http')
def extend_container(self, result_container, start_time, search_results):
if getattr(threading.current_thread(), '_timeout', False):
# the main thread is not waiting anymore
            self.handle_exception(result_container, 'timeout')
else:
# check if the engine accepted the request
if search_results is not None:
self._extend_container_basic(result_container, start_time, search_results)
self.suspended_status.resume()
def extend_container_if_suspended(self, result_container):
if self.suspended_status.is_suspended:
result_container.add_unresponsive_engine(
self.engine_name, self.suspended_status.suspend_reason, suspended=True
)
return True
return False
def get_params(self, search_query, engine_category):
"""Returns a set of (see :ref:`request params <engine request arguments>`) or
``None`` if request is not supported.
Not supported conditions (``None`` is returned):
- A page-number > 1 when engine does not support paging.
- A time range when the engine does not support time range.
"""
# if paging is not supported, skip
if search_query.pageno > 1 and not self.engine.paging:
return None
# if max page is reached, skip
max_page = self.engine.max_page or settings['search']['max_page']
if max_page and max_page < search_query.pageno:
return None
# if time_range is not supported, skip
if search_query.time_range and not self.engine.time_range_support:
return None
params = {}
params["query"] = search_query.query
params['category'] = engine_category
params['pageno'] = search_query.pageno
params['safesearch'] = search_query.safesearch
params['time_range'] = search_query.time_range
params['engine_data'] = search_query.engine_data.get(self.engine_name, {})
params['searxng_locale'] = search_query.lang
# deprecated / vintage --> use params['searxng_locale']
#
# Conditions related to engine's traits are implemented in engine.traits
# module. Don't do 'locale' decisions here in the abstract layer of the
# search processor, just pass the value from user's choice unchanged to
# the engine request.
if hasattr(self.engine, 'language') and self.engine.language:
params['language'] = self.engine.language
else:
params['language'] = search_query.lang
return params
@abstractmethod
def search(self, query, params, result_container, start_time, timeout_limit):
pass
def get_tests(self):
tests = getattr(self.engine, 'tests', None)
if tests is None:
tests = getattr(self.engine, 'additional_tests', {})
tests.update(self.get_default_tests())
return tests
def get_default_tests(self):
return {}
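
A standalone sketch of the suspend/resume bookkeeping above; an explicit suspended_time is passed, so settings['search'] is not consulted:

from searx.search.processors.abstract import SuspendedStatus

status = SuspendedStatus()
status.suspend(60, 'CAPTCHA')  # suspend for 60 seconds
print(status.is_suspended)     # True for the next 60 seconds
print(status.suspend_reason)   # CAPTCHA
status.resume()
print(status.is_suspended)     # False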

searx/search/processors/offline.py Normal file

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processors for engine-type: ``offline``
"""
from .abstract import EngineProcessor
class OfflineProcessor(EngineProcessor):
"""Processor class used by ``offline`` engines"""
engine_type = 'offline'
def _search_basic(self, query, params):
return self.engine.search(query, params)
def search(self, query, params, result_container, start_time, timeout_limit):
try:
search_results = self._search_basic(query, params)
self.extend_container(result_container, start_time, search_results)
except ValueError as e:
# do not record the error
self.logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e))
except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, e)
self.logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
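
A hypothetical minimal engine module compatible with this processor; an "offline" engine only needs a search(query, params) function that returns result dicts, which OfflineProcessor calls directly instead of sending an HTTP request:

# hypothetical offline engine module
engine_type = 'offline'
categories = ['general']
paging = False

def search(query, params):
    # return a list of result dicts, as an online engine's response() would
    return [{
        'url': 'local://echo',
        'title': 'echo: ' + query,
        'content': 'computed locally, page %s' % params['pageno'],
    }]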

searx/search/processors/online.py Normal file

@@ -0,0 +1,233 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processors for engine-type: ``online``
"""
# pylint: disable=use-dict-literal
from timeit import default_timer
import asyncio
import ssl
import httpx
import searx.network
from searx.utils import gen_useragent
from searx.exceptions import (
SearxEngineAccessDeniedException,
SearxEngineCaptchaException,
SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error
from .abstract import EngineProcessor
def default_request_params():
"""Default request parameters for ``online`` engines."""
return {
# fmt: off
'method': 'GET',
'headers': {},
'data': {},
'url': '',
'cookies': {},
'auth': None
# fmt: on
}
class OnlineProcessor(EngineProcessor):
"""Processor class for ``online`` engines."""
engine_type = 'online'
def initialize(self):
# set timeout for all HTTP requests
searx.network.set_timeout_for_thread(self.engine.timeout, start_time=default_timer())
# reset the HTTP total time
searx.network.reset_time_for_thread()
# set the network
searx.network.set_context_network_name(self.engine_name)
super().initialize()
def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online>` or ``None``
if request is not supported.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None
# add default params
params.update(default_request_params())
        # add a user agent
params['headers']['User-Agent'] = gen_useragent()
# add Accept-Language header
if self.engine.send_accept_language_header and search_query.locale:
ac_lang = search_query.locale.language
if search_query.locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
search_query.locale.language,
search_query.locale.territory,
search_query.locale.language,
)
params['headers']['Accept-Language'] = ac_lang
self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', ''))
return params
def _send_http_request(self, params):
        # create a dictionary which contains all
        # information about the request
request_args = dict(headers=params['headers'], cookies=params['cookies'], auth=params['auth'])
# verify
# if not None, it overrides the verify value defined in the network.
# use False to accept any server certificate
# use a path to file to specify a server certificate
verify = params.get('verify')
if verify is not None:
request_args['verify'] = params['verify']
# max_redirects
max_redirects = params.get('max_redirects')
if max_redirects:
request_args['max_redirects'] = max_redirects
# allow_redirects
if 'allow_redirects' in params:
request_args['allow_redirects'] = params['allow_redirects']
# soft_max_redirects
soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)
# raise_for_status
request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)
# specific type of request (GET or POST)
if params['method'] == 'GET':
req = searx.network.get
else:
req = searx.network.post
request_args['data'] = params['data']
# send the request
response = req(params['url'], **request_args)
# check soft limit of the redirect count
if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error
# but the engine might still return valid results.
status_code = str(response.status_code or '')
reason = response.reason_phrase or ''
hostname = response.url.host
count_error(
self.engine_name,
'{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
(status_code, reason, hostname),
secondary=True,
)
return response
def _search_basic(self, query, params):
# update request parameters dependent on
# search-engine (contained in engines folder)
self.engine.request(query, params)
# ignoring empty urls
if not params['url']:
return None
# send request
response = self._send_http_request(params)
# parse the response
response.search_params = params
return self.engine.response(response)
def search(self, query, params, result_container, start_time, timeout_limit):
# set timeout for all HTTP requests
searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
# reset the HTTP total time
searx.network.reset_time_for_thread()
# set the network
searx.network.set_context_network_name(self.engine_name)
try:
# send requests and parse the results
search_results = self._search_basic(query, params)
self.extend_container(result_container, start_time, search_results)
except ssl.SSLError as e:
            # SSL error (e.g. certificate verification failure)
self.handle_exception(result_container, e, suspend=True)
self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine_name).verify))
except (httpx.TimeoutException, asyncio.TimeoutError) as e:
# requests timeout (connect or read)
self.handle_exception(result_container, e, suspend=True)
self.logger.error(
"HTTP requests timeout (search duration : {0} s, timeout: {1} s) : {2}".format(
default_timer() - start_time, timeout_limit, e.__class__.__name__
)
)
except (httpx.HTTPError, httpx.StreamError) as e:
# other requests exception
self.handle_exception(result_container, e, suspend=True)
self.logger.exception(
"requests exception (search duration : {0} s, timeout: {1} s) : {2}".format(
default_timer() - start_time, timeout_limit, e
)
)
except SearxEngineCaptchaException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('CAPTCHA')
except SearxEngineTooManyRequestsException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('Too many requests')
except SearxEngineAccessDeniedException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('SearXNG is blocked')
except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, e)
self.logger.exception('exception : {0}'.format(e))
def get_default_tests(self):
tests = {}
tests['simple'] = {
'matrix': {'query': ('life', 'computer')},
'result_container': ['not_empty'],
}
if getattr(self.engine, 'paging', False):
tests['paging'] = {
'matrix': {'query': 'time', 'pageno': (1, 2, 3)},
'result_container': ['not_empty'],
'test': ['unique_results'],
}
if 'general' in self.engine.categories:
# avoid documentation about HTML tags (<time> and <input type="time">)
tests['paging']['matrix']['query'] = 'news'
        if getattr(self.engine, 'time_range_support', False):
tests['time_range'] = {
'matrix': {'query': 'news', 'time_range': (None, 'day')},
'result_container': ['not_empty'],
'test': ['unique_results'],
}
if getattr(self.engine, 'traits', False):
tests['lang_fr'] = {
'matrix': {'query': 'paris', 'lang': 'fr'},
'result_container': ['not_empty', ('has_language', 'fr')],
}
tests['lang_en'] = {
'matrix': {'query': 'paris', 'lang': 'en'},
'result_container': ['not_empty', ('has_language', 'en')],
}
if getattr(self.engine, 'safesearch', False):
tests['safesearch'] = {'matrix': {'query': 'porn', 'safesearch': (0, 2)}, 'test': ['unique_results']}
return tests
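
A hypothetical minimal engine module for this processor: request() fills params['url'] and response() turns the HTTP reply into result dicts (the URL is a placeholder):

# hypothetical online engine module
from urllib.parse import urlencode

categories = ['general']
paging = False

def request(query, params):
    params['url'] = 'https://example.org/search?' + urlencode({'q': query})
    return params

def response(resp):
    # resp is the httpx response; the processor attaches resp.search_params
    return [{'url': str(resp.url), 'title': 'example', 'content': resp.text[:80]}]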

searx/search/processors/online_currency.py Normal file

@@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processors for engine-type: ``online_currency``
"""
import unicodedata
import re
from searx.data import CURRENCIES
from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
def normalize_name(name: str):
name = name.strip()
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines."""
engine_type = 'online_currency'
def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online_currency>`
        or ``None`` if the search query does not match :py:obj:`parser_re`."""
params = super().get_params(search_query, engine_category)
if params is None:
return None
m = parser_re.match(search_query.query)
if not m:
return None
amount_str, from_currency, to_currency = m.groups()
try:
amount = float(amount_str)
except ValueError:
return None
from_currency = CURRENCIES.name_to_iso4217(normalize_name(from_currency))
to_currency = CURRENCIES.name_to_iso4217(normalize_name(to_currency))
params['amount'] = amount
params['from'] = from_currency
params['to'] = to_currency
params['from_name'] = CURRENCIES.iso4217_to_name(from_currency, "en")
params['to_name'] = CURRENCIES.iso4217_to_name(to_currency, "en")
return params
def get_default_tests(self):
tests = {}
tests['currency'] = {
'matrix': {'query': '1337 usd in rmb'},
'result_container': ['has_answer'],
}
return tests
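
A standalone sketch of the currency query parsing above (regex only, no engine required):

import re

parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
print(parser_re.match('1337 usd in rmb').groups())  # ('1337', 'usd', 'rmb')
print(parser_re.match('usd in rmb'))                # None: an amount is required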

searx/search/processors/online_dictionary.py Normal file

@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processors for engine-type: ``online_dictionary``
"""
import re
from searx.utils import is_valid_lang
from .online import OnlineProcessor
parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.+)$', re.I)
class OnlineDictionaryProcessor(OnlineProcessor):
"""Processor class used by ``online_dictionary`` engines."""
engine_type = 'online_dictionary'
def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online_dictionary>` or
        ``None`` if the search query does not match :py:obj:`parser_re`.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None
m = parser_re.match(search_query.query)
if not m:
return None
from_lang, to_lang, query = m.groups()
from_lang = is_valid_lang(from_lang)
to_lang = is_valid_lang(to_lang)
if not from_lang or not to_lang:
return None
params['from_lang'] = from_lang
params['to_lang'] = to_lang
params['query'] = query
return params
def get_default_tests(self):
tests = {}
if getattr(self.engine, 'paging', False):
tests['translation_paging'] = {
'matrix': {'query': 'en-es house', 'pageno': (1, 2, 3)},
'result_container': ['not_empty', ('one_title_contains', 'house')],
'test': ['unique_results'],
}
else:
tests['translation'] = {
'matrix': {'query': 'en-es house'},
'result_container': ['not_empty', ('one_title_contains', 'house')],
}
return tests
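
A standalone sketch of the dictionary query parsing above (regex only):

import re

parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.+)$', re.I)
print(parser_re.match('en-es house').groups())  # ('en', 'es', 'house')
print(parser_re.match('house'))                 # None: no "from-to" language pair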

searx/search/processors/online_url_search.py Normal file

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processors for engine-type: ``online_url_search``
"""
import re
from .online import OnlineProcessor
re_search_urls = {
'http': re.compile(r'https?:\/\/[^ ]*'),
'ftp': re.compile(r'ftps?:\/\/[^ ]*'),
'data:image': re.compile('data:image/[^; ]*;base64,[^ ]*'),
}
class OnlineUrlSearchProcessor(OnlineProcessor):
"""Processor class used by ``online_url_search`` engines."""
engine_type = 'online_url_search'
def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online>` or ``None`` if
        the search query does not match :py:obj:`re_search_urls`.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None
url_match = False
search_urls = {}
for k, v in re_search_urls.items():
m = v.search(search_query.query)
v = None
if m:
url_match = True
v = m[0]
search_urls[k] = v
if not url_match:
return None
params['search_urls'] = search_urls
return params
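
A standalone sketch of the URL extraction above (regex only):

import re

re_search_urls = {
    'http': re.compile(r'https?:\/\/[^ ]*'),
    'ftp': re.compile(r'ftps?:\/\/[^ ]*'),
    'data:image': re.compile('data:image/[^; ]*;base64,[^ ]*'),
}
query = 'what is https://example.org/page about'
for kind, pattern in re_search_urls.items():
    m = pattern.search(query)
    print(kind, m[0] if m else None)
# http -> https://example.org/page, ftp -> None, data:image -> None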