first commit

Iyas Altawil
2025-06-26 15:38:10 +03:30
commit e928faf6d2
899 changed files with 403713 additions and 0 deletions

127
searx/__init__.py Normal file

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, cyclic-import
import sys
import os
from os.path import dirname, abspath
import logging
import searx.unixthreadname
import searx.settings_loader
from searx.settings_defaults import SCHEMA, apply_schema
# Debug
LOG_FORMAT_DEBUG = '%(levelname)-7s %(name)-30.30s: %(message)s'
# Production
LOG_FORMAT_PROD = '%(asctime)-15s %(levelname)s:%(name)s: %(message)s'
LOG_LEVEL_PROD = logging.WARNING
searx_dir = abspath(dirname(__file__))
searx_parent_dir = abspath(dirname(dirname(__file__)))
settings = {}
sxng_debug = False
logger = logging.getLogger('searx')
_unset = object()
def init_settings():
"""Initialize global ``settings`` and ``sxng_debug`` variables and
``logger`` from ``SEARXNG_SETTINGS_PATH``.
"""
global settings, sxng_debug # pylint: disable=global-variable-not-assigned
cfg, msg = searx.settings_loader.load_settings(load_user_settings=True)
cfg = cfg or {}
apply_schema(cfg, SCHEMA, [])
settings.clear()
settings.update(cfg)
sxng_debug = get_setting("general.debug")
if sxng_debug:
_logging_config_debug()
else:
logging.basicConfig(level=LOG_LEVEL_PROD, format=LOG_FORMAT_PROD)
logging.root.setLevel(level=LOG_LEVEL_PROD)
logging.getLogger('werkzeug').setLevel(level=LOG_LEVEL_PROD)
logger.info(msg)
# log max_request_timeout
max_request_timeout = settings['outgoing']['max_request_timeout']
if max_request_timeout is None:
logger.info('max_request_timeout=%s', repr(max_request_timeout))
else:
logger.info('max_request_timeout=%i second(s)', max_request_timeout)
if settings['server']['public_instance']:
logger.warning(
"Be aware you have activated features intended only for public instances. "
"This force the usage of the limiter and link_token / "
"see https://docs.searxng.org/admin/searx.limiter.html"
)
def get_setting(name, default=_unset):
"""Returns the value to which ``name`` point. If there is no such name in the
settings and the ``default`` is unset, a :py:obj:`KeyError` is raised.
"""
value = settings
for a in name.split('.'):
if isinstance(value, dict):
value = value.get(a, _unset)
else:
value = _unset
if value is _unset:
if default is _unset:
raise KeyError(name)
value = default
break
return value
def _is_color_terminal():
if os.getenv('TERM') in ('dumb', 'unknown'):
return False
return sys.stdout.isatty()
def _logging_config_debug():
try:
import coloredlogs # pylint: disable=import-outside-toplevel
except ImportError:
coloredlogs = None
log_level = os.environ.get('SEARXNG_DEBUG_LOG_LEVEL', 'DEBUG')
if coloredlogs and _is_color_terminal():
level_styles = {
'spam': {'color': 'green', 'faint': True},
'debug': {},
'notice': {'color': 'magenta'},
'success': {'bold': True, 'color': 'green'},
'info': {'bold': True, 'color': 'cyan'},
'warning': {'color': 'yellow'},
'error': {'color': 'red'},
'critical': {'bold': True, 'color': 'red'},
}
field_styles = {
'asctime': {'color': 'green'},
'hostname': {'color': 'magenta'},
'levelname': {'color': 8},
'name': {'color': 8},
'programname': {'color': 'cyan'},
'username': {'color': 'yellow'},
}
coloredlogs.install(level=log_level, level_styles=level_styles, field_styles=field_styles, fmt=LOG_FORMAT_DEBUG)
else:
logging.basicConfig(level=logging.getLevelName(log_level), format=LOG_FORMAT_DEBUG)
init_settings()

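For illustration, a minimal usage sketch of the dotted-path lookup done by
``get_setting()``; the paths and defaults below are examples, any path into the
settings dictionary works the same way:

.. code:: python

    from searx import get_setting

    # nested lookup via a dotted path
    debug = get_setting("general.debug")

    # a default avoids the KeyError raised for unknown paths
    timeout = get_setting("outgoing.request_timeout", default=3.0)
    missing = get_setting("no.such.key", default=None)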
49
searx/answerers/__init__.py Normal file

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The *answerers* give instant answers related to the search query, they
usually provide answers of type :py:obj:`Answer <searx.result_types.Answer>`.
Here is an example of a very simple answerer that adds a "Hello" into the answer
area:
.. code::
from flask_babel import gettext as _
from searx.answerers import Answerer, AnswererInfo
from searx.result_types import Answer
class MyAnswerer(Answerer):
keywords = [ "hello", "hello world" ]
def info(self):
return AnswererInfo(name=_("Hello"), description=_("lorem .."), keywords=self.keywords)
def answer(self, request, search):
return [ Answer(answer="Hello") ]
----
.. autoclass:: Answerer
:members:
.. autoclass:: AnswererInfo
:members:
.. autoclass:: AnswerStorage
:members:
.. autoclass:: searx.answerers._core.ModuleAnswerer
:members:
:show-inheritance:
"""
from __future__ import annotations
__all__ = ["AnswererInfo", "Answerer", "AnswerStorage"]
from ._core import AnswererInfo, Answerer, AnswerStorage
STORAGE: AnswerStorage = AnswerStorage()
STORAGE.load_builtins()

169
searx/answerers/_core.py Normal file

@@ -0,0 +1,169 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=too-few-public-methods, missing-module-docstring
from __future__ import annotations
import abc
import importlib
import logging
import pathlib
import warnings
from dataclasses import dataclass
from searx.utils import load_module
from searx.result_types.answer import BaseAnswer
_default = pathlib.Path(__file__).parent
log: logging.Logger = logging.getLogger("searx.answerers")
@dataclass
class AnswererInfo:
"""Object that holds information about an answerer, these infos are shown
to the user in the Preferences menu.
To be able to translate the information into other languages, the text must
be written in English and translated with :py:obj:`flask_babel.gettext`.
"""
name: str
"""Name of the *answerer*."""
description: str
"""Short description of the *answerer*."""
examples: list[str]
"""List of short examples of the usage / of query terms."""
keywords: list[str]
"""See :py:obj:`Answerer.keywords`"""
class Answerer(abc.ABC):
"""Abstract base class of answerers."""
keywords: list[str]
"""Keywords to which the answerer has *answers*."""
@abc.abstractmethod
def answer(self, query: str) -> list[BaseAnswer]:
"""Function that returns a list of answers to the question/query."""
@abc.abstractmethod
def info(self) -> AnswererInfo:
"""Information about the *answerer*, see :py:obj:`AnswererInfo`."""
class ModuleAnswerer(Answerer):
"""A wrapper class for legacy *answerers* where the names (keywords, answer,
info) are implemented on the module level (not in a class).
.. note::
For internal use only!
"""
def __init__(self, mod):
for name in ["keywords", "self_info", "answer"]:
if not getattr(mod, name, None):
raise SystemExit(2)
if not isinstance(mod.keywords, tuple):
raise SystemExit(2)
self.module = mod
self.keywords = mod.keywords # type: ignore
def answer(self, query: str) -> list[BaseAnswer]:
return self.module.answer(query)
def info(self) -> AnswererInfo:
kwargs = self.module.self_info()
kwargs["keywords"] = self.keywords
return AnswererInfo(**kwargs)
class AnswerStorage(dict):
"""A storage for managing the *answerers* of SearXNG. With the
:py:obj:`AnswerStorage.ask`” method, a caller can ask questions to all
*answerers* and receives a list of the results."""
answerer_list: set[Answerer]
"""The list of :py:obj:`Answerer` in this storage."""
def __init__(self):
super().__init__()
self.answerer_list = set()
def load_builtins(self):
"""Loads ``answerer.py`` modules from the python packages in
:origin:`searx/answerers`. The python modules are wrapped by
:py:obj:`ModuleAnswerer`."""
for f in _default.iterdir():
if f.name.startswith("_"):
continue
if f.is_file() and f.suffix == ".py":
self.register_by_fqn(f"searx.answerers.{f.stem}.SXNGAnswerer")
continue
# for backward compatibility (if a fork has additional answerers)
if f.is_dir() and (f / "answerer.py").exists():
warnings.warn(
f"answerer module {f} is deprecated / migrate to searx.answerers.Answerer", DeprecationWarning
)
mod = load_module("answerer.py", str(f))
self.register(ModuleAnswerer(mod))
def register_by_fqn(self, fqn: str):
"""Register a :py:obj:`Answerer` via its fully qualified class namen(FQN)."""
mod_name, _, obj_name = fqn.rpartition('.')
mod = importlib.import_module(mod_name)
code_obj = getattr(mod, obj_name, None)
if code_obj is None:
msg = f"answerer {fqn} is not implemented"
log.critical(msg)
raise ValueError(msg)
self.register(code_obj())
def register(self, answerer: Answerer):
"""Register a :py:obj:`Answerer`."""
self.answerer_list.add(answerer)
for _kw in answerer.keywords:
self[_kw] = self.get(_kw, [])
self[_kw].append(answerer)
def ask(self, query: str) -> list[BaseAnswer]:
"""An answerer is identified via keywords, if there is a keyword at the
first position in the ``query`` for which there is one or more
answerers, then these are called, whereby the entire ``query`` is passed
as argument to the answerer function."""
results = []
keyword = None
for keyword in query.split():
if keyword:
break
if not keyword or keyword not in self:
return results
for answerer in self[keyword]:
for answer in answerer.answer(query):
# for *answers* the engine prefix ``answerer:`` is set, see searx.result_types.Result
answer.engine = f"answerer: {keyword}"
results.append(answer)
return results
@property
def info(self) -> list[AnswererInfo]:
return [a.info() for a in self.answerer_list]

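As a sketch of how the storage is meant to be used: ``MyAnswerer`` below is a
hypothetical answerer, ``STORAGE`` is the instance created in
``searx/answerers/__init__.py``:

.. code:: python

    from searx.answerers import Answerer, AnswererInfo, STORAGE
    from searx.result_types import Answer

    class MyAnswerer(Answerer):
        # hypothetical answerer, bound to the keyword "hello"
        keywords = ["hello"]

        def info(self) -> AnswererInfo:
            return AnswererInfo(
                name="Hello",
                description="demo answerer",
                examples=["hello"],
                keywords=self.keywords,
            )

        def answer(self, query: str) -> list[Answer]:
            return [Answer(answer="Hello")]

    STORAGE.register(MyAnswerer())
    # the first word of the query selects the answerers, the whole
    # query is passed to each of them
    answers = STORAGE.ask("hello world")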
80
searx/answerers/random.py Normal file

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations
import hashlib
import random
import string
import uuid
from flask_babel import gettext
from searx.result_types import Answer
from searx.result_types.answer import BaseAnswer
from . import Answerer, AnswererInfo
def random_characters():
random_string_letters = string.ascii_lowercase + string.digits + string.ascii_uppercase
return [random.choice(random_string_letters) for _ in range(random.randint(8, 32))]
def random_string():
return ''.join(random_characters())
def random_float():
return str(random.random())
def random_int():
random_int_max = 2**31
return str(random.randint(-random_int_max, random_int_max))
def random_sha256():
m = hashlib.sha256()
m.update(''.join(random_characters()).encode())
return str(m.hexdigest())
def random_uuid():
return str(uuid.uuid4())
def random_color():
color = "%06x" % random.randint(0, 0xFFFFFF)
return f"#{color.upper()}"
class SXNGAnswerer(Answerer):
"""Random value generator"""
keywords = ["random"]
random_types = {
"string": random_string,
"int": random_int,
"float": random_float,
"sha256": random_sha256,
"uuid": random_uuid,
"color": random_color,
}
def info(self):
return AnswererInfo(
name=gettext(self.__doc__),
description=gettext("Generate different random values"),
keywords=self.keywords,
examples=[f"random {x}" for x in self.random_types],
)
def answer(self, query: str) -> list[BaseAnswer]:
parts = query.split()
if len(parts) != 2 or parts[1] not in self.random_types:
return []
return [Answer(answer=self.random_types[parts[1]]())]

64
searx/answerers/statistics.py Normal file

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations
from functools import reduce
from operator import mul
import babel
import babel.numbers
from flask_babel import gettext
from searx.extended_types import sxng_request
from searx.result_types import Answer
from searx.result_types.answer import BaseAnswer
from . import Answerer, AnswererInfo
kw2func = [
("min", min),
("max", max),
("avg", lambda args: sum(args) / len(args)),
("sum", sum),
("prod", lambda args: reduce(mul, args, 1)),
]
class SXNGAnswerer(Answerer):
"""Statistics functions"""
keywords = [kw for kw, _ in kw2func]
def info(self):
return AnswererInfo(
name=gettext(self.__doc__),
description=gettext("Compute {func} of the arguments".format(func='/'.join(self.keywords))),
keywords=self.keywords,
examples=["avg 123 548 2.04 24.2"],
)
def answer(self, query: str) -> list[BaseAnswer]:
results = []
parts = query.split()
if len(parts) < 2:
return results
ui_locale = babel.Locale.parse(sxng_request.preferences.get_value('locale'), sep='-')
try:
args = [babel.numbers.parse_decimal(num, ui_locale, numbering_system="latn") for num in parts[1:]]
except: # pylint: disable=bare-except
# seems one of the args is not a number and can't be parsed as a decimal
return results
for k, func in kw2func:
if k == parts[0]:
res = func(args)
res = babel.numbers.format_decimal(res, locale=ui_locale)
f_str = ', '.join(babel.numbers.format_decimal(arg, locale=ui_locale) for arg in args)
results.append(Answer(answer=f"[{ui_locale}] {k}({f_str}) = {res} "))
break
return results

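The locale handling above can be sketched in isolation; the German locale and
the input string are only examples:

.. code:: python

    import babel
    import babel.numbers

    locale = babel.Locale.parse("de")

    # "1.234,5" is the German notation of 1234.5
    num = babel.numbers.parse_decimal("1.234,5", locale, numbering_system="latn")
    print(babel.numbers.format_decimal(num, locale=locale))  # --> 1.234,5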
384
searx/autocomplete.py Normal file

@@ -0,0 +1,384 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements functions needed for the autocompleter.
"""
# pylint: disable=use-dict-literal
import json
import html
from urllib.parse import urlencode, quote_plus
import lxml.etree
import lxml.html
from httpx import HTTPError
from searx.extended_types import SXNG_Response
from searx import settings
from searx.engines import (
engines,
google,
)
from searx.network import get as http_get, post as http_post
from searx.exceptions import SearxEngineResponseException
from searx.utils import extr, gen_useragent
def update_kwargs(**kwargs):
if 'timeout' not in kwargs:
kwargs['timeout'] = settings['outgoing']['request_timeout']
kwargs['raise_for_httperror'] = True
return kwargs
def get(*args, **kwargs) -> SXNG_Response:
kwargs = update_kwargs(**kwargs)
return http_get(*args, **kwargs)
def post(*args, **kwargs) -> SXNG_Response:
kwargs = update_kwargs(**kwargs)
return http_post(*args, **kwargs)
def baidu(query, _lang):
# baidu search autocompleter
base_url = "https://www.baidu.com/sugrec?"
response = get(base_url + urlencode({'ie': 'utf-8', 'json': 1, 'prod': 'pc', 'wd': query}))
results = []
if response.ok:
data = response.json()
if 'g' in data:
for item in data['g']:
results.append(item['q'])
return results
def brave(query, _lang):
# brave search autocompleter
url = 'https://search.brave.com/api/suggest?'
url += urlencode({'q': query})
country = 'all'
# if lang in _brave:
# country = lang
kwargs = {'cookies': {'country': country}}
resp = get(url, **kwargs)
results = []
if resp.ok:
data = resp.json()
for item in data[1]:
results.append(item)
return results
def dbpedia(query, _lang):
# dbpedia autocompleter
autocomplete_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'
response = get(autocomplete_url + urlencode(dict(QueryString=query)))
results = []
if response.ok:
dom = lxml.etree.fromstring(response.content)
results = dom.xpath('//Result/Label//text()')
return results
def duckduckgo(query, sxng_locale):
"""Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
traits = engines['duckduckgo'].traits
args = {
'q': query,
'kl': traits.get_region(sxng_locale, traits.all_locale),
}
url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
resp = get(url)
ret_val = []
if resp.ok:
j = resp.json()
if len(j) > 1:
ret_val = j[1]
return ret_val
def google_complete(query, sxng_locale):
"""Autocomplete from Google. Supports Google's languages and subdomains
(:py:obj:`searx.engines.google.get_google_info`) by using the async REST
API::
https://{subdomain}/complete/search?{args}
"""
google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
url = 'https://{subdomain}/complete/search?{args}'
args = urlencode(
{
'q': query,
'client': 'gws-wiz',
'hl': google_info['params']['hl'],
}
)
results = []
resp = get(url.format(subdomain=google_info['subdomain'], args=args))
if resp and resp.ok:
json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
data = json.loads(json_txt)
for item in data[0]:
results.append(lxml.html.fromstring(item[0]).text_content())
return results
def mwmbl(query, _lang):
"""Autocomplete from Mwmbl_."""
# mwmbl autocompleter
url = 'https://api.mwmbl.org/search/complete?{query}'
results = get(url.format(query=urlencode({'q': query}))).json()[1]
# results starting with `go:` or `search:` are commands / direct urls and not useful for autocompletion
return [result for result in results if not result.startswith("go: ") and not result.startswith("search: ")]
def naver(query, _lang):
# Naver search autocompleter
url = f"https://ac.search.naver.com/nx/ac?{urlencode({'q': query, 'r_format': 'json', 'st': 0})}"
response = get(url)
results = []
if response.ok:
data = response.json()
if data.get('items'):
for item in data['items'][0]:
results.append(item[0])
return results
def qihu360search(query, _lang):
# 360Search search autocompleter
url = f"https://sug.so.360.cn/suggest?{urlencode({'format': 'json', 'word': query})}"
response = get(url)
results = []
if response.ok:
data = response.json()
if 'result' in data:
for item in data['result']:
results.append(item['word'])
return results
def quark(query, _lang):
# Quark search autocompleter
url = f"https://sugs.m.sm.cn/web?{urlencode({'q': query})}"
response = get(url)
results = []
if response.ok:
data = response.json()
for item in data.get('r', []):
results.append(item['w'])
return results
def seznam(query, _lang):
# seznam search autocompleter
url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
resp = get(
url.format(
query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)
)
)
if not resp.ok:
return []
data = resp.json()
return [
''.join([part.get('text', '') for part in item.get('text', [])])
for item in data.get('result', [])
if item.get('itemType', None) == 'ItemType.TEXT'
]
def sogou(query, _lang):
# Sogou search autocompleter
base_url = "https://sor.html5.qq.com/api/getsug?"
response = get(base_url + urlencode({'m': 'searxng', 'key': query}))
if response.ok:
raw_json = extr(response.text, "[", "]", default="")
try:
data = json.loads(f"[{raw_json}]]")
return data[1]
except json.JSONDecodeError:
return []
return []
def startpage(query, sxng_locale):
"""Autocomplete from Startpage's Firefox extension.
Supports the languages specified in lang_map.
"""
lang_map = {
'da': 'dansk',
'de': 'deutsch',
'en': 'english',
'es': 'espanol',
'fr': 'francais',
'nb': 'norsk',
'nl': 'nederlands',
'pl': 'polski',
'pt': 'portugues',
'sv': 'svenska',
}
base_lang = sxng_locale.split('-')[0]
lui = lang_map.get(base_lang, 'english')
url_params = {
'q': query,
'format': 'opensearch',
'segment': 'startpage.defaultffx',
'lui': lui,
}
url = f'https://www.startpage.com/suggestions?{urlencode(url_params)}'
# Needs user agent, returns a 204 otherwise
h = {'User-Agent': gen_useragent()}
resp = get(url, headers=h)
if resp.ok:
try:
data = resp.json()
if len(data) >= 2 and isinstance(data[1], list):
return data[1]
except json.JSONDecodeError:
pass
return []
def stract(query, _lang):
# stract autocompleter (beta)
url = f"https://stract.com/beta/api/autosuggest?q={quote_plus(query)}"
resp = post(url)
if not resp.ok:
return []
return [html.unescape(suggestion['raw']) for suggestion in resp.json()]
def swisscows(query, _lang):
# swisscows autocompleter
url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
return resp
def qwant(query, sxng_locale):
"""Autocomplete from Qwant. Supports Qwant's regions."""
results = []
locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
url = 'https://api.qwant.com/v3/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
if resp.ok:
data = resp.json()
if data['status'] == 'success':
for item in data['data']['items']:
results.append(item['value'])
return results
def wikipedia(query, sxng_locale):
"""Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
results = []
eng_traits = engines['wikipedia'].traits
wiki_lang = eng_traits.get_language(sxng_locale, 'en')
wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org') # type: ignore
url = 'https://{wiki_netloc}/w/api.php?{args}'
args = urlencode(
{
'action': 'opensearch',
'format': 'json',
'formatversion': '2',
'search': query,
'namespace': '0',
'limit': '10',
}
)
resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
if resp.ok:
data = resp.json()
if len(data) > 1:
results = data[1]
return results
def yandex(query, _lang):
# yandex autocompleter
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
if len(resp) > 1:
return resp[1]
return []
backends = {
'360search': qihu360search,
'baidu': baidu,
'brave': brave,
'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
'google': google_complete,
'mwmbl': mwmbl,
'naver': naver,
'quark': quark,
'qwant': qwant,
'seznam': seznam,
'sogou': sogou,
'startpage': startpage,
'stract': stract,
'swisscows': swisscows,
'wikipedia': wikipedia,
'yandex': yandex,
}
def search_autocomplete(backend_name, query, sxng_locale):
backend = backends.get(backend_name)
if backend is None:
return []
try:
return backend(query, sxng_locale)
except (HTTPError, SearxEngineResponseException):
return []

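A minimal sketch of calling the dispatcher; the backend name and locale are
examples, and backends like ``duckduckgo`` assume initialized engine traits and
network access:

.. code:: python

    from searx.autocomplete import search_autocomplete

    # dispatches to the duckduckgo() backend; network errors yield []
    for suggestion in search_autocomplete("duckduckgo", "searxng", "en-US"):
        print(suggestion)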
58
searx/babel_extract.py Normal file

@@ -0,0 +1,58 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements the :origin:`searxng_msg <babel.cfg>` extractor to
extract messages from:
- :origin:`searx/searxng.msg`
The ``searxng.msg`` files are selected by Babel_, see Babel's configuration in
:origin:`babel.cfg`::
searxng_msg = searx.babel_extract.extract
...
[searxng_msg: **/searxng.msg]
A ``searxng.msg`` file is a python file that is *executed* by the
:py:obj:`extract` function. Additional ``searxng.msg`` files can be added by:
1. Adding a ``searxng.msg`` file in one of the SearXNG python packages and
2. implement a method in :py:obj:`extract` that yields messages from this file.
.. _Babel: https://babel.pocoo.org/en/latest/index.html
"""
from os import path
SEARXNG_MSG_FILE = "searxng.msg"
_MSG_FILES = [path.join(path.dirname(__file__), SEARXNG_MSG_FILE)]
def extract(
# pylint: disable=unused-argument
fileobj,
keywords,
comment_tags,
options,
):
"""Extract messages from ``searxng.msg`` files by a custom extractor_.
.. _extractor:
https://babel.pocoo.org/en/latest/messages.html#writing-extraction-methods
"""
if fileobj.name not in _MSG_FILES:
raise RuntimeError("don't know how to extract messages from %s" % fileobj.name)
namespace = {}
exec(fileobj.read(), {}, namespace) # pylint: disable=exec-used
for obj_name in namespace['__all__']:
obj = namespace[obj_name]
if isinstance(obj, list):
for msg in obj:
# (lineno, funcname, message, comments)
yield 0, '_', msg, [f"{obj_name}"]
elif isinstance(obj, dict):
for k, msg in obj.items():
yield 0, '_', msg, [f"{obj_name}['{k}']"]
else:
raise ValueError(f"{obj_name} should be list or dict")

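For illustration, a sketch of what such a ``searxng.msg`` file may look like;
the names and messages here are assumptions, what matters to :py:obj:`extract`
is the ``__all__`` list plus lists or dicts of strings:

.. code:: python

    # hypothetical searxng.msg content -- executed by extract(), not imported
    __all__ = ["CONSTANT_NAMES", "CATEGORY_NAMES"]

    # a list yields one message per item
    CONSTANT_NAMES = [
        "DEFAULT_GROUP_NAME",
    ]

    # a dict yields one message per value
    CATEGORY_NAMES = {
        "FILES": "files",
        "GENERAL": "general",
    }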
22
searx/botdetection/__init__.py Normal file

@@ -0,0 +1,22 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. _botdetection src:
Implementations used for bot detection.
"""
from ._helpers import dump_request
from ._helpers import get_real_ip
from ._helpers import get_network
from ._helpers import too_many_requests
__all__ = ['dump_request', 'get_network', 'get_real_ip', 'too_many_requests']
redis_client = None
cfg = None
def init(_cfg, _redis_client):
global redis_client, cfg # pylint: disable=global-statement
redis_client = _redis_client
cfg = _cfg

137
searx/botdetection/_helpers.py Normal file

@@ -0,0 +1,137 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, invalid-name
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
IPv4Address,
IPv6Address,
ip_network,
ip_address,
)
import flask
import werkzeug
from searx import logger
from searx.extended_types import SXNG_Request
from . import config
logger = logger.getChild('botdetection')
def dump_request(request: SXNG_Request):
return (
request.path
+ " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
+ " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ " || Content-Type: %s" % request.headers.get('Content-Type')
+ " || Content-Length: %s" % request.headers.get('Content-Length')
+ " || Connection: %s" % request.headers.get('Connection')
+ " || User-Agent: %s" % request.headers.get('User-Agent')
+ " || Sec-Fetch-Site: %s" % request.headers.get('Sec-Fetch-Site')
+ " || Sec-Fetch-Mode: %s" % request.headers.get('Sec-Fetch-Mode')
+ " || Sec-Fetch-Dest: %s" % request.headers.get('Sec-Fetch-Dest')
)
def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
"""Returns a HTTP 429 response object and writes a ERROR message to the
'botdetection' logger. This function is used in part by the filter methods
to return the default ``Too Many Requests`` response.
"""
logger.debug("BLOCK %s: %s", network.compressed, log_msg)
return flask.make_response(('Too Many Requests', 429))
def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network:
"""Returns the (client) network of whether the real_ip is part of."""
if real_ip.version == 6:
prefix = cfg['real_ip.ipv6_prefix']
else:
prefix = cfg['real_ip.ipv4_prefix']
network = ip_network(f"{real_ip}/{prefix}", strict=False)
# logger.debug("get_network(): %s", network.compressed)
return network
_logged_errors = []
def _log_error_only_once(err_msg):
if err_msg not in _logged_errors:
logger.error(err_msg)
_logged_errors.append(err_msg)
def get_real_ip(request: SXNG_Request) -> str:
"""Returns real IP of the request. Since not all proxies set all the HTTP
headers and incoming headers can be faked it may happen that the IP cannot
be determined correctly.
.. sidebar:: :py:obj:`flask.Request.remote_addr`
SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).
This function tries to get the remote IP in the order listed below;
additionally, some consistency tests are done and if inconsistencies or
errors are detected, they are logged.
The remote IP of the request is taken from (first match):
- X-Forwarded-For_ header
- `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
- :py:obj:`flask.Request.remote_addr`
.. _ProxyFix:
https://werkzeug.palletsprojects.com/middleware/proxy_fix/
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
forwarded_for = request.headers.get("X-Forwarded-For")
real_ip = request.headers.get('X-Real-IP')
remote_addr = request.remote_addr
# logger.debug(
# "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr
# )
if not forwarded_for:
_log_error_only_once("X-Forwarded-For header is not set!")
else:
from . import cfg # pylint: disable=import-outside-toplevel, cyclic-import
forwarded_for = [x.strip() for x in forwarded_for.split(',')]
x_for: int = cfg['real_ip.x_for'] # type: ignore
forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]
if not real_ip:
_log_error_only_once("X-Real-IP header is not set!")
if forwarded_for and real_ip and forwarded_for != real_ip:
logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)
if forwarded_for and remote_addr and forwarded_for != remote_addr:
logger.warning(
"IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
)
if real_ip and remote_addr and real_ip != remote_addr:
logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)
request_ip = ip_address(forwarded_for or real_ip or remote_addr or '0.0.0.0')
if request_ip.version == 6 and request_ip.ipv4_mapped:
request_ip = request_ip.ipv4_mapped
# logger.debug("get_real_ip() -> %s", request_ip)
return str(request_ip)

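The network aggregation done by ``get_network()`` can be sketched with the
ipaddress primitives it builds on; the prefix lengths stand in for the
``real_ip.ipv4_prefix`` / ``real_ip.ipv6_prefix`` settings:

.. code:: python

    from ipaddress import ip_address, ip_network

    real_ip = ip_address("203.0.113.7")

    # a /32 keeps the single address, a shorter prefix groups nearby clients
    print(ip_network(f"{real_ip}/32", strict=False))  # 203.0.113.7/32
    print(ip_network(f"{real_ip}/24", strict=False))  # 203.0.113.0/24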
381
searx/botdetection/config.py Normal file

@@ -0,0 +1,381 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Configuration class :py:class:`Config` with deep-update, schema validation
and deprecated names.
The :py:class:`Config` class implements a configuration that is based on
structured dictionaries. The configuration schema is defined in a dictionary
structure and the configuration data is given in a dictionary structure.
"""
from __future__ import annotations
from typing import Any
import copy
import typing
import logging
import pathlib
from ..compat import tomllib
__all__ = ['Config', 'UNSET', 'SchemaIssue']
log = logging.getLogger(__name__)
class FALSE:
"""Class of ``False`` singleton"""
# pylint: disable=multiple-statements
def __init__(self, msg):
self.msg = msg
def __bool__(self):
return False
def __str__(self):
return self.msg
__repr__ = __str__
UNSET = FALSE('<UNSET>')
class SchemaIssue(ValueError):
"""Exception to store and/or raise a message from a schema issue."""
def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
self.level = level
super().__init__(msg)
def __str__(self):
return f"[cfg schema {self.level}] {self.args[0]}"
class Config:
"""Base class used for configuration"""
UNSET = UNSET
@classmethod
def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
# init schema
log.debug("load schema file: %s", schema_file)
cfg = cls(cfg_schema=toml_load(schema_file), deprecated=deprecated)
if not cfg_file.exists():
log.warning("missing config file: %s", cfg_file)
return cfg
# load configuration
log.debug("load config file: %s", cfg_file)
upd_cfg = toml_load(cfg_file)
is_valid, issue_list = cfg.validate(upd_cfg)
for msg in issue_list:
log.error(str(msg))
if not is_valid:
raise TypeError(f"schema of {cfg_file} is invalid!")
cfg.update(upd_cfg)
return cfg
def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
"""Constructor of class Config.
:param cfg_schema: Schema of the configuration
:param deprecated: dictionary that maps deprecated configuration names to messages
These values are needed for validation, see :py:obj:`validate`.
"""
self.cfg_schema = cfg_schema
self.deprecated = deprecated
self.cfg = copy.deepcopy(cfg_schema)
def __getitem__(self, key: str) -> Any:
return self.get(key)
def validate(self, cfg: dict):
"""Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`.
Validation is done by :py:obj:`validate`."""
return validate(self.cfg_schema, cfg, self.deprecated)
def update(self, upd_cfg: dict):
"""Update this configuration by ``upd_cfg``."""
dict_deepupdate(self.cfg, upd_cfg)
def default(self, name: str):
"""Returns default value of field ``name`` in ``self.cfg_schema``."""
return value(name, self.cfg_schema)
def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any:
"""Returns the value to which ``name`` points in the configuration.
If there is no such ``name`` in the config and the ``default`` is
:py:obj:`UNSET`, a :py:obj:`KeyError` is raised.
"""
parent = self._get_parent_dict(name)
val = parent.get(name.split('.')[-1], UNSET)
if val is UNSET:
if default is UNSET:
raise KeyError(name)
val = default
if replace and isinstance(val, str):
val = val % self
return val
def set(self, name: str, val):
"""Set the value to which ``name`` points in the configuration.
If there is no such ``name`` in the config, a :py:obj:`KeyError` is
raised.
"""
parent = self._get_parent_dict(name)
parent[name.split('.')[-1]] = val
def _get_parent_dict(self, name):
parent_name = '.'.join(name.split('.')[:-1])
if parent_name:
parent = value(parent_name, self.cfg)
else:
parent = self.cfg
if (parent is UNSET) or (not isinstance(parent, dict)):
raise KeyError(parent_name)
return parent
def path(self, name: str, default=UNSET):
"""Get a :py:class:`pathlib.Path` object from a config string."""
val = self.get(name, default)
if val is UNSET:
if default is UNSET:
raise KeyError(name)
return default
return pathlib.Path(str(val))
def pyobj(self, name, default=UNSET):
"""Get python object referred by full qualiffied name (FQN) in the config
string."""
fqn = self.get(name, default)
if fqn is UNSET:
if default is UNSET:
raise KeyError(name)
return default
(modulename, name) = str(fqn).rsplit('.', 1)
m = __import__(modulename, {}, {}, [name], 0)
return getattr(m, name)
def toml_load(file_name):
try:
with open(file_name, "rb") as f:
return tomllib.load(f)
except tomllib.TOMLDecodeError as exc:
msg = str(exc).replace('\t', '').replace('\n', ' ')
log.error("%s: %s", file_name, msg)
raise
# working with dictionaries
def value(name: str, data_dict: dict):
"""Returns the value to which ``name`` points in the ``dat_dict``.
.. code: python
>>> data_dict = {
"foo": {"bar": 1 },
"bar": {"foo": 2 },
"foobar": [1, 2, 3],
}
>>> value('foobar', data_dict)
[1, 2, 3]
>>> value('foo.bar', data_dict)
1
>>> value('foo.bar.xxx', data_dict)
<UNSET>
"""
ret_val = data_dict
for part in name.split('.'):
if isinstance(ret_val, dict):
ret_val = ret_val.get(part, UNSET)
if ret_val is UNSET:
break
return ret_val
def validate(
schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
) -> typing.Tuple[bool, list]:
"""Deep validation of dictionary in ``data_dict`` against dictionary in
``schema_dict``. Argument deprecated is a dictionary that maps deprecated
configuration names to messages::
deprecated = {
"foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
"..." : "..."
}
The function returns a python tuple ``(is_valid, issue_list)``:
``is_valid``:
A bool value indicating ``data_dict`` is valid or not.
``issue_list``:
A list of messages (:py:obj:`SchemaIssue`) from the validation::
[schema warn] data_dict: deprecated 'fontlib.foo': <DEPRECATED['foo.bar']>
[schema invalid] data_dict: key unknown 'fontlib.foo'
[schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ...
If ``schema_dict`` or ``data_dict`` is not a dictionary type a
:py:obj:`SchemaIssue` is raised.
"""
names = []
is_valid = True
issue_list = []
if not isinstance(schema_dict, dict):
raise SchemaIssue('invalid', "schema_dict is not a dict type")
if not isinstance(data_dict, dict):
raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type")
is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated)
return is_valid, issue_list
def _validate(
names: typing.List,
issue_list: typing.List,
schema_dict: typing.Dict,
data_dict: typing.Dict,
deprecated: typing.Dict[str, str],
) -> typing.Tuple[bool, typing.List]:
is_valid = True
for key, data_value in data_dict.items():
names.append(key)
name = '.'.join(names)
deprecated_msg = deprecated.get(name)
# print("XXX %s: key %s // data_value: %s" % (name, key, data_value))
if deprecated_msg:
issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))
schema_value = value(name, schema_dict)
# print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value))
if schema_value is UNSET:
if not deprecated_msg:
issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
is_valid = False
elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck
issue_list.append(
SchemaIssue(
'invalid',
(f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
)
)
is_valid = False
elif isinstance(data_value, dict):
_valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
is_valid = is_valid and _valid
names.pop()
return is_valid, issue_list
def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
"""Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.
For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:
0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a
:py:obj:`TypeError`.
1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.
2. If ``base_dict[upd_key]`` does not exist: set ``base_dict[upd_key]`` from a
(deep-) copy of ``upd_val``.
3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
list in ``upd_val``.
4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
``upd_val``.
"""
# pylint: disable=too-many-branches
if not isinstance(base_dict, dict):
raise TypeError("argument 'base_dict' is not a dictionary type")
if not isinstance(upd_dict, dict):
raise TypeError("argument 'upd_dict' is not a dictionary type")
if names is None:
names = []
for upd_key, upd_val in upd_dict.items():
# For each upd_key & upd_val pair in upd_dict:
if isinstance(upd_val, dict):
if upd_key in base_dict:
# if base_dict[upd_key] exists, recursively deep-update it
if not isinstance(base_dict[upd_key], dict):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict")
dict_deepupdate(
base_dict[upd_key],
upd_val,
names
+ [
upd_key,
],
)
else:
# if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val
base_dict[upd_key] = copy.deepcopy(upd_val)
elif isinstance(upd_val, list):
if upd_key in base_dict:
# if base_dict[upd_key] exists, base_dict[up_key] is extended by
# the list from upd_val
if not isinstance(base_dict[upd_key], list):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict")
base_dict[upd_key].extend(upd_val)
else:
# if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the
# list in upd_val.
base_dict[upd_key] = copy.deepcopy(upd_val)
elif isinstance(upd_val, set):
if upd_key in base_dict:
# if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val
if not isinstance(base_dict[upd_key], set):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict")
base_dict[upd_key].update(upd_val.copy())
else:
# if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the
# set in upd_val
base_dict[upd_key] = upd_val.copy()
else:
# for any other type of upd_val replace or add base_dict[upd_key] by a copy
# of upd_val
base_dict[upd_key] = copy.copy(upd_val)

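A short sketch of the deep-update semantics described above; the dictionaries
are made up:

.. code:: python

    base = {"server": {"port": 8888, "bind": "127.0.0.1"}, "plugins": ["a"]}
    upd = {"server": {"port": 7777}, "plugins": ["b"]}

    dict_deepupdate(base, upd)
    # nested dicts are merged, lists are extended:
    # {'server': {'port': 7777, 'bind': '127.0.0.1'}, 'plugins': ['a', 'b']}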
38
searx/botdetection/http_accept.py Normal file

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_accept``
----------------------
The ``http_accept`` method evaluates a request as the request of a bot if the
Accept_ header ..
- did not contain ``text/html``
.. _Accept:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config, # pylint: disable=unused-argument
) -> werkzeug.Response | None:
if 'text/html' not in request.accept_mimetypes:
return too_many_requests(network, "HTTP header Accept did not contain text/html")
return None

40
searx/botdetection/http_accept_encoding.py Normal file

@@ -0,0 +1,40 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_accept_encoding``
-------------------------------
The ``http_accept_encoding`` method evaluates a request as the request of a
bot if the Accept-Encoding_ header ..
- did not contain ``gzip`` AND ``deflate`` (i.e. both values are missing)
.. _Accept-Encoding:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config, # pylint: disable=unused-argument
) -> werkzeug.Response | None:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
return None

35
searx/botdetection/http_accept_language.py Normal file

@@ -0,0 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_accept_language``
-------------------------------
The ``http_accept_language`` method evaluates a request as the request of a bot
if the Accept-Language_ header is unset.
.. _Accept-Language:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config, # pylint: disable=unused-argument
) -> werkzeug.Response | None:
if request.headers.get('Accept-Language', '').strip() == '':
return too_many_requests(network, "missing HTTP header Accept-Language")
return None

36
searx/botdetection/http_connection.py Normal file

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_connection``
--------------------------
The ``http_connection`` method evaluates a request as the request of a bot if
the Connection_ header is set to ``close``.
.. _Connection:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import too_many_requests
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config, # pylint: disable=unused-argument
) -> werkzeug.Response | None:
if request.headers.get('Connection', '').strip() == 'close':
return too_many_requests(network, "HTTP header 'Connection=close")
return None

103
searx/botdetection/http_sec_fetch.py Normal file

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_sec_fetch``
-------------------------
The ``http_sec_fetch`` method protects resources from web attacks with `Fetch
Metadata`_. A request is filtered out in case of:
- http header Sec-Fetch-Mode_ is invalid
- http header Sec-Fetch-Dest_ is invalid
.. _Fetch Metadata:
https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
.. _Sec-Fetch-Dest:
https://developer.mozilla.org/en-US/docs/Web/API/Request/destination
.. _Sec-Fetch-Mode:
https://developer.mozilla.org/en-US/docs/Web/API/Request/mode
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import re
import flask
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import logger
def is_browser_supported(user_agent: str) -> bool:
"""Check if the browser supports Sec-Fetch headers.
https://caniuse.com/mdn-http_headers_sec-fetch-dest
https://caniuse.com/mdn-http_headers_sec-fetch-mode
https://caniuse.com/mdn-http_headers_sec-fetch-site
Supported browsers:
- Chrome >= 80
- Firefox >= 90
- Safari >= 16.4
- Edge (mirrors Chrome)
- Opera (mirrors Chrome)
"""
user_agent = user_agent.lower()
# Chrome/Chromium/Edge/Opera
chrome_match = re.search(r'chrome/(\d+)', user_agent)
if chrome_match:
version = int(chrome_match.group(1))
return version >= 80
# Firefox
firefox_match = re.search(r'firefox/(\d+)', user_agent)
if firefox_match:
version = int(firefox_match.group(1))
return version >= 90
# Safari
safari_match = re.search(r'version/(\d+)\.(\d+)', user_agent)
if safari_match:
major = int(safari_match.group(1))
minor = int(safari_match.group(2))
return major > 16 or (major == 16 and minor >= 4)
return False
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config,
) -> werkzeug.Response | None:
# Only check Sec-Fetch headers for supported browsers
user_agent = request.headers.get('User-Agent', '')
if is_browser_supported(user_agent):
val = request.headers.get("Sec-Fetch-Mode", "")
if val not in ('navigate', 'cors'):
logger.debug("invalid Sec-Fetch-Mode '%s'", val)
return flask.redirect(flask.url_for('index'), code=302)
val = request.headers.get("Sec-Fetch-Site", "")
if val not in ('same-origin', 'same-site', 'none'):
logger.debug("invalid Sec-Fetch-Site '%s'", val)
return flask.redirect(flask.url_for('index'), code=302)
val = request.headers.get("Sec-Fetch-Dest", "")
if val not in ('document', 'empty'):
logger.debug("invalid Sec-Fetch-Dest '%s'", val)
return flask.redirect(flask.url_for('index'), code=302)
return None

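A quick sketch of the version gate; the User-Agent strings are shortened
examples:

.. code:: python

    ua_chrome = "Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0 Safari/537.36"
    ua_old_ff = "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Firefox/78.0"

    print(is_browser_supported(ua_chrome))  # True  -- Chrome >= 80
    print(is_browser_supported(ua_old_ff))  # False -- Firefox < 90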
66
searx/botdetection/http_user_agent.py Normal file

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_user_agent``
--------------------------
The ``http_user_agent`` method evaluates a request as the request of a bot if
the User-Agent_ header is unset or matches the regular expression
:py:obj:`USER_AGENT`.
.. _User-Agent:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
"""
from __future__ import annotations
import re
from ipaddress import (
IPv4Network,
IPv6Network,
)
import werkzeug
from searx.extended_types import SXNG_Request
from . import config
from ._helpers import too_many_requests
USER_AGENT = (
r'('
+ r'unknown'
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ r'|ZmEu|BLEXBot|bitlybot|HeadlessChrome'
# unmaintained Farside instances
+ r'|'
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
# other bots and client to block
+ '|.*PetalBot.*'
+ r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""
_regexp = None
def regexp_user_agent():
global _regexp # pylint: disable=global-statement
if not _regexp:
_regexp = re.compile(USER_AGENT)
return _regexp
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config, # pylint: disable=unused-argument
) -> werkzeug.Response | None:
user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent):
return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
return None

149
searx/botdetection/ip_limit.py Normal file

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. _botdetection.ip_limit:
Method ``ip_limit``
-------------------
The ``ip_limit`` method counts requests from an IP in *sliding windows*. If
there are too many requests in a sliding window, the request is evaluated as a
bot request. This method requires a redis DB and needs an HTTP X-Forwarded-For_
header. To preserve privacy, only the hash value of an IP is stored in the
redis DB, and only for a maximum of 10 minutes.
The :py:obj:`.link_token` method can be used to investigate whether a request is
*suspicious*. To activate the :py:obj:`.link_token` method in the
:py:obj:`.ip_limit` method add the following configuration:
.. code:: toml
[botdetection.ip_limit]
link_token = true
If the :py:obj:`.link_token` method is activated and a request is *suspicious*
the request rates are reduced:
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
To intercept bots that get their IPs from a range of IPs, there is a
:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
for a longer time. IPs stored in this sliding window have a maximum of
:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
makes a request that is not suspicious, the sliding window for this IP is
dropped.
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)
import flask
import werkzeug
from searx.extended_types import SXNG_Request
from searx import redisdb
from searx.redislib import incr_sliding_window, drop_counter
from . import link_token
from . import config
from ._helpers import (
too_many_requests,
logger,
)
logger = logger.getChild('ip_limit')
BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""
BURST_MAX = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""
BURST_MAX_SUSPICIOUS = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""
LONG_WINDOW = 600
"""Time (sec) before the longer sliding window expires."""
LONG_MAX = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""
LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
API_WINDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""
API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""
SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
"""Time (sec) before sliding window for one suspicious IP expires."""
SUSPICIOUS_IP_MAX = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
def filter_request(
network: IPv4Network | IPv6Network,
request: SXNG_Request,
cfg: config.Config,
) -> werkzeug.Response | None:
# pylint: disable=too-many-return-statements
redis_client = redisdb.client()
if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
return None
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
if c > API_MAX:
return too_many_requests(network, "too many request in API_WINDOW")
if cfg['botdetection.ip_limit.link_token']:
suspicious = link_token.is_suspicious(network, request, True)
if not suspicious:
# this IP is no longer suspicious: release ip again / delete the counter of this IP
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
return None
# this IP is suspicious: count requests from this IP
c = incr_sliding_window(
redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
)
if c > SUSPICIOUS_IP_MAX:
logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
response = flask.redirect(flask.url_for('index'), code=302)
response.headers["Cache-Control"] = "no-store, max-age=0"
return response
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
if c > BURST_MAX_SUSPICIOUS:
return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
if c > LONG_MAX_SUSPICIOUS:
return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
return None
# vanilla limiter without extensions counts BURST_MAX and LONG_MAX
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
if c > BURST_MAX:
return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)")
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
if c > LONG_MAX:
return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)")
return None

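The counting primitive ``incr_sliding_window`` comes from
:py:obj:`searx.redislib`. Purely as an illustration of the concept (an
assumption, not the actual redislib implementation), a sliding window over a
Redis sorted set could look like this:

.. code:: python

    import time

    def sliding_window_count(redis_client, key: str, window: int) -> int:
        """Illustrative sliding-window counter, NOT searx.redislib's code."""
        now = time.time()
        # drop members that fell out of the window
        redis_client.zremrangebyscore(key, 0, now - window)
        # record this request, its timestamp is the score
        redis_client.zadd(key, {str(now): now})
        redis_client.expire(key, window)
        # the remaining cardinality is the request count inside the window
        return redis_client.zcard(key)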
84
searx/botdetection/ip_lists.py Normal file

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. _botdetection.ip_lists:
Method ``ip_lists``
-------------------
The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and
:py:obj:`pass-lists <pass_ip>`.
.. code:: toml
[botdetection.ip_lists]
pass_ip = [
'167.235.158.251', # IPv4 of check.searx.space
'192.168.0.0/16', # IPv4 private network
'fe80::/10' # IPv6 linklocal
]
block_ip = [
'93.184.216.34', # IPv4 of example.org
'257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
]
"""
# pylint: disable=unused-argument
from __future__ import annotations
from typing import Tuple
from ipaddress import (
ip_network,
IPv4Address,
IPv6Address,
)
from . import config
from ._helpers import logger
logger = logger.getChild('ip_lists')
SEARXNG_ORG = [
# https://github.com/searxng/searxng/pull/2484#issuecomment-1576639195
'167.235.158.251', # IPv4 check.searx.space
'2a01:04f8:1c1c:8fc2::/64', # IPv6 check.searx.space
]
"""Passlist of IPs from the SearXNG organization, e.g. `check.searx.space`."""
def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
"""Checks if the IP on the subnet is in one of the members of the
``botdetection.ip_lists.pass_ip`` list.
"""
if cfg.get('botdetection.ip_lists.pass_searxng_org', default=True):
for net in SEARXNG_ORG:
net = ip_network(net, strict=False)
if real_ip.version == net.version and real_ip in net:
return True, f"IP matches {net.compressed} in SEARXNG_ORG list."
return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg)
def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
"""Checks if the IP on the subnet is in one of the members of the
``botdetection.ip_lists.block_ip`` list.
"""
block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg)
if block:
msg += " To remove IP from list, please contact the maintainer of the service."
return block, msg
def ip_is_subnet_of_member_in_list(
real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config
) -> Tuple[bool, str]:
for net in cfg.get(list_name, default=[]):
try:
net = ip_network(net, strict=False)
except ValueError:
logger.error("invalid IP %s in %s", net, list_name)
continue
if real_ip.version == net.version and real_ip in net:
return True, f"IP matches {net.compressed} in {list_name}."
return False, f"IP is not a member of an item in the f{list_name} list"

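The subnet test above reduces to plain ipaddress membership; as a sketch, with
an address from the pass-list example:

.. code:: python

    from ipaddress import ip_address, ip_network

    net = ip_network("192.168.0.0/16", strict=False)
    ip = ip_address("192.168.1.5")
    print(ip.version == net.version and ip in net)  # True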
154
searx/botdetection/link_token.py Normal file

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``link_token``
---------------------
The ``link_token`` method evaluates a request as :py:obj:`suspicious
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
client. By adding a random component (the token) to the URL, a bot can not send
a ping by requesting a static URL.
.. note::
This method requires a redis DB and needs an HTTP X-Forwarded-For_ header.
To make use of this method, a Flask URL route needs to be added:
.. code:: python
@app.route('/client<token>.css', methods=['GET', 'POST'])
def client_token(token=None):
link_token.ping(request, token)
return Response('', mimetype='text/css')
And in the HTML template from flask a stylesheet link is needed (the value of
``link_token`` comes from :py:obj:`get_token`):
.. code:: html
<link rel="stylesheet"
href="{{ url_for('client_token', token=link_token) }}"
type="text/css" >
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
ip_address,
)
import string
import random
from searx import logger
from searx import redisdb
from searx.redislib import secret_hash
from searx.extended_types import SXNG_Request
from ._helpers import (
get_network,
get_real_ip,
)
TOKEN_LIVE_TIME = 600
"""Lifetime (sec) of limiter's CSS token."""
PING_LIVE_TIME = 3600
"""Lifetime (sec) of the ping-key from a client (request)"""
PING_KEY = 'SearXNG_limiter.ping'
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
TOKEN_KEY = 'SearXNG_limiter.token'
"""Key for which the current token is stored in the DB"""
logger = logger.getChild('botdetection.link_token')
def is_suspicious(network: IPv4Network | IPv6Network, request: SXNG_Request, renew: bool = False):
"""Checks whether a valid ping is exists for this (client) network, if not
this request is rated as *suspicious*. If a valid ping exists and argument
``renew`` is ``True`` the expire time of this ping is reset to
:py:obj:`PING_LIVE_TIME`.
"""
redis_client = redisdb.client()
if not redis_client:
return False
ping_key = get_ping_key(network, request)
if not redis_client.get(ping_key):
logger.info("missing ping (IP: %s) / request: %s", network.compressed, ping_key)
return True
if renew:
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key)
return False
def ping(request: SXNG_Request, token: str):
"""This function is called by a request to URL ``/client<token>.css``. If
``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
"""
from . import redis_client, cfg # pylint: disable=import-outside-toplevel, cyclic-import
if not redis_client:
return
if not token_is_valid(token):
return
real_ip = ip_address(get_real_ip(request))
network = get_network(real_ip, cfg)
ping_key = get_ping_key(network, request)
logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key)
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
def get_ping_key(network: IPv4Network | IPv6Network, request: SXNG_Request) -> str:
"""Generates a hashed key that fits (more or less) to a *WEB-browser
session* in a network."""
return (
PING_KEY
+ "["
+ secret_hash(
network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '')
)
+ "]"
)
def token_is_valid(token) -> bool:
valid = token == get_token()
logger.debug("token is valid --> %s", valid)
return valid
def get_token() -> str:
"""Returns current token. If there is no currently active token a new token
is generated randomly and stored in the redis DB.
- :py:obj:`TOKEN_LIVE_TIME`
- :py:obj:`TOKEN_KEY`
"""
redis_client = redisdb.client()
if not redis_client:
# This function is also called when limiter is inactive / no redis DB
# (see render function in webapp.py)
return '12345678'
token = redis_client.get(TOKEN_KEY)
if token:
token = token.decode('UTF-8')
else:
token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
return token
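
A minimal sketch of how a limiter filter might combine the helpers above; the
``filter_request`` wrapper and its ``cfg`` argument are assumptions for
illustration, only ``is_suspicious``, ``get_real_ip`` and ``get_network`` come
from this package:

.. code:: python

   # hypothetical glue code, not part of this module
   from ipaddress import ip_address

   def filter_request(request: SXNG_Request, cfg):
       # map the client IP to its network and check for a valid ping
       network = get_network(ip_address(get_real_ip(request)), cfg)
       if is_suspicious(network, request, renew=True):
           return 429  # assumed convention: signal "Too Many Requests"
       return None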

420
searx/cache.py Normal file
View File

@@ -0,0 +1,420 @@
"""Implementation of caching solutions.
- :py:obj:`searx.cache.ExpireCache` and its :py:obj:`searx.cache.ExpireCacheCfg`
----
"""
from __future__ import annotations
__all__ = ["ExpireCacheCfg", "ExpireCacheStats", "ExpireCache", "ExpireCacheSQLite"]
import abc
from collections.abc import Iterator
import dataclasses
import datetime
import hashlib
import hmac
import os
import pickle
import sqlite3
import string
import tempfile
import time
import typing
import msgspec
from searx import sqlitedb
from searx import logger
from searx import get_setting
log = logger.getChild("cache")
class ExpireCacheCfg(msgspec.Struct): # pylint: disable=too-few-public-methods
"""Configuration of a :py:obj:`ExpireCache` cache."""
name: str
"""Name of the cache."""
db_url: str = ""
"""URL of the SQLite DB, the path to the database file. If unset a default
DB will be created in `/tmp/sxng_cache_{self.name}.db`"""
MAX_VALUE_LEN: int = 1024 * 10
"""Max length of a *serialized* value."""
MAXHOLD_TIME: int = 60 * 60 * 24 * 7 # 7 days
"""Hold time (default in sec.), after which a value is removed from the cache."""
MAINTENANCE_PERIOD: int = 60 * 60  # 1h
"""Maintenance period in seconds, used when :py:obj:`MAINTENANCE_MODE` is set
to ``auto``."""
MAINTENANCE_MODE: typing.Literal["auto", "off"] = "auto"
"""Type of maintenance mode
``auto``:
Maintenance is carried out automatically as part of the maintenance
intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.
``off``:
Maintenance is switched off and must be carried out by an external process
if required.
"""
password: bytes = get_setting("server.secret_key").encode() # type: ignore
"""Password used by :py:obj:`ExpireCache.secret_hash`.
The default password is taken from :ref:`secret_key <server.secret_key>`.
When the password is changed, the hashed keys in the cache can no longer be
used, which is why all values in the cache are deleted when the password is
changed.
"""
def __post_init__(self):
# if db_url is unset, use a default DB in /tmp/sxng_cache_{name}.db
if not self.db_url:
self.db_url = tempfile.gettempdir() + os.sep + f"sxng_cache_{ExpireCache.normalize_name(self.name)}.db"
@dataclasses.dataclass
class ExpireCacheStats:
"""Dataclass which provides information on the status of the cache."""
cached_items: dict[str, list[tuple[str, typing.Any, int]]]
"""Values in the cache mapped by context name.
.. code:: python

   {
       "context name": [
           ("foo key", "foo value", <expire>),
           ("bar key", "bar value", <expire>),
           # ...
       ],
       # ...
   }
"""
def report(self):
c_ctx = 0
c_kv = 0
lines = []
for ctx_name, kv_list in self.cached_items.items():
c_ctx += 1
if not kv_list:
lines.append(f"[{ctx_name:20s}] empty")
continue
for key, value, expire in kv_list:
valid_until = datetime.datetime.fromtimestamp(expire).strftime("%Y-%m-%d %H:%M:%S")
c_kv += 1
lines.append(f"[{ctx_name:20s}] {valid_until} {key:12}" f" --> ({type(value).__name__}) {value} ")
lines.append(f"Number of contexts: {c_ctx}")
lines.append(f"number of key/value pairs: {c_kv}")
return "\n".join(lines)
class ExpireCache(abc.ABC):
"""Abstract base class for the implementation of a key/value cache
with expire date."""
cfg: ExpireCacheCfg
hash_token = "hash_token"
@abc.abstractmethod
def set(self, key: str, value: typing.Any, expire: int | None, ctx: str | None = None) -> bool:
"""Set *key* to *value*. To set a timeout on key use argument
``expire`` (in sec.). If expire is unset the default is taken from
:py:obj:`ExpireCacheCfg.MAXHOLD_TIME`. After the timeout has expired,
the key will automatically be deleted.
The ``ctx`` argument specifies the context of the ``key``. A key is
only unique in its context.
The concrete implementations of this abstraction determine how the
context is mapped in the connected database. In SQL databases, for
example, the context is a DB table or in a Key/Value DB it could be
a prefix for the key.
If the context is not specified (the default is ``None``) then a
default context should be used, e.g. a default table for SQL databases
or a default prefix in a Key/Value DB.
"""
@abc.abstractmethod
def get(self, key: str, default=None, ctx: str | None = None) -> typing.Any:
"""Return *value* of *key*. If key is unset, ``None`` is returned."""
@abc.abstractmethod
def maintenance(self, force: bool = False, truncate: bool = False) -> bool:
"""Performs maintenance on the cache.
``force``:
Maintenance should be carried out even if the maintenance interval has
not yet been reached.
``truncate``:
Truncate the entire cache, which is necessary, for example, if the
password has changed.
"""
@abc.abstractmethod
def state(self) -> ExpireCacheStats:
"""Returns a :py:obj:`ExpireCacheStats`, which provides information
about the status of the cache."""
@staticmethod
def build_cache(cfg: ExpireCacheCfg) -> ExpireCache:
"""Factory to build a caching instance.
.. note::
Currently, only the SQLite adapter is available, but other database
types could be implemented in the future, e.g. a Valkey (Redis)
adapter.
"""
return ExpireCacheSQLite(cfg)
@staticmethod
def normalize_name(name: str) -> str:
"""Returns a normalized name that can be used as a file name or as a SQL
table name (is used, for example, to normalize the context name)."""
_valid = "-_." + string.ascii_letters + string.digits
return "".join([c for c in name if c in _valid])
def serialize(self, value: typing.Any) -> bytes:
dump: bytes = pickle.dumps(value)
return dump
def deserialize(self, value: bytes) -> typing.Any:
obj = pickle.loads(value)
return obj
def secret_hash(self, name: str | bytes) -> str:
"""Creates a hash of the argument ``name``. The hash value is formed
from the ``name`` combined with the :py:obj:`password
<ExpireCacheCfg.password>`. Can be used, for example, to make the
``key`` stored in the DB unreadable for third parties."""
if isinstance(name, str):
name = bytes(name, encoding='utf-8')
m = hmac.new(name + self.cfg.password, digestmod='sha256')
return m.hexdigest()
class ExpireCacheSQLite(sqlitedb.SQLiteAppl, ExpireCache):
"""Cache that manages key/value pairs in a SQLite DB. The DB model in the
SQLite DB is implemented in abstract class :py:obj:`SQLiteAppl
<searx.sqlitedb.SQLiteAppl>`.
The following configurations are required / supported:
- :py:obj:`ExpireCacheCfg.db_url`
- :py:obj:`ExpireCacheCfg.MAXHOLD_TIME`
- :py:obj:`ExpireCacheCfg.MAINTENANCE_PERIOD`
- :py:obj:`ExpireCacheCfg.MAINTENANCE_MODE`
"""
DB_SCHEMA = 1
# The key/value tables will be created on demand by self.create_table
DDL_CREATE_TABLES = {}
CACHE_TABLE_PREFIX = "CACHE-TABLE"
def __init__(self, cfg: ExpireCacheCfg):
"""An instance of the SQLite expire cache is build up from a
:py:obj:`config <ExpireCacheCfg>`."""
self.cfg = cfg
if cfg.db_url == ":memory:":
log.critical("don't use SQLite DB in :memory: in production!!")
super().__init__(cfg.db_url)
def init(self, conn: sqlite3.Connection) -> bool:
ret_val = super().init(conn)
if not ret_val:
return False
new = hashlib.sha256(self.cfg.password).hexdigest()
old = self.properties(self.hash_token)
if old != new:
if old is not None:
log.warning("[%s] hash token changed: truncate all cache tables", self.cfg.name)
self.maintenance(force=True, truncate=True)
self.properties.set(self.hash_token, new)
return True
def maintenance(self, force: bool = False, truncate: bool = False) -> bool:
if not force and int(time.time()) < self.next_maintenance_time:
# log.debug("no maintenance required yet, next maintenance interval is in the future")
return False
# Prevent parallel DB maintenance cycles from other DB connections
# (e.g. in multi thread or process environments).
self.properties.set("LAST_MAINTENANCE", "") # hint: this (also) sets the m_time of the property!
if truncate:
self.truncate_tables(self.table_names)
return True
# drop items by expire time stamp ..
expire = int(time.time())
with self.connect() as conn:
for table in self.table_names:
res = conn.execute(f"DELETE FROM {table} WHERE expire < ?", (expire,))
log.debug("deleted %s keys from table %s (expire date reached)", res.rowcount, table)
# Vacuuming the WALs
# https://www.theunterminatedstring.com/sqlite-vacuuming/
conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
conn.close()
return True
def create_table(self, table: str) -> bool:
"""Create DB ``table`` if it has not yet been created, no recreates are
initiated if the table already exists.
"""
if table in self.table_names:
# log.debug("key/value table %s exists in DB (no need to recreate)", table)
return False
log.info("key/value table '%s' NOT exists in DB -> create DB table ..", table)
sql_table = "\n".join(
[
f"CREATE TABLE IF NOT EXISTS {table} (",
" key TEXT,",
" value BLOB,",
f" expire INTEGER DEFAULT (strftime('%s', 'now') + {self.cfg.MAXHOLD_TIME}),",
"PRIMARY KEY (key))",
]
)
sql_index = f"CREATE INDEX IF NOT EXISTS index_expire_{table} ON {table}(expire);"
with self.connect() as conn:
conn.execute(sql_table)
conn.execute(sql_index)
conn.close()
self.properties.set(f"{self.CACHE_TABLE_PREFIX}-{table}", table)
return True
@property
def table_names(self) -> list[str]:
"""List of key/value tables already created in the DB."""
sql = f"SELECT value FROM properties WHERE name LIKE '{self.CACHE_TABLE_PREFIX}%%'"
rows = self.DB.execute(sql).fetchall() or []
return [r[0] for r in rows]
def truncate_tables(self, table_names: list[str]):
log.debug("truncate table: %s", ",".join(table_names))
with self.connect() as conn:
for table in table_names:
conn.execute(f"DELETE FROM {table}")
conn.close()
return True
@property
def next_maintenance_time(self) -> int:
"""Returns (unix epoch) time of the next maintenance."""
return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE", int(time.time()))
# implement ABC methods of ExpireCache
def set(self, key: str, value: typing.Any, expire: int | None, ctx: str | None = None) -> bool:
"""Set key/value in DB table given by argument ``ctx``. If expire is
unset the default is taken from :py:obj:`ExpireCacheCfg.MAXHOLD_TIME`.
If ``ctx`` argument is ``None`` (the default), a table name is
generated from the :py:obj:`ExpireCacheCfg.name`. If the DB table does not
exist, it will be created (on demand) by :py:obj:`self.create_table
<ExpireCacheSQLite.create_table>`.
"""
table = ctx
self.maintenance()
value = self.serialize(value=value)
if len(value) > self.cfg.MAX_VALUE_LEN:
log.warning("ExpireCache.set(): %s.key='%s' - value too big to cache (len: %s) ", table, value, len(value))
return False
if not expire:
expire = self.cfg.MAXHOLD_TIME
expire = int(time.time()) + expire
table_name = table
if not table_name:
table_name = self.normalize_name(self.cfg.name)
self.create_table(table_name)
sql = (
f"INSERT INTO {table_name} (key, value, expire) VALUES (?, ?, ?)"
f" ON CONFLICT DO "
f"UPDATE SET value=?, expire=?"
)
if table:
with self.DB:
self.DB.execute(sql, (key, value, expire, value, expire))
else:
with self.connect() as conn:
conn.execute(sql, (key, value, expire, value, expire))
conn.close()
return True
def get(self, key: str, default=None, ctx: str | None = None) -> typing.Any:
"""Get value of ``key`` from table given by argument ``ctx``. If
``ctx`` argument is ``None`` (the default), a table name is generated
from the :py:obj:`ExpireCacheCfg.name`. If ``key`` does not exist (in the
table), the ``default`` value is returned.
"""
table = ctx
self.maintenance()
if not table:
table = self.normalize_name(self.cfg.name)
if table not in self.table_names:
return default
sql = f"SELECT value FROM {table} WHERE key = ?"
row = self.DB.execute(sql, (key,)).fetchone()
if row is None:
return default
return self.deserialize(row[0])
def pairs(self, ctx: str) -> Iterator[tuple[str, typing.Any]]:
"""Iterate over key/value pairs from table given by argument ``ctx``.
If the ``ctx`` argument is falsy, a table name is generated from the
:py:obj:`ExpireCacheCfg.name`."""
table = ctx
self.maintenance()
if not table:
table = self.normalize_name(self.cfg.name)
if table in self.table_names:
for row in self.DB.execute(f"SELECT key, value FROM {table}"):
yield row[0], self.deserialize(row[1])
def state(self) -> ExpireCacheStats:
cached_items = {}
for table in self.table_names:
cached_items[table] = []
for row in self.DB.execute(f"SELECT key, value, expire FROM {table}"):
cached_items[table].append((row[0], self.deserialize(row[1]), row[2]))
return ExpireCacheStats(cached_items=cached_items)
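
A short usage sketch of the SQLite cache built by the factory above; the cache
name and key/value pairs are made up for illustration:

.. code:: python

   from searx.cache import ExpireCache, ExpireCacheCfg

   cache = ExpireCache.build_cache(ExpireCacheCfg(name="demo_cache"))
   cache.set(key="foo", value={"bar": 42}, expire=60)  # hold for 60 seconds
   assert cache.get("foo") == {"bar": 42}
   cache.get("unknown", default="n/a")  # -> "n/a"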

18
searx/compat.py Normal file
View File

@@ -0,0 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Compatibility with older versions"""
# pylint: disable=unused-import
__all__ = [
"tomllib",
]
import sys
# TOML (lib) compatibility
# ------------------------
if sys.version_info >= (3, 11):
import tomllib
else:
import tomli as tomllib
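
Usage is identical on both interpreter versions; a tiny sketch:

.. code:: python

   from searx.compat import tomllib

   cfg = tomllib.loads("[server]\nport = 8888\n")
   assert cfg["server"]["port"] == 8888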

81
searx/data/__init__.py Normal file
View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module holds the *data* created by::
make data.all
"""
from __future__ import annotations
__all__ = ["ahmia_blacklist_loader"]
import json
import typing
from .core import log, data_dir
from .currencies import CurrenciesDB
from .tracker_patterns import TrackerPatternsDB
CURRENCIES: CurrenciesDB
USER_AGENTS: dict[str, typing.Any]
EXTERNAL_URLS: dict[str, typing.Any]
WIKIDATA_UNITS: dict[str, typing.Any]
EXTERNAL_BANGS: dict[str, typing.Any]
OSM_KEYS_TAGS: dict[str, typing.Any]
ENGINE_DESCRIPTIONS: dict[str, typing.Any]
ENGINE_TRAITS: dict[str, typing.Any]
LOCALES: dict[str, typing.Any]
TRACKER_PATTERNS: TrackerPatternsDB
lazy_globals = {
"CURRENCIES": CurrenciesDB(),
"USER_AGENTS": None,
"EXTERNAL_URLS": None,
"WIKIDATA_UNITS": None,
"EXTERNAL_BANGS": None,
"OSM_KEYS_TAGS": None,
"ENGINE_DESCRIPTIONS": None,
"ENGINE_TRAITS": None,
"LOCALES": None,
"TRACKER_PATTERNS": TrackerPatternsDB(),
}
data_json_files = {
"USER_AGENTS": "useragents.json",
"EXTERNAL_URLS": "external_urls.json",
"WIKIDATA_UNITS": "wikidata_units.json",
"EXTERNAL_BANGS": "external_bangs.json",
"OSM_KEYS_TAGS": "osm_keys_tags.json",
"ENGINE_DESCRIPTIONS": "engine_descriptions.json",
"ENGINE_TRAITS": "engine_traits.json",
"LOCALES": "locales.json",
}
def __getattr__(name):
# lazy init of the global objects
if name not in lazy_globals:
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
data = lazy_globals[name]
if data is not None:
return data
log.debug("init searx.data.%s", name)
with open(data_dir / data_json_files[name], encoding='utf-8') as f:
lazy_globals[name] = json.load(f)
return lazy_globals[name]
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
searxng_extra/update/update_ahmia_blacklist.py
This function is used by :py:mod:`searx.plugins.ahmia_filter`.
"""
with open(data_dir / 'ahmia_blacklist.txt', encoding='utf-8') as f:
return f.read().split()
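
Module attributes are loaded on first access via ``__getattr__``; a small
sketch of the lazy behavior:

.. code:: python

   import searx.data

   # the first access triggers json.load() of useragents.json,
   # subsequent accesses return the already loaded dict
   ua = searx.data.USER_AGENTS
   assert ua is searx.data.USER_AGENTS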

47814
searx/data/ahmia_blacklist.txt Normal file

File diff suppressed because it is too large

29
searx/data/core.py Normal file
View File

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations
import pathlib
from searx import logger
from searx.cache import ExpireCacheCfg, ExpireCacheSQLite
log = logger.getChild("data")
data_dir = pathlib.Path(__file__).parent
_DATA_CACHE: ExpireCacheSQLite = None # type: ignore
def get_cache():
global _DATA_CACHE # pylint: disable=global-statement
if _DATA_CACHE is None:
_DATA_CACHE = ExpireCacheSQLite.build_cache(
ExpireCacheCfg(
name="DATA_CACHE",
# MAX_VALUE_LEN=1024 * 200, # max. 200kB length for a *serialized* value.
# MAXHOLD_TIME=60 * 60 * 24 * 7 * 4, # 4 weeks
)
)
return _DATA_CACHE
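
``get_cache()`` returns a process-wide singleton; a minimal sketch:

.. code:: python

   from searx.data.core import get_cache

   cache = get_cache()
   assert cache is get_cache()  # same instance on every call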

15863
searx/data/currencies.json Normal file

File diff suppressed because it is too large

55
searx/data/currencies.py Normal file
View File

@@ -0,0 +1,55 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Simple implementation to store currencies data in a SQL database."""
from __future__ import annotations
__all__ = ["CurrenciesDB"]
import json
import pathlib
from .core import get_cache, log
class CurrenciesDB:
# pylint: disable=missing-class-docstring
ctx_names = "data_currencies_names"
ctx_iso4217 = "data_currencies_iso4217"
json_file = pathlib.Path(__file__).parent / "currencies.json"
def __init__(self):
self.cache = get_cache()
def init(self):
if self.cache.properties("currencies loaded") != "OK":
self.load()
self.cache.properties.set("currencies loaded", "OK")
# F I X M E:
# do we need a maintenance .. remember: the database is stored
# in /tmp and will be rebuilt during the reboot anyway
def load(self):
log.debug("init searx.data.CURRENCIES")
with open(self.json_file, encoding="utf-8") as f:
data_dict = json.load(f)
for key, value in data_dict["names"].items():
self.cache.set(key=key, value=value, ctx=self.ctx_names, expire=None)
for key, value in data_dict["iso4217"].items():
self.cache.set(key=key, value=value, ctx=self.ctx_iso4217, expire=None)
def name_to_iso4217(self, name):
self.init()
ret_val = self.cache.get(key=name, default=name, ctx=self.ctx_names)
if isinstance(ret_val, list):
# if there are more alternatives, use the last one in the list
ret_val = ret_val[-1]
return ret_val
def iso4217_to_name(self, iso4217, language):
self.init()
iso4217_languages: dict = self.cache.get(key=iso4217, default={}, ctx=self.ctx_iso4217)
return iso4217_languages.get(language, iso4217)
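
A short sketch of the lookup API; the example values assume "euro" and "EUR"
are present in ``currencies.json`` (plausible, but not verified here):

.. code:: python

   from searx.data.currencies import CurrenciesDB

   db = CurrenciesDB()
   db.name_to_iso4217("euro")       # -> "EUR" (assumed data)
   db.iso4217_to_name("EUR", "en")  # -> "euro", falls back to "EUR"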

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

19076
searx/data/external_bangs.json Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,156 @@
{
"facebook_profile": {
"category_name": "Facebook",
"url_name": "Facebook profile",
"urls": {
"default": "https://facebook.com/$1"
}
},
"youtube_channel": {
"category_name": "YouTube",
"url_name": "YouTube channel",
"urls": {
"default": "https://www.youtube.com/channel/$1"
}
},
"youtube_video": {
"category_name": "YouTube",
"url_name": "YouTube video",
"urls": {
"default": "https://www.youtube.com/watch?v=$1"
}
},
"twitter_profile": {
"category_name": "Twitter",
"url_name": "Twitter profile",
"urls": {
"default": "https://twitter.com/$1"
}
},
"instagram_profile": {
"category_name": "Instagram",
"url_name": "Instagram profile",
"urls": {
"default": "https://www.instagram.com/$1"
}
},
"imdb_title": {
"category_name": "IMDB",
"url_name": "IMDB title",
"urls": {
"default": "https://www.imdb.com/title/$1"
}
},
"imdb_name": {
"category_name": "IMDB",
"url_name": "IMDB name",
"urls": {
"default": "https://www.imdb.com/name/$1"
}
},
"imdb_character": {
"category_name": "IMDB",
"url_name": "IMDB character",
"urls": {
"default": "https://www.imdb.com/character/$1"
}
},
"imdb_company": {
"category_name": "IMDB",
"url_name": "IMDB company",
"urls": {
"default": "https://www.imdb.com/company/$1"
}
},
"imdb_event": {
"category_name": "IMDB",
"url_name": "IMDB event",
"urls": {
"default": "https://www.imdb.com/event/$1"
}
},
"rotten_tomatoes": {
"category_name": "Rotten tomatoes",
"url_name": "Rotten tomatoes title",
"urls": {
"default": "https://www.rottentomatoes.com/$1"
}
},
"spotify_artist_id": {
"category_name": "Spotify",
"url_name": "Spotify artist",
"urls": {
"default": "https://open.spotify.com/artist/$1"
}
},
"itunes_artist_id": {
"category_name": "iTunes",
"url_name": "iTunes artist",
"urls": {
"default": "https://music.apple.com/us/artist/$1"
}
},
"soundcloud_id": {
"category_name": "Soundcloud",
"url_name": "Soundcloud artist",
"urls": {
"default": "https://soundcloud.com/$1"
}
},
"netflix_id": {
"category_name": "Netflix",
"url_name": "Netflix movie",
"urls": {
"default": "https://www.netflix.com/watch/$1"
}
},
"github_profile": {
"category_name": "Github",
"url_name": "Github profile",
"urls": {
"default": "https://wwww.github.com/$1"
}
},
"musicbrainz_artist": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz artist",
"urls": {
"default": "http://musicbrainz.org/artist/$1"
}
},
"musicbrainz_work": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz work",
"urls": {
"default": "http://musicbrainz.org/work/$1"
}
},
"musicbrainz_release_group": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz release group",
"urls": {
"default": "http://musicbrainz.org/release-group/$1"
}
},
"musicbrainz_label": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz label",
"urls": {
"default": "http://musicbrainz.org/label/$1"
}
},
"wikimedia_image": {
"category_name": "Wikipedia",
"url_name": "Wikipedia image",
"urls": {
"default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400"
}
},
"map": {
"category_name": "Map",
"url_name": "geo map",
"urls": {
"default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M"
}
}
}

BIN
searx/data/lid.176.ftz Executable file

Binary file not shown.

71
searx/data/locales.json Normal file
View File

@@ -0,0 +1,71 @@
{
"LOCALE_NAMES": {
"af": "Afrikaans",
"ar": "العربية (Arabic)",
"bg": "Български (Bulgarian)",
"bn": "বাংলা (Bangla)",
"bo": "བོད་སྐད་ (Tibetan)",
"ca": "Català (Catalan)",
"cs": "Čeština (Czech)",
"cy": "Cymraeg (Welsh)",
"da": "Dansk (Danish)",
"de": "Deutsch (German)",
"dv": "ދިވެހި (Dhivehi)",
"el-GR": "Ελληνικά, Ελλάδα (Greek, Greece)",
"en": "English",
"eo": "Esperanto",
"es": "Español (Spanish)",
"et": "Eesti (Estonian)",
"eu": "Euskara (Basque)",
"fa-IR": "فارسی, ایران (Persian, Iran)",
"fi": "Suomi (Finnish)",
"fil": "Filipino",
"fr": "Français (French)",
"ga": "Gaeilge (Irish)",
"gl": "Galego (Galician)",
"he": "עברית (Hebrew)",
"hr": "Hrvatski (Croatian)",
"hu": "Magyar (Hungarian)",
"ia": "Interlingua",
"id": "Indonesia (Indonesian)",
"it": "Italiano (Italian)",
"ja": "日本語 (Japanese)",
"ko": "한국어 (Korean)",
"lt": "Lietuvių (Lithuanian)",
"lv": "Latviešu (Latvian)",
"ml": "മലയാളം (Malayalam)",
"ms": "Melayu (Malay)",
"nb-NO": "Norsk bokmål, Norge (Norwegian bokmål, Norway)",
"nl": "Nederlands (Dutch)",
"nl-BE": "Nederlands, België (Dutch, Belgium)",
"oc": "Occitan",
"pa": "ਪੰਜਾਬੀ (Punjabi)",
"pap": "Papiamento",
"pl": "Polski (Polish)",
"pt": "Português (Portuguese)",
"pt-BR": "Português, Brasil (Portuguese, Brazil)",
"ro": "Română (Romanian)",
"ru": "Русский (Russian)",
"si": "සිංහල (Sinhala)",
"sk": "Slovenčina (Slovak)",
"sl": "Slovenščina (Slovenian)",
"sr": "Српски (Serbian)",
"sv": "Svenska (Swedish)",
"szl": "Ślōnski (Silesian)",
"ta": "தமிழ் (Tamil)",
"te": "తెలుగు (Telugu)",
"th": "ไทย (Thai)",
"tr": "Türkçe (Turkish)",
"tt": "Татар (Tatar)",
"uk": "Українська (Ukrainian)",
"vi": "Tiếng việt (Vietnamese)",
"zh-HK": "中文, 中國香港特別行政區 (Chinese, Hong Kong SAR China)",
"zh-Hans-CN": "中文, 中国 (Chinese, China)",
"zh-Hant-TW": "中文, 台灣 (Chinese, Taiwan)"
},
"RTL_LOCALES": [
"ar",
"fa-IR",
"he"
]
}

63227
searx/data/osm_keys_tags.json Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Simple implementation to store TrackerPatterns data in a SQL database."""
from __future__ import annotations
import typing
__all__ = ["TrackerPatternsDB"]
import re
import pathlib
from collections.abc import Iterator
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
import httpx
from searx.data.core import get_cache, log
RuleType = tuple[str, list[str], list[str]]
class TrackerPatternsDB:
# pylint: disable=missing-class-docstring
ctx_name = "data_tracker_patterns"
json_file = pathlib.Path(__file__).parent / "tracker_patterns.json"
CLEAR_LIST_URL = [
# ClearURL rule lists, the first one that responds HTTP 200 is used
"https://rules1.clearurls.xyz/data.minify.json",
"https://rules2.clearurls.xyz/data.minify.json",
"https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json",
]
class Fields:
# pylint: disable=too-few-public-methods, invalid-name
url_regexp: typing.Final = 0 # URL (regular expression) match condition of the link
url_ignore: typing.Final = 1 # URL (regular expression) to ignore
del_args: typing.Final = 2 # list of URL arguments (regular expression) to delete
def __init__(self):
self.cache = get_cache()
def init(self):
if self.cache.properties("tracker_patterns loaded") != "OK":
self.load()
self.cache.properties.set("tracker_patterns loaded", "OK")
# F I X M E:
# do we need a maintenance .. remember: the database is stored
# in /tmp and will be rebuilt during the reboot anyway
def load(self):
log.debug("init searx.data.TRACKER_PATTERNS")
for rule in self.iter_clear_list():
self.add(rule)
def add(self, rule: RuleType):
self.cache.set(
key=rule[self.Fields.url_regexp],
value=(
rule[self.Fields.url_ignore],
rule[self.Fields.del_args],
),
ctx=self.ctx_name,
expire=None,
)
def rules(self) -> Iterator[RuleType]:
self.init()
for key, value in self.cache.pairs(ctx=self.ctx_name):
yield key, value[0], value[1]
def iter_clear_list(self) -> Iterator[RuleType]:
resp = None
for url in self.CLEAR_LIST_URL:
resp = httpx.get(url, timeout=3)
if resp.status_code == 200:
break
log.warning(f"TRACKER_PATTERNS: ClearURL ignore HTTP {resp.status_code} {url}")
if resp is None:
log.error("TRACKER_PATTERNS: failed fetching ClearURL rule lists")
return
for rule in resp.json()["providers"].values():
yield (
rule["urlPattern"].replace("\\\\", "\\"), # fix javascript regex syntax
[exc.replace("\\\\", "\\") for exc in rule.get("exceptions", [])],
rule.get("rules", []),
)
def clean_url(self, url: str) -> bool | str:
"""The URL arguments are normalized and cleaned of tracker parameters.
Returns bool ``True`` to use URL unchanged (``False`` to ignore URL).
If URL should be modified, the returned string is the new URL to use.
"""
new_url = url
parsed_new_url = urlparse(url=new_url)
for rule in self.rules():
if not re.match(rule[self.Fields.url_regexp], new_url):
# no match / ignore pattern
continue
do_ignore = False
for pattern in rule[self.Fields.url_ignore]:
if re.match(pattern, new_url):
do_ignore = True
break
if do_ignore:
# pattern is in the list of exceptions / ignore pattern
# HINT:
# we can't break the outer pattern loop since we have
# overlapping urlPattern like ".*"
continue
# remove tracker arguments from the url-query part
query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query))
for name, val in query_args.copy():
# remove URL arguments
for pattern in rule[self.Fields.del_args]:
if re.match(pattern, name):
log.debug("TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val)
query_args.remove((name, val))
parsed_new_url = parsed_new_url._replace(query=urlencode(query_args))
new_url = urlunparse(parsed_new_url)
if new_url != url:
return new_url
return True
if __name__ == "__main__":
db = TrackerPatternsDB()
for r in db.rules():
print(r)
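
``clean_url()`` returns ``True`` to keep the URL unchanged, ``False`` to
ignore it, or a cleaned URL string; a sketch with a made-up tracking argument,
assuming the upstream ClearURLs rules strip ``utm_*`` parameters:

.. code:: python

   from searx.data.tracker_patterns import TrackerPatternsDB

   db = TrackerPatternsDB()
   # the rule lists are fetched from the ClearURL servers on first use
   cleaned = db.clean_url("https://example.org/?utm_source=x&q=1")
   # -> "https://example.org/?q=1" if a matching rule exists, else True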

View File

@@ -0,0 +1,11 @@
{
"os": [
"Windows NT 10.0; Win64; x64",
"X11; Linux x86_64"
],
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}",
"versions": [
"139.0",
"138.0"
]
}
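
This template is consumed by the user-agent helper in ``searx.utils``; a
stand-alone sketch of the same formatting (the file path and the use of
``random`` are illustrative):

.. code:: python

   import json
   import pathlib
   import random

   data = json.loads(pathlib.Path("searx/data/useragents.json").read_text())
   ua = data["ua"].format(
       os=random.choice(data["os"]),
       version=random.choice(data["versions"]),
   )
   # e.g. "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0"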

File diff suppressed because it is too large

299
searx/enginelib/__init__.py Normal file
View File

@@ -0,0 +1,299 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations of the framework for the SearXNG engines.
- :py:obj:`searx.enginelib.EngineCache`
- :py:obj:`searx.enginelib.Engine`
- :py:obj:`searx.enginelib.traits`
There is a command line for developer purposes and for deeper analysis. Here is
an example in which the command line is called in the development environment::
$ ./manage pyenv.cmd bash --norc --noprofile
(py3) python -m searx.enginelib --help
.. hint::
The long term goal is to modularize all implementations of the engine
framework here in this Python package. ToDo:
- move implementations of the :ref:`searx.engines loader` to a new module in
the :py:obj:`searx.enginelib` namespace.
-----
"""
from __future__ import annotations
__all__ = ["EngineCache", "Engine", "ENGINES_CACHE"]
from typing import List, Callable, TYPE_CHECKING, Any
import string
import typer
from ..cache import ExpireCache, ExpireCacheCfg
if TYPE_CHECKING:
from searx.enginelib import traits
ENGINES_CACHE = ExpireCache.build_cache(
ExpireCacheCfg(
name="ENGINES_CACHE",
MAXHOLD_TIME=60 * 60 * 24 * 7, # 7 days
MAINTENANCE_PERIOD=60 * 60,  # 1h
)
)
"""Global :py:obj:`searx.cache.ExpireCacheSQLite` instance where the cached
values from all engines are stored. The `MAXHOLD_TIME` is 7 days and the
`MAINTENANCE_PERIOD` is set to one hour."""
app = typer.Typer()
@app.command()
def state():
"""Show state for the caches of the engines."""
title = "cache tables and key/values"
print(title)
print("=" * len(title))
print(ENGINES_CACHE.state().report())
print()
title = f"properties of {ENGINES_CACHE.cfg.name}"
print(title)
print("=" * len(title))
print(str(ENGINES_CACHE.properties)) # type: ignore
@app.command()
def maintenance(force: bool = True):
"""Carry out maintenance on cache of the engines."""
ENGINES_CACHE.maintenance(force=force)
class EngineCache:
"""Persistent (SQLite) key/value cache that deletes its values again after
``expire`` seconds (default/max: :py:obj:`MAXHOLD_TIME
<searx.cache.ExpireCacheCfg.MAXHOLD_TIME>`). This class is a wrapper around
:py:obj:`ENGINES_CACHE` (:py:obj:`ExpireCacheSQLite
<searx.cache.ExpireCacheSQLite>`).
In the :origin:`searx/engines/demo_offline.py` engine you can find an
exemplary implementation of such a cache; other examples are implemented
in:
- :origin:`searx/engines/radio_browser.py`
- :origin:`searx/engines/soundcloud.py`
- :origin:`searx/engines/startpage.py`
.. code:: python

   from searx.enginelib import EngineCache

   CACHE: EngineCache

   def init(engine_settings):
       global CACHE
       CACHE = EngineCache(engine_settings["name"])

   def request(query, params):
       token = CACHE.get(key="token")
       if token is None:
           token = get_token()
           # cache token of this engine for 1h
           CACHE.set(key="token", value=token, expire=3600)
       ...
For introspection of the DB, jump into the developer environment and run the
command below to show the cache state::
$ ./manage pyenv.cmd bash --norc --noprofile
(py3) python -m searx.enginelib cache state
cache tables and key/values
===========================
[demo_offline ] 2025-04-22 11:32:50 count --> (int) 4
[startpage ] 2025-04-22 12:32:30 SC_CODE --> (str) fSOBnhEMlDfE20
[duckduckgo ] 2025-04-22 12:32:31 4dff493e.... --> (str) 4-128634958369380006627592672385352473325
[duckduckgo ] 2025-04-22 12:40:06 3e2583e2.... --> (str) 4-263126175288871260472289814259666848451
[radio_browser ] 2025-04-23 11:33:08 servers --> (list) ['https://de2.api.radio-browser.info', ...]
[soundcloud ] 2025-04-29 11:40:06 guest_client_id --> (str) EjkRJG0BLNEZquRiPZYdNtJdyGtTuHdp
[wolframalpha ] 2025-04-22 12:40:06 code --> (str) 5aa79f86205ad26188e0e26e28fb7ae7
number of tables: 6
number of key/value pairs: 7
In the "cache tables and key/values" section, the table name (engine name) is at
first position on the second there is the calculated expire date and on the
third and fourth position the key/value is shown.
About duckduckgo: The *vqd code* of ddg depends on the query term and therefore
the key is a hash value of the query term (so as not to store the raw query term).
In the "properties of ENGINES_CACHE" section all properties of the SQLiteAppl /
ExpireCache and their last modification date are shown::
properties of ENGINES_CACHE
===========================
[last modified: 2025-04-22 11:32:27] DB_SCHEMA : 1
[last modified: 2025-04-22 11:32:27] LAST_MAINTENANCE :
[last modified: 2025-04-22 11:32:27] crypt_hash : ca612e3566fdfd7cf7efe...
[last modified: 2025-04-22 11:32:30] CACHE-TABLE--demo_offline: demo_offline
[last modified: 2025-04-22 11:32:30] CACHE-TABLE--startpage: startpage
[last modified: 2025-04-22 11:32:31] CACHE-TABLE--duckduckgo: duckduckgo
[last modified: 2025-04-22 11:33:08] CACHE-TABLE--radio_browser: radio_browser
[last modified: 2025-04-22 11:40:06] CACHE-TABLE--soundcloud: soundcloud
[last modified: 2025-04-22 11:40:06] CACHE-TABLE--wolframalpha: wolframalpha
These properties provide information about the state of the ExpireCache and
control the behavior. For example, the maintenance intervals are controlled by
the last modification date of the LAST_MAINTENANCE property and the hash value
of the password can be used to detect whether the password has been changed (in
this case the DB entries can no longer be decrypted and the entire cache must be
discarded).
"""
def __init__(self, engine_name: str, expire: int | None = None):
self.expire = expire or ENGINES_CACHE.cfg.MAXHOLD_TIME
_valid = "-_." + string.ascii_letters + string.digits
self.table_name = "".join([c if c in _valid else "_" for c in engine_name])
def set(self, key: str, value: Any, expire: int | None = None) -> bool:
return ENGINES_CACHE.set(
key=key,
value=value,
expire=expire or self.expire,
ctx=self.table_name,
)
def get(self, key: str, default=None) -> Any:
return ENGINES_CACHE.get(key, default=default, ctx=self.table_name)
def secret_hash(self, name: str | bytes) -> str:
return ENGINES_CACHE.secret_hash(name=name)
class Engine: # pylint: disable=too-few-public-methods
"""Class of engine instances build from YAML settings.
Further documentation see :ref:`general engine configuration`.
.. hint::
This class is currently never initialized and only used for type hinting.
"""
# Common options in the engine module
engine_type: str
"""Type of the engine (:ref:`searx.search.processors`)"""
paging: bool
"""Engine supports multiple pages."""
time_range_support: bool
"""Engine supports search time range."""
safesearch: bool
"""Engine supports SafeSearch"""
language_support: bool
"""Engine supports languages (locales) search."""
language: str
"""For an engine, when there is ``language: ...`` in the YAML settings the engine
does support only this one language:
.. code:: yaml
- name: google french
engine: google
language: fr
"""
region: str
"""For an engine, when there is ``region: ...`` in the YAML settings the engine
does support only this one region::
.. code:: yaml
- name: google belgium
engine: google
region: fr-BE
"""
fetch_traits: Callable
"""Function to to fetch engine's traits from origin."""
traits: traits.EngineTraits
"""Traits of the engine."""
# settings.yml
categories: List[str]
"""Specifies to which :ref:`engine categories` the engine should be added."""
name: str
"""Name that will be used across SearXNG to define this engine. In settings, on
the result page .."""
engine: str
"""Name of the python file used to handle requests and responses to and from
this search engine (file name from :origin:`searx/engines` without
``.py``)."""
enable_http: bool
"""Enable HTTP (by default only HTTPS is enabled)."""
shortcut: str
"""Code used to execute bang requests (``!foo``)"""
timeout: float
"""Specific timeout for search-engine."""
display_error_messages: bool
"""Display error messages on the web UI."""
proxies: dict
"""Set proxies for a specific engine (YAML):
.. code:: yaml

   proxies:
     http: socks5://proxy:port
     https: socks5://proxy:port
"""
disabled: bool
"""To disable by default the engine, but not deleting it. It will allow the
user to manually activate it in the settings."""
inactive: bool
"""Remove the engine from the settings (*disabled & removed*)."""
about: dict
"""Additional fields describing the engine.
.. code:: yaml

   about:
     website: https://example.com
     wikidata_id: Q306656
     official_api_documentation: https://example.com/api-doc
     use_official_api: true
     require_api_key: true
     results: HTML
"""
using_tor_proxy: bool
"""Using tor proxy (``true``) or not (``false``) for this engine."""
send_accept_language_header: bool
"""When this option is activated, the language (locale) that is selected by
the user is used to build and send an ``Accept-Language`` header in the
request to the origin search engine."""
tokens: List[str]
"""A list of secret tokens to make this engine *private*, more details see
:ref:`private engines`."""
weight: int
"""Weighting of the results of this engine (:ref:`weight <settings engines>`)."""

View File

@@ -0,0 +1,21 @@
"""Implementation of a command line for development purposes. To start a
command, switch to the environment and run library module as a script::
$ ./manage pyenv.cmd bash --norc --noprofile
(py3) python -m searx.enginelib --help
The following commands can be used for maintenance and introspection
(development) of the engine cache::
(py3) python -m searx.enginelib cache state
(py3) python -m searx.enginelib cache maintenance
"""
import typer
from .. import enginelib
app = typer.Typer()
app.add_typer(enginelib.app, name="cache", help="Commands related to the cache of the engines.")
app()

264
searx/enginelib/traits.py Normal file
View File

@@ -0,0 +1,264 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine's traits are fetched from the origin engines and stored in a JSON file
in the *data folder*. Most often traits are languages and region codes and
their mapping from SearXNG's representation to the representation in the origin
search engine. For new traits new properties can be added to the class
:py:class:`EngineTraits`.
To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be
used.
"""
from __future__ import annotations
import os
import json
import dataclasses
import types
from typing import Dict, Literal, Iterable, Union, Callable, Optional, TYPE_CHECKING
from searx import locales
from searx.data import data_dir, ENGINE_TRAITS
if TYPE_CHECKING:
from . import Engine
class EngineTraitsEncoder(json.JSONEncoder):
"""Encodes :class:`EngineTraits` to a serializable object, see
:class:`json.JSONEncoder`."""
def default(self, o):
"""Return dictionary of a :class:`EngineTraits` object."""
if isinstance(o, EngineTraits):
return o.__dict__
return super().default(o)
@dataclasses.dataclass
class EngineTraits:
"""The class is intended to be instantiated for each engine."""
regions: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a region to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
sent to the engine:
.. code:: python

   regions = {
       'fr-BE': <engine's region name>,
   }

   for key, engine_region in regions.items():
       searxng_region = babel.Locale.parse(key, sep='-')
       ...
"""
languages: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a language to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
sent to the engine:
.. code:: python

   languages = {
       'ca': <engine's language name>,
   }

   for key, engine_lang in languages.items():
       searxng_lang = babel.Locale.parse(key)
       ...
"""
all_locale: Optional[str] = None
"""To which locale value SearXNG's ``all`` language is mapped (shown a "Default
language").
"""
data_type: Literal['traits_v1'] = 'traits_v1'
"""Data type, default is 'traits_v1'.
"""
custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict)
"""A place to store engine's custom traits, not related to the SearXNG core.
"""
def get_language(self, searxng_locale: str, default=None):
"""Return engine's language string that *best fits* to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default language
The *best fits* rules are implemented in
:py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj:`EngineTraits.all_locale`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.languages, default=default)
def get_region(self, searxng_locale: str, default=None):
"""Return engine's region string that best fits to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default region
The *best fits* rules are implemented in
:py:obj:`searx.locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj:`EngineTraits.all_locale`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.regions, default=default)
def is_locale_supported(self, searxng_locale: str) -> bool:
"""A *locale* (SearXNG's internal representation) is considered to be
supported by the engine if the *region* or the *language* is supported
by the engine.
For verification the functions :py:func:`EngineTraits.get_region` and
:py:func:`EngineTraits.get_language` are used.
"""
if self.data_type == 'traits_v1':
return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale))
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def copy(self):
"""Create a copy of the dataclass object."""
return EngineTraits(**dataclasses.asdict(self))
@classmethod
def fetch_traits(cls, engine: Engine) -> Union['EngineTraits', None]:
"""Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch
and set properties from the origin engine in the object ``engine_traits``. If
function does not exists, ``None`` is returned.
"""
fetch_traits = getattr(engine, 'fetch_traits', None)
engine_traits = None
if fetch_traits:
engine_traits = cls()
fetch_traits(engine_traits)
return engine_traits
def set_traits(self, engine: Engine):
"""Set traits from self object in a :py:obj:`.Engine` namespace.
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
"""
if self.data_type == 'traits_v1':
self._set_traits_v1(engine)
else:
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def _set_traits_v1(self, engine: Engine):
# For an engine, when there is `language: ...` in the YAML settings the engine
# does support only this one language (region)::
#
# - name: google italian
# engine: google
# language: it
# region: it-IT # type: ignore
traits = self.copy()
_msg = "settings.yml - engine: '%s' / %s: '%s' not supported"
languages = traits.languages
if hasattr(engine, 'language'):
if engine.language not in languages:
raise ValueError(_msg % (engine.name, 'language', engine.language))
traits.languages = {engine.language: languages[engine.language]}
regions = traits.regions
if hasattr(engine, 'region'):
if engine.region not in regions:
raise ValueError(_msg % (engine.name, 'region', engine.region))
traits.regions = {engine.region: regions[engine.region]}
engine.language_support = bool(traits.languages or traits.regions)
# set the copied & modified traits in engine's namespace
engine.traits = traits
class EngineTraitsMap(Dict[str, EngineTraits]):
"""A python dictionary to map :class:`EngineTraits` by engine name."""
ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
"""File with persistence of the :py:obj:`EngineTraitsMap`."""
def save_data(self):
"""Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`"""
with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f:
json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder)
@classmethod
def from_data(cls) -> 'EngineTraitsMap':
"""Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
obj = cls()
for k, v in ENGINE_TRAITS.items():
obj[k] = EngineTraits(**v)
return obj
@classmethod
def fetch_traits(cls, log: Callable) -> 'EngineTraitsMap':
from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel
names = list(engines.engines)
names.sort()
obj = cls()
for engine_name in names:
engine = engines.engines[engine_name]
traits = None
# pylint: disable=broad-exception-caught
try:
traits = EngineTraits.fetch_traits(engine)
except Exception as exc:
log("FATAL: while fetch_traits %s: %s" % (engine_name, exc))
if os.environ.get('FORCE', '').lower() not in ['on', 'true', '1']:
raise
v = ENGINE_TRAITS.get(engine_name)
if v:
log("FORCE: re-use old values from fetch_traits - ENGINE_TRAITS[%s]" % engine_name)
traits = EngineTraits(**v)
if traits is not None:
log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages)))
log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions)))
obj[engine_name] = traits
return obj
def set_traits(self, engine: Engine | types.ModuleType):
"""Set traits in a :py:obj:`Engine` namespace.
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
"""
engine_traits = EngineTraits(data_type='traits_v1')
if engine.name in self.keys():
engine_traits = self[engine.name]
elif engine.engine in self.keys():
# The key of the dictionary traits_map is the *engine name*
# configured in settings.yml. When multiple engines are configured
# in settings.yml to use the same origin engine (python module),
# these additional engines can use the languages from the origin
# engine. For this, use the configured ``engine: ...`` from
# settings.yml
engine_traits = self[engine.engine]
engine_traits.set_traits(engine)
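
A minimal sketch of loading the persisted traits and querying them; the engine
name "wikipedia" is assumed to be present in ``engine_traits.json``:

.. code:: python

   from searx.enginelib.traits import EngineTraitsMap

   traits_map = EngineTraitsMap.from_data()
   traits = traits_map.get("wikipedia")  # EngineTraits or None
   if traits:
       # best-fit engine language for a SearXNG locale
       traits.get_language("fr-BE", default="en")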

56
searx/engines/1337x.py Normal file
View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""1337x
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://1337x.to/',
"wikidata_id": 'Q28134166',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
categories = ['files']
paging = True
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
filesize = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
results.append(
{
'url': href,
'title': title,
'seed': seed,
'leech': leech,
'filesize': filesize,
'template': 'torrent.html',
}
)
return results

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search search engine for searxng"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# Metadata
about = {
"website": "https://www.so.com/",
"wikidata_id": "Q10846064",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
"language": "zh",
}
# Engine Configuration
categories = ["general"]
paging = True
time_range_support = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# Base URL
base_url = "https://www.so.com"
def request(query, params):
query_params = {
"pn": params["pageno"],
"q": query,
}
if time_range_dict.get(params['time_range']):
query_params["adv_t"] = time_range_dict.get(params['time_range'])
params["url"] = f"{base_url}/s?{urlencode(query_params)}"
return params
def response(resp):
dom = html.fromstring(resp.text)
results = []
for item in dom.xpath('//li[contains(@class, "res-list")]'):
title = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a'))
url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@data-mdurl'))
if not url:
url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@href'))
content = extract_text(item.xpath('.//p[@class="res-desc"]'))
if not content:
content = extract_text(item.xpath('.//span[@class="res-list-summary"]'))
if title and url:
results.append(
{
"title": title,
"url": url,
"content": content,
}
)
return results

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search-Videos: A search engine for retrieving videos from 360Search."""
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text, get_embeded_stream_url
about = {
"website": "https://tv.360kan.com/",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
paging = True
results_per_page = 10
categories = ["videos"]
base_url = "https://tv.360kan.com"
def request(query, params):
query_params = {"count": 10, "q": query, "start": params["pageno"] * 10}
params["url"] = f"{base_url}/v1/video/list?{urlencode(query_params)}"
return params
def response(resp):
try:
data = resp.json()
except Exception as e:
raise SearxEngineAPIException(f"Invalid response: {e}") from e
results = []
if "data" not in data or "result" not in data["data"]:
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["result"]:
if not entry.get("title") or not entry.get("play_url"):
continue
published_date = None
if entry.get("publish_time"):
try:
published_date = datetime.fromtimestamp(int(entry["publish_time"]))
except (ValueError, TypeError):
published_date = None
results.append(
{
'url': entry["play_url"],
'title': html_to_text(entry["title"]),
'content': html_to_text(entry["description"]),
'template': 'videos.html',
'publishedDate': published_date,
'thumbnail': entry["cover_img"],
"iframe_src": get_embeded_stream_url(entry["play_url"]),
}
)
return results

76
searx/engines/9gag.py Normal file
View File

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""9GAG (social media)"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
about = {
"website": 'https://9gag.com/',
"wikidata_id": 'Q277421',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
paging = True
search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10
def request(query, params):
query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
json_results = loads(resp.text)['data']
for result in json_results['posts']:
result_type = result['type']
# Get the uncropped version of the thumbnail when the image height is not too large
if result['images']['image700']['height'] > 400:
thumbnail = result['images']['imageFbThumbnail']['url']
else:
thumbnail = result['images']['image700']['url']
if result_type == 'Photo':
results.append(
{
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.fromtimestamp(result['creationTs']),
'img_src': result['images']['image700']['url'],
'thumbnail_src': thumbnail,
}
)
elif result_type == 'Animated':
results.append(
{
'template': 'videos.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.fromtimestamp(result['creationTs']),
'thumbnail': thumbnail,
'iframe_src': result['images'].get('image460sv', {}).get('url'),
}
)
if 'tags' in json_results:
for suggestion in json_results['tags']:
results.append({'suggestion': suggestion['key']})
return results

253
searx/engines/__init__.py Normal file
View File

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Load and initialize the ``engines``, see :py:func:`load_engines` and register
:py:obj:`engine_shortcuts`.
usage::
load_engines( settings['engines'] )
"""
from __future__ import annotations
import sys
import copy
from os.path import realpath, dirname
from typing import TYPE_CHECKING, Dict
import types
import inspect
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DEFAULT_ARGS = {
# Common options in the engine module
"engine_type": "online",
"paging": False,
"time_range_support": False,
"safesearch": False,
# settings.yml
"categories": ["general"],
"enable_http": False,
"shortcut": "-",
"timeout": settings["outgoing"]["request_timeout"],
"display_error_messages": True,
"disabled": False,
"inactive": False,
"about": {},
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
"max_page": 0,
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []}
engines: Dict[str, Engine | types.ModuleType] = {}
engine_shortcuts = {}
"""Simple map of registered *shortcuts* to name of the engine (or ``None``).
::
engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
"""
def check_engine_module(module: types.ModuleType):
# probe unintentional name collisions / for example name collisions caused
# by import statements in the engine module ..
# network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
obj = getattr(module, 'network', None)
if obj and inspect.ismodule(obj):
msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string'
# logger.error(msg)
raise TypeError(msg)
def load_engine(engine_data: dict) -> Engine | types.ModuleType | None:
"""Load engine from ``engine_data``.
:param dict engine_data: Attributes from YAML ``settings:engines/<engine>``
:return: initialized namespace of the ``<engine>``.
1. create a namespace and load module of the ``<engine>``
2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
3. update namespace with values from ``engine_data``
If engine *is active*, return namespace of the engine, otherwise return
``None``.
This function also returns ``None`` if initialization of the namespace fails
for one of the following reasons:
- the engine name contains an underscore
- a required attribute is not set (:py:func:`is_missing_required_attributes`)
An engine name that is not lowercase is converted to lowercase (with a
warning); this does not abort the initialization.
"""
# pylint: disable=too-many-return-statements
engine_name = engine_data.get('name')
if engine_name is None:
logger.error('An engine does not have a "name" field')
return None
if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None
if engine_name.lower() != engine_name:
logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
engine_name = engine_name.lower()
engine_data['name'] = engine_name
# load_module
module_name = engine_data.get('engine')
if module_name is None:
logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
return None
try:
engine = load_module(module_name + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
logger.exception('Fatal exception in engine "{}"'.format(module_name))
sys.exit(1)
except BaseException:
logger.exception('Cannot load engine "{}"'.format(module_name))
return None
check_engine_module(engine)
update_engine_attributes(engine, engine_data)
update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine):
return None
if is_missing_required_attributes(engine):
return None
set_loggers(engine, engine_name)
if not any(cat in settings['categories_as_tabs'] for cat in engine.categories):
engine.categories.append(DEFAULT_CATEGORY)
return engine
def set_loggers(engine, engine_name):
# set the logger for engine
engine.logger = logger.getChild(engine_name)
# the engine may have loaded some other engines
# make sure their logger is initialized
# use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
# see https://github.com/python/cpython/issues/89516
# and https://docs.python.org/3.10/library/sys.html#sys.modules
modules = sys.modules.copy()
for module_name, module in modules.items():
if (
module_name.startswith("searx.engines")
and module_name != "searx.engines.__init__"
and not hasattr(module, "logger")
):
module_engine_name = module_name.split(".")[-1]
module.logger = logger.getChild(module_engine_name) # type: ignore
def update_engine_attributes(engine: Engine | types.ModuleType, engine_data):
# set engine attributes from engine_data
for param_name, param_value in engine_data.items():
if param_name == 'categories':
if isinstance(param_value, str):
param_value = list(map(str.strip, param_value.split(',')))
engine.categories = param_value # type: ignore
elif hasattr(engine, 'about') and param_name == 'about':
engine.about = {**engine.about, **engine_data['about']} # type: ignore
else:
setattr(engine, param_name, param_value)
# set default attributes
for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items():
if not hasattr(engine, arg_name):
setattr(engine, arg_name, copy.deepcopy(arg_value))
def update_attributes_for_tor(engine: Engine | types.ModuleType):
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore
engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore
def is_missing_required_attributes(engine):
"""An attribute is required when its name doesn't start with ``_`` (underline).
Required attributes must not be ``None``.
"""
missing = False
for engine_attr in dir(engine):
if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
missing = True
return missing
def using_tor_proxy(engine: Engine | types.ModuleType):
"""Return True if the engine configuration declares to use Tor."""
return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False)
def is_engine_active(engine: Engine | types.ModuleType):
# check if engine is inactive
if engine.inactive is True:
return False
# exclude onion engines if not using tor
if 'onions' in engine.categories and not using_tor_proxy(engine):
return False
return True
def register_engine(engine: Engine | types.ModuleType):
if engine.name in engines:
logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
sys.exit(1)
engines[engine.name] = engine
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
def load_engines(engine_list):
"""usage: ``engine_list = settings['engines']``"""
engines.clear()
engine_shortcuts.clear()
categories.clear()
categories['general'] = []
for engine_data in engine_list:
engine = load_engine(engine_data)
if engine:
register_engine(engine)
return engines
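# Usage sketch (illustrative -- assumes the SearXNG settings are initialized
# and that 'ddg' is a configured shortcut):
#
#   from searx import settings
#   from searx.engines import load_engines, engines, engine_shortcuts
#
#   load_engines(settings['engines'])
#   name = engine_shortcuts.get('ddg')
#   engine = engines.get(name) if name else None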

109
searx/engines/acfun.py Normal file

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""
from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html
from searx.utils import extract_text
# Metadata
about = {
"website": "https://www.acfun.cn/",
"wikidata_id": "Q3077675",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
"language": "zh",
}
# Engine Configuration
categories = ["videos"]
paging = True
# Base URL
base_url = "https://www.acfun.cn"
def request(query, params):
query_params = {"keyword": query, "pCursor": params["pageno"]}
params["url"] = f"{base_url}/search?{urlencode(query_params)}"
return params
def response(resp):
results = []
matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
if not matches:
return results
for match in matches:
try:
json_data = json.loads(match)
raw_html = json_data.get("html", "")
if not raw_html:
continue
tree = html.fromstring(raw_html)
video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
if not video_blocks:
continue
for video_block in video_blocks:
video_info = extract_video_data(video_block)
if video_info and video_info["title"] and video_info["url"]:
results.append(video_info)
except json.JSONDecodeError:
continue
return results
def extract_video_data(video_block):
try:
data_exposure_log = video_block.get('data-exposure-log')
video_data = json.loads(data_exposure_log)
content_id = video_data.get("content_id", "")
title = video_data.get("title", "")
url = f"{base_url}/v/ac{content_id}"
iframe_src = f"{base_url}/player/ac{content_id}"
create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')[0])
video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))
published_date = None
if create_time:
try:
published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
except (ValueError, TypeError):
pass
length = None
if video_duration:
try:
timediff = datetime.strptime(video_duration.strip(), "%M:%S")
length = timedelta(minutes=timediff.minute, seconds=timediff.second)
except (ValueError, TypeError):
pass
return {
"title": title,
"url": url,
"content": video_intro,
"thumbnail": video_cover,
"length": length,
"publishedDate": published_date,
"iframe_src": iframe_src,
}
except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
return None


@@ -0,0 +1,229 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
assets. Assets types include photos, vectors, illustrations, templates, 3D
assets, videos, motion graphics templates and audio tracks.
.. Adobe Stock: https://stock.adobe.com/
Configuration
=============
The engine has the following mandatory settings:
- SearXNG's :ref:`engine categories`
- Adobe-Stock's :py:obj:`adobe_order`
- Adobe-Stock's :py:obj:`adobe_content_types`
.. code:: yaml
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: [images]
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
- name: adobe stock video
engine: adobe_stock
network: adobe stock
    shortcut: asv
categories: [videos]
adobe_order: relevance
adobe_content_types: ["video"]
Implementation
==============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import isodate
if TYPE_CHECKING:
import logging
logger: logging.Logger
about = {
"website": "https://stock.adobe.com/",
"wikidata_id": "Q5977430",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
categories = []
paging = True
send_accept_language_header = True
results_per_page = 10
base_url = "https://stock.adobe.com"
adobe_order: str = ""
"""Sort order, can be one of:
- ``relevance`` or
- ``featured`` or
- ``creation`` (most recent) or
- ``nb_downloads`` (number of downloads)
"""
ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types: list = []
"""A list of of content types. The following content types are offered:
- Images: ``image``
- Videos: ``video``
- Templates: ``template``
- 3D: ``3d``
- Audio ``audio``
Additional subcategories:
- Photos: ``photo``
- Illustrations: ``illustration``
- Vectors: ``zip_vector``
"""
# Do we need support for "free_collection" and "include_stock_enterprise"?
def init(_):
if not categories:
raise ValueError("adobe_stock engine: categories is unset")
# adobe_order
if not adobe_order:
raise ValueError("adobe_stock engine: adobe_order is unset")
if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
raise ValueError(f"unsupported adobe_order: {adobe_order}")
# adobe_content_types
if not adobe_content_types:
raise ValueError("adobe_stock engine: adobe_content_types is unset")
if isinstance(adobe_content_types, list):
for t in adobe_content_types:
if t not in ADOBE_VALID_TYPES:
raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
else:
raise ValueError(
"adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
)
def request(query, params):
args = {
"k": query,
"limit": results_per_page,
"order": adobe_order,
"search_page": params["pageno"],
"search_type": "pagination",
}
for content_type in ADOBE_VALID_TYPES:
args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0
params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
# headers required to bypass bot-detection
if params["searxng_locale"] == "all":
params["headers"]["Accept-Language"] = "en-US,en;q=0.5"
return params
def parse_image_item(item):
return {
"template": "images.html",
"url": item["content_url"],
"title": item["title"],
"content": item["asset_type"],
"img_src": item["content_thumb_extra_large_url"],
"thumbnail_src": item["thumbnail_url"],
"resolution": f"{item['content_original_width']}x{item['content_original_height']}",
"img_format": item["format"],
"author": item["author"],
}
def parse_video_item(item):
# in video items, the title is more or less a "content description", we try
# to reduce the length of the title ..
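    # e.g. (illustrative) "Aerial view of a city. Drone footage at sunset." is
    # shortened to the title "Aerial view of a city"; the full text moves to
    # the content field.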
title = item["title"]
content = ""
if "." in title.strip()[:-1]:
content = title
title = title.split(".", 1)[0]
elif "," in title:
content = title
title = title.split(",", 1)[0]
elif len(title) > 50:
content = title
title = ""
for w in content.split(" "):
title += f" {w}"
if len(title) > 50:
title = title.strip() + "\u2026"
break
return {
"template": "videos.html",
"url": item["content_url"],
"title": title,
"content": content,
# https://en.wikipedia.org/wiki/ISO_8601#Durations
"length": isodate.parse_duration(item["time_duration"]),
"publishedDate": datetime.fromisoformat(item["creation_date"]),
"thumbnail": item["thumbnail_url"],
"iframe_src": item["video_small_preview_url"],
"metadata": item["asset_type"],
}
def parse_audio_item(item):
audio_data = item["audio_data"]
content = audio_data.get("description") or ""
if audio_data.get("album"):
content = audio_data["album"] + " - " + content
return {
"url": item["content_url"],
"title": item["title"],
"content": content,
# "thumbnail": base_url + item["thumbnail_url"],
"iframe_src": audio_data["preview"]["url"],
"publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
"length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
"author": item.get("artist_name"),
}
def response(resp):
results = []
json_resp = resp.json()
if isinstance(json_resp["items"], list):
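        # no results -- in this case "items" is (apparently) an empty list
        # instead of a dict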
return None
for item in json_resp["items"].values():
if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
result = parse_image_item(item)
elif item["asset_type"].lower() == "video":
result = parse_video_item(item)
elif item["asset_type"].lower() == "audio":
result = parse_audio_item(item)
else:
logger.error("no handle for %s --> %s", item["asset_type"], item)
continue
results.append(result)
return results

80
searx/engines/ahmia.py Normal file

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
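Results are onion services, so this engine is only active when SearXNG routes
outgoing requests over Tor (see ``using_tor_proxy`` in ``searx.engines``). An
illustrative ``settings.yml`` entry (the shortcut is an example):
.. code:: yaml
  - name: ahmia
    engine: ahmia
    shortcut: ah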
"""
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
# about
about = {
"website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
"wikidata_id": 'Q18693938',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine config
categories = ['onions']
paging = True
page_size = 10
# search url
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}
# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
if params['time_range'] in time_range_dict:
params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
return params
def response(resp):
results = []
dom = fromstring(resp.text)
    # trim the results so there are not way too many at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index : first_result_index + page_size]
# get results
for result in trimmed_results:
# remove ahmia url and extract the actual url for the result
raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})
# get spelling corrections
for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)})
# get number of results
number_of_results = eval_xpath(dom, number_of_results_xpath)
if number_of_results:
try:
results.append({'number_of_results': int(extract_text(number_of_results))})
except: # pylint: disable=bare-except
pass
return results


@@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Alpine Linux binary packages`_. `Alpine Linux`_ is a Linux-based operation
system designed to be small, simple and secure. Contrary to many other Linux
distributions, it uses musl, BusyBox and OpenRC. Alpine is mostly used on
servers and for Docker images.
.. _Alpine Linux binary packages: https://pkgs.alpinelinux.org
.. _Alpine Linux: https://www.alpinelinux.org
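A query may optionally name one of the supported architectures; e.g. the
(illustrative) query ``bash aarch64`` searches ``*bash*`` packages built for
``aarch64``. The architecture token is stripped from the query (see
``ARCH_RE`` below); without one, :py:obj:`alpine_arch` is used.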
"""
import re
from urllib.parse import urlencode
from lxml import html
from dateutil import parser
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.alpinelinux.org',
'wikidata_id': 'Q4033826',
'use_official_api': False,
'official_api_documentation': None,
'require_api_key': False,
'results': 'HTML',
}
paging = True
categories = ['packages', 'it']
base_url = "https://pkgs.alpinelinux.org"
alpine_arch = 'x86_64'
"""Kernel architecture: ``x86_64``, ``x86``, ``aarch64``, ``armhf``,
``ppc64le``, ``s390x``, ``armv7`` or ``riscv64``"""
ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")
"""Regular expression to match supported architectures in the query string."""
def request(query, params):
query_arch = ARCH_RE.search(query)
if query_arch:
query_arch = query_arch.group(0)
query = query.replace(query_arch, '').strip()
args = {
# use wildcards to match more than just packages with the exact same
# name as the query
'name': f"*{query}*",
'page': params['pageno'],
'arch': query_arch or alpine_arch,
}
params['url'] = f"{base_url}/packages?{urlencode(args)}"
return params
def response(resp):
results = []
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, "//table/tbody/tr"):
if len(result.xpath("./td")) < 9:
# skip non valid entries in the result table
# e.g the "No item found..." message
continue
results.append(
{
'template': 'packages.html',
'url': base_url + extract_text(eval_xpath(result, './td[contains(@class, "package")]/a/@href')),
'title': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'package_name': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'publishedDate': parser.parse(extract_text(eval_xpath(result, './td[contains(@class, "bdate")]'))),
'version': extract_text(eval_xpath(result, './td[contains(@class, "version")]')),
'homepage': extract_text(eval_xpath(result, './td[contains(@class, "url")]/a/@href')),
'maintainer': extract_text(eval_xpath(result, './td[contains(@class, "maintainer")]')),
'license_name': extract_text(eval_xpath(result, './td[contains(@class, "license")]')),
'tags': [extract_text(eval_xpath(result, './td[contains(@class, "repo")]'))],
}
)
return results


@@ -0,0 +1,202 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).
.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
Configuration
=============
The engine has the following additional settings:
- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive, for example an engine to
search for the *newest* articles and journals (PDF) via the shortcut
``!aaa <search-term>``.
.. code:: yaml
- name: annas articles
engine: annas_archive
shortcut: aaa
aa_content: 'magazine'
aa_ext: 'pdf'
aa_sort: 'newest'
Implementations
===============
"""
from typing import List, Dict, Any, Optional
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
# about
about: Dict[str, Any] = {
"website": "https://annas-archive.org/",
"wikidata_id": "Q115288326",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# engine dependent config
categories: List[str] = ["files"]
paging: bool = True
# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anan's search form field **Content** / possible values::
book_fiction, book_unknown, book_nonfiction,
book_comic, magazine, standards_document
To disable filtering, use an empty string (default).
"""
aa_sort: str = ''
"""Sort Anna's results, possible values::
newest, oldest, largest, smallest
To sort by *most relevant* use an empty string (default)."""
aa_ext: str = ''
"""Filter Anna's results by a file ending. Common filters for example are
``pdf`` and ``epub``.
.. note::
Anna's Archive is a beta release: Filter results by file extension does not
really work on Anna's Archive.
"""
def init(engine_settings=None): # pylint: disable=unused-argument
"""Check of engine's settings."""
traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
if aa_content and aa_content not in traits.custom['content']:
raise ValueError(f'invalid setting content: {aa_content}')
if aa_sort and aa_sort not in traits.custom['sort']:
raise ValueError(f'invalid setting sort: {aa_sort}')
if aa_ext and aa_ext not in traits.custom['ext']:
raise ValueError(f'invalid setting ext: {aa_ext}')
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
lang = traits.get_language(params["language"], traits.all_locale) # type: ignore
args = {
'lang': lang,
'content': aa_content,
'ext': aa_ext,
'sort': aa_sort,
'q': query,
'page': params['pageno'],
}
# filter out None and empty values
filtered_args = dict((k, v) for k, v in args.items() if v)
params["url"] = f"{base_url}/search?{urlencode(filtered_args)}"
return params
def response(resp) -> List[Dict[str, Optional[str]]]:
results: List[Dict[str, Optional[str]]] = []
dom = html.fromstring(resp.text)
for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
results.append(_get_result(item))
    # The rendering of the WEB page is very strange; except for the first
    # result, all other results on Anna's page are enclosed in SGML comments.
# These comments are *uncommented* by some JS code, see query of class
# '.js-scroll-hidden' in Anna's HTML template:
# https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
item = html.fromstring(item.xpath('./comment()')[0].text)
results.append(_get_result(item))
return results
def _get_result(item):
return {
'template': 'paper.html',
'url': base_url + extract_text(eval_xpath_getindex(item, './@href', 0)),
'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
'thumbnail': extract_text(eval_xpath_getindex(item, './/img/@src', 0, default=None), allow_none=True),
}
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and other search arguments from Anna's search form."""
# pylint: disable=import-outside-toplevel
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
engine_traits.all_locale = ''
engine_traits.custom['content'] = []
engine_traits.custom['ext'] = []
engine_traits.custom['sort'] = []
resp = get(base_url + '/search')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Anna's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {}
for x in eval_xpath_list(dom, "//form//input[@name='lang']"):
eng_lang = x.get("value")
if eng_lang in ('', '_empty', 'nl-BE', 'und') or eng_lang.startswith('anti__'):
continue
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
# silently ignore unknown languages
# print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = eng_lang
for x in eval_xpath_list(dom, "//form//input[@name='content']"):
if not x.get("value").startswith("anti__"):
engine_traits.custom['content'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//input[@name='ext']"):
if not x.get("value").startswith("anti__"):
engine_traits.custom['ext'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
engine_traits.custom['sort'].append(x.get("value"))
# for better diff; sort the persistence of these traits
engine_traits.custom['content'].sort()
engine_traits.custom['ext'].sort()
engine_traits.custom['sort'].sort()

81
searx/engines/ansa.py Normal file

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine for Ansa, Italy's oldest news agency.
To use this engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: ansa
engine: ansa
shortcut: ans
disabled: false
"""
from urllib.parse import urlencode
from lxml import html
from searx.result_types import EngineResults, MainResult
from searx.utils import eval_xpath, eval_xpath_list, extract_text
engine_type = 'online'
language_support = False
categories = ['news']
paging = True
page_size = 12
base_url = 'https://www.ansa.it'
time_range_support = True
time_range_args = {
'day': 1,
'week': 7,
'month': 31,
'year': 365,
}
# https://www.ansa.it/ricerca/ansait/search.shtml?start=0&any=houthi&periodo=&sort=data%3Adesc
search_api = 'https://www.ansa.it/ricerca/ansait/search.shtml?'
about = {
'website': 'https://www.ansa.it',
'wikidata_id': 'Q392934',
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'it',
}
def request(query, params):
query_params = {
'any': query,
'start': (params['pageno'] - 1) * page_size,
'sort': "data:desc",
}
if params['time_range']:
query_params['periodo'] = time_range_args.get(params['time_range'])
params['url'] = search_api + urlencode(query_params)
return params
def response(resp) -> EngineResults:
res = EngineResults()
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, "//div[@class='article']"):
res_obj = MainResult(
title=extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a")),
content=extract_text(eval_xpath(result, "./div[@class='content']/div[@class='text']")),
url=base_url + extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a/@href")),
)
thumbnail = extract_text(eval_xpath(result, "./div[@class='image']/a/img/@src"))
if thumbnail:
res_obj.thumbnail = base_url + thumbnail
res.append(res_obj)
return res


@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""APKMirror
"""
# pylint: disable=invalid-name
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://www.apkmirror.com',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
time_range_support = False
# search-url
base_url = 'https://www.apkmirror.com'
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'
def request(query, params):
params['url'] = search_url.format(
pageno=params['pageno'],
query=urlencode({'s': query}),
)
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):
link = eval_xpath_getindex(result, './/h5/a', 0)
url = base_url + link.attrib.get('href') + '#downloads'
title = extract_text(link)
thumbnail = base_url + eval_xpath_getindex(result, './/img/@src', 0)
res = {'url': url, 'title': title, 'thumbnail': thumbnail}
results.append(res)
return results


@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple App Store
"""
from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse
about = {
"website": 'https://www.apple.com/app-store/',
"wikidata_id": 'Q368215',
"official_api_documentation": (
'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
),
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['files', 'apps']
safesearch = True
search_url = 'https://itunes.apple.com/search?{query}'
def request(query, params):
explicit = "Yes"
if params['safesearch'] > 0:
explicit = "No"
params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))
return params
def response(resp):
results = []
json_result = loads(resp.text)
for result in json_result['results']:
results.append(
{
'url': result['trackViewUrl'],
'title': result['trackName'],
'content': result['description'],
'thumbnail': result['artworkUrl100'],
'publishedDate': parse(result['currentVersionReleaseDate']),
'author': result['sellerName'],
}
)
return results

112
searx/engines/apple_maps.py Normal file

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple Maps"""
from json import loads
from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
about = {
"website": 'https://www.apple.com/maps/',
"wikidata_id": 'Q276101',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
token = {'value': '', 'last_updated': None}
categories = ['map']
paging = False
search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"
def obtain_token():
update_time = time() - (time() % 1800)
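    # align the token timestamp to the current half-hour window (1800 s), so a
    # token is re-used for at most 30 minutes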
try:
# use duckduckgo's mapkit token
token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
actual_token = http_get(
'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
timeout=2.0,
headers={'Authorization': 'Bearer ' + token_response.text},
)
token['value'] = loads(actual_token.text)['authInfo']['access_token']
token['last_updated'] = update_time
# pylint: disable=bare-except
except:
pass
return token
def request(query, params):
if time() - (token['last_updated'] or 0) > 1800:
obtain_token()
params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))
params['headers'] = {'Authorization': 'Bearer ' + token['value']}
return params
def response(resp):
results = []
resp_json = loads(resp.text)
user_language = resp.search_params['language']
for result in resp_json['results']:
boundingbox = None
if 'displayMapRegion' in result:
box = result['displayMapRegion']
boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]
links = []
if 'telephone' in result:
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
)
if result.get('urls'):
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'url': url,
'url_label': url,
}
)
results.append(
{
'template': 'map.html',
'type': result.get('poiCategory'),
'title': result['name'],
'links': links,
'latitude': result['center']['lat'],
'longitude': result['center']['lng'],
'url': result['placecardUrl'],
'boundingbox': boundingbox,
'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
'address': {
'name': result['name'],
'house_number': result.get('subThoroughfare'),
'road': result.get('thoroughfare'),
'locality': result.get('locality'),
'postcode': result.get('postCode'),
'country': result.get('country'),
},
}
)
return results

154
searx/engines/archlinux.py Normal file

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use an official API: MediaWiki provides one, but
the Arch Wiki blocks access to it.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
main_wiki = 'wiki.archlinux.org'
def request(query, params):
sxng_lang = params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore
base_url = 'https://' + netloc + '/index.php?'
offset = (params['pageno'] - 1) * 20
if netloc == main_wiki:
eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore
query += ' (' + eng_lang + ')'
# wiki.archlinux.org is protected by anubis
# - https://github.com/searxng/searxng/issues/4646#issuecomment-2817848019
params['headers']['User-Agent'] = "SearXNG"
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params
def response(resp):
results = []
dom = lxml.html.fromstring(resp.text) # type: ignore
# get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
base_url = 'https://' + netloc + '/index.php?'
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
results.append(
{
'url': urljoin(base_url, link.get('href')), # type: ignore
'title': extract_text(link),
'content': content,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Archlinux-Wiki. The location of the Wiki address of a
language is mapped in a :py:obj:`custom field
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
on the location, the ``title`` argument in the request is translated.
.. code:: python
"custom": {
"wiki_netloc": {
"de": "wiki.archlinux.de",
# ...
"zh": "wiki.archlinuxcn.org"
}
"title": {
"de": "Spezial:Suche",
# ...
"zh": "Special:\u641c\u7d22"
},
},
"""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['title'] = {}
title_map = {
'de': 'Spezial:Suche',
'fa': 'ویژه:جستجو',
'ja': '特別:検索',
'zh': 'Special:搜索',
}
resp = get('https://wiki.archlinux.org/')
if not resp.ok: # type: ignore
print("ERROR: response from wiki.archlinux.org is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
# zh_Hans --> zh
sxng_tag = sxng_tag.split('_')[0]
netloc = urlparse(a.get('href')).netloc
if netloc != 'wiki.archlinux.org':
title = title_map.get(sxng_tag)
if not title:
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
continue
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
engine_traits.custom['title'][sxng_tag] = title # type: ignore
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
engine_traits.languages[sxng_tag] = eng_tag # type: ignore
engine_traits.languages['en'] = 'English'

67
searx/engines/artic.py Normal file

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The Art Institute of Chicago
Explore thousands of artworks from The Art Institute of Chicago.
* https://artic.edu
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
nb_per_page = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
def request(query, params):
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': nb_per_page,
}
)
params['url'] = search_api + args
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': "%(medium_display)s // %(dimensions)s" % result,
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'template': 'images.html',
}
)
return results

110
searx/engines/arxiv.py Normal file

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ArXiV (Scientific preprints)
"""
from datetime import datetime
from lxml import etree
from lxml.etree import XPath
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://arxiv.org',
"wikidata_id": 'Q118398',
"official_api_documentation": 'https://arxiv.org/help/api',
"use_official_api": True,
"require_api_key": False,
"results": 'XML-RSS',
}
categories = ['science', 'scientific publications']
paging = True
base_url = (
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)
# engine dependent config
number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
params['url'] = base_url.format(**string_args)
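    # e.g. (illustrative) query='gravity', pageno=2 yields:
    #   https://export.arxiv.org/api/query?search_query=all:gravity&start=10&max_results=10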
return params
def response(resp):
results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict)
return results

75
searx/engines/ask.py Normal file

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Ask.com"""
from urllib.parse import urlencode
import dateutil
from lxml import html
from searx import utils
# Metadata
about = {
"website": "https://www.ask.com/",
"wikidata_id": 'Q847564',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# Engine Configuration
categories = ['general']
paging = True
max_page = 5
"""Ask.com has at max 5 pages."""
# Base URL
base_url = "https://www.ask.com/web"
def request(query, params):
query_params = {
"q": query,
"page": params["pageno"],
}
params["url"] = f"{base_url}?{urlencode(query_params)}"
return params
def response(resp):
start_tag = 'window.MESON.initialState = {'
end_tag = '}};'
dom = html.fromstring(resp.text)
script = utils.eval_xpath_getindex(dom, '//script', 0, default=None).text
pos = script.index(start_tag) + len(start_tag) - 1
script = script[pos:]
pos = script.index(end_tag) + len(end_tag) - 1
script = script[:pos]
json_resp = utils.js_variable_to_python(script)
results = []
for item in json_resp['search']['webResults']['results']:
pubdate_original = item.get('pubdate_original')
if pubdate_original:
pubdate_original = dateutil.parser.parse(pubdate_original)
metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]
results.append(
{
"url": item['url'].split('&ueid')[0],
"title": item['title'],
"content": item['abstract'],
"publishedDate": pubdate_original,
# "thumbnail": item.get('image_url') or None, # these are not thumbs / to large
"metadata": ' | '.join(metadata),
}
)
return results


@@ -0,0 +1,93 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
The Astrophysics Data System (ADS) is a digital library portal for researchers in astronomy and physics,
operated by the Smithsonian Astrophysical Observatory (SAO) under a NASA grant.
The engine is adapted from the solr engine.
"""
# pylint: disable=global-statement
from datetime import datetime
from json import loads
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://ui.adsabs.harvard.edu/',
"wikidata_id": 'Q752099',
"official_api_documentation": 'https://ui.adsabs.harvard.edu/help/api/api-docs.html',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
base_url = 'https://api.adsabs.harvard.edu/v1/search'
result_base_url = 'https://ui.adsabs.harvard.edu/abs/'
rows = 10
sort = '' # sorting: asc or desc
field_list = ['bibcode', 'author', 'title', 'abstract', 'doi', 'date'] # list of field names to display on the UI
default_fields = '' # default field to query
query_fields = '' # query fields
paging = True
api_key = 'unset'
def init(_):
if api_key == 'unset':
raise SearxEngineAPIException('missing ADS API key')
def request(query, params):
query_params = {'q': query, 'rows': rows}
if field_list:
query_params['fl'] = ','.join(field_list)
if query_fields:
query_params['qf'] = ','.join(query_fields)
if default_fields:
query_params['df'] = default_fields
if sort:
query_params['sort'] = sort
query_params['start'] = rows * (params['pageno'] - 1)
params['headers']['Authorization'] = f'Bearer {api_key}'
params['url'] = f"{base_url}/query?{urlencode(query_params)}"
return params
def response(resp):
try:
resp_json = loads(resp.text)
except Exception as e:
raise SearxEngineAPIException("failed to parse response") from e
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json['error']['msg'])
resp_json = resp_json["response"]
result_len = resp_json["numFound"]
results = []
for res in resp_json["docs"]:
author = res.get("author")
if author:
author = author[0] + ' et al.'
results.append(
{
'url': result_base_url + res.get("bibcode") + "/",
'title': res.get("title")[0],
'author': author,
'content': res.get("abstract"),
'doi': res.get("doi"),
'publishedDate': datetime.fromisoformat(res.get("date")),
}
)
results.append({'number_of_results': result_len})
return results

182
searx/engines/baidu.py Normal file

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Baidu_
.. _Baidu: https://www.baidu.com
"""
# There exists a https://github.com/ohblue/baidu-serp-api/
# but we don't use it here (maybe we can learn from it).
from urllib.parse import urlencode
from datetime import datetime
from html import unescape
import time
import json
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
about = {
"website": "https://www.baidu.com",
"wikidata_id": "Q14772",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"language": "zh",
}
paging = True
categories = []
results_per_page = 10
baidu_category = 'general'
time_range_support = True
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
def init(_):
if baidu_category not in ('general', 'images', 'it'):
raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
def request(query, params):
page_num = params["pageno"]
category_config = {
'general': {
'endpoint': 'https://www.baidu.com/s',
'params': {
"wd": query,
"rn": results_per_page,
"pn": (page_num - 1) * results_per_page,
"tn": "json",
},
},
'images': {
'endpoint': 'https://image.baidu.com/search/acjson',
'params': {
"word": query,
"rn": results_per_page,
"pn": (page_num - 1) * results_per_page,
"tn": "resultjson_com",
},
},
'it': {
'endpoint': 'https://kaifa.baidu.com/rest/v1/search',
'params': {
"wd": query,
"pageSize": results_per_page,
"pageNum": page_num,
"paramList": f"page_num={page_num},page_size={results_per_page}",
"position": 0,
},
},
}
query_params = category_config[baidu_category]['params']
query_url = category_config[baidu_category]['endpoint']
if params.get("time_range") in time_range_dict:
now = int(time.time())
past = now - time_range_dict[params["time_range"]]
if baidu_category == 'general':
query_params["gpc"] = f"stf={past},{now}|stftype=1"
if baidu_category == 'it':
query_params["paramList"] += f",timestamp_range={past}-{now}"
params["url"] = f"{query_url}?{urlencode(query_params)}"
return params
def response(resp):
text = resp.text
if baidu_category == 'images':
        # baidu's JSON encoder wrongly escapes the / and ' characters as \/ and \'
text = text.replace(r"\/", "/").replace(r"\'", "'")
data = json.loads(text, strict=False)
parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it}
return parsers[baidu_category](data)
def parse_general(data):
results = []
if not data.get("feed", {}).get("entry"):
raise SearxEngineAPIException("Invalid response")
for entry in data["feed"]["entry"]:
if not entry.get("title") or not entry.get("url"):
continue
published_date = None
if entry.get("time"):
try:
published_date = datetime.fromtimestamp(entry["time"])
except (ValueError, TypeError):
published_date = None
        # title and content sometimes contain HTML entities such as &amp; &#39; &quot; etc.
title = unescape(entry["title"])
content = unescape(entry.get("abs", ""))
results.append(
{
"title": title,
"url": entry["url"],
"content": content,
"publishedDate": published_date,
}
)
return results
def parse_images(data):
results = []
if "data" in data:
for item in data["data"]:
if not item:
# the last item in the JSON list is empty, the JSON string ends with "}, {}]"
continue
replace_url = item.get("replaceUrl", [{}])[0]
width = item.get("width")
height = item.get("height")
img_date = item.get("bdImgnewsDate")
publishedDate = None
if img_date:
publishedDate = datetime.strptime(img_date, "%Y-%m-%d %H:%M")
results.append(
{
"template": "images.html",
"url": replace_url.get("FromURL"),
"thumbnail_src": item.get("thumbURL"),
"img_src": replace_url.get("ObjURL"),
"title": html_to_text(item.get("fromPageTitle")),
"source": item.get("fromURLHost"),
"resolution": f"{width} x {height}",
"img_format": item.get("type"),
"filesize": item.get("filesize"),
"publishedDate": publishedDate,
}
)
return results
def parse_it(data):
results = []
if not data.get("data", {}).get("documents", {}).get("data"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["documents"]["data"]:
results.append(
{
'title': entry["techDocDigest"]["title"],
'url': entry["techDocDigest"]["url"],
'content': entry["techDocDigest"]["summary"],
}
)
return results

81
searx/engines/bandcamp.py Normal file

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bandcamp (Music)
@website https://bandcamp.com/
@provide-api no
@results HTML
@parse url, title, content, publishedDate, iframe_src, thumbnail
"""
from urllib.parse import urlencode, urlparse, parse_qs
from dateutil.parser import parse as dateparse
from lxml import html
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
"website": 'https://bandcamp.com/',
"wikidata_id": 'Q545966',
"official_api_documentation": 'https://bandcamp.com/developer',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['music']
paging = True
base_url = "https://bandcamp.com/"
search_string = 'search?{query}&page={page}'
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'):
link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None)
if link is None:
continue
title = result.xpath('.//div[@class="heading"]/a/text()')
content = result.xpath('.//div[@class="subhead"]/text()')
new_result = {
"url": extract_text(link),
"title": extract_text(title),
"content": extract_text(content),
}
date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None)
if date:
new_result["publishedDate"] = dateparse(date.replace("released ", ""))
thumbnail = result.xpath('.//div[@class="art"]/img/@src')
if thumbnail:
new_result['thumbnail'] = thumbnail[0]
result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
itemtype = extract_text(result.xpath('.//div[@class="itemtype"]')).lower()
if "album" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id)
elif "track" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id)
results.append(new_result)
return results

118
searx/engines/base.py Executable file

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BASE (Scholar publications)
"""
from datetime import datetime
import re
from urllib.parse import urlencode
from lxml import etree
from searx.utils import searx_useragent
# about
about = {
"website": 'https://base-search.net',
"wikidata_id": 'Q448335',
"official_api_documentation": 'https://api.base-search.net/',
"use_official_api": True,
"require_api_key": False,
"results": 'XML',
}
categories = ['science']
base_url = (
'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'
+ '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
)
# engine dependent config
paging = True
number_of_results = 10
# shortcuts for advanced search
shortcut_dict = {
# user-friendly keywords
'format:': 'dcformat:',
'author:': 'dccreator:',
'collection:': 'dccollection:',
'hdate:': 'dchdate:',
'contributor:': 'dccontributor:',
'coverage:': 'dccoverage:',
'date:': 'dcdate:',
'abstract:': 'dcdescription:',
'urls:': 'dcidentifier:',
'language:': 'dclanguage:',
'publisher:': 'dcpublisher:',
'relation:': 'dcrelation:',
'rights:': 'dcrights:',
'source:': 'dcsource:',
'subject:': 'dcsubject:',
'title:': 'dctitle:',
'type:': 'dcdctype:',
}
def request(query, params):
# replace shortcuts with API advanced search keywords
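    # e.g. (illustrative) 'author:doe relativity' becomes 'dccreator:doe relativity'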
for key, val in shortcut_dict.items():
query = re.sub(key, val, query)
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = {
'query': urlencode({'query': query}),
'offset': offset,
'hits': number_of_results,
}
params['url'] = base_url.format(**string_args)
params['headers']['User-Agent'] = searx_useragent()
return params
def response(resp):
results = []
search_results = etree.XML(resp.content)
for entry in search_results.xpath('./result/doc'):
content = "No description available"
url = ""
title = ""
date = datetime.now() # needed in case no dcdate is available for an item
for item in entry:
if item.attrib["name"] == "dcdate":
date = item.text
elif item.attrib["name"] == "dctitle":
title = item.text
elif item.attrib["name"] == "dclink":
url = item.text
elif item.attrib["name"] == "dcdescription":
content = item.text[:300]
if len(item.text) > 300:
content += "..."
        # dates returned by the BASE API come in several formats
publishedDate = None
for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
try:
publishedDate = datetime.strptime(date, date_format)
break
except: # pylint: disable=bare-except
pass
if publishedDate is not None:
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
else:
res_dict = {'url': url, 'title': title, 'content': content}
results.append(res_dict)
return results

96
searx/engines/bilibili.py Normal file

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bilibili is a Chinese video sharing website.
.. _Bilibili: https://www.bilibili.com
"""
import random
import string
from urllib.parse import urlencode
from datetime import datetime, timedelta
from searx import utils
# Engine metadata
about = {
"website": "https://www.bilibili.com",
"wikidata_id": "Q3077586",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
# Engine configuration
paging = True
results_per_page = 20
categories = ["videos"]
# Search URL
base_url = "https://api.bilibili.com/x/web-interface/search/type"
cookie = {
"innersign": "0",
"buvid3": "".join(random.choice(string.hexdigits) for _ in range(16)) + "infoc",
"i-wanna-go-back": "-1",
"b_ut": "7",
"FEED_LIVE_VERSION": "V8",
"header_theme_version": "undefined",
"home_feed_column": "4",
}
def request(query, params):
query_params = {
"__refresh__": "true",
"page": params["pageno"],
"page_size": results_per_page,
"single_column": "0",
"keyword": query,
"search_type": "video",
}
params["url"] = f"{base_url}?{urlencode(query_params)}"
params["cookies"] = cookie
return params
def response(resp):
search_res = resp.json()
results = []
for item in search_res.get("data", {}).get("result", []):
title = utils.html_to_text(item["title"])
url = item["arcurl"]
thumbnail = item["pic"]
description = item["description"]
author = item["author"]
video_id = item["aid"]
unix_date = item["pubdate"]
formatted_date = datetime.fromtimestamp(unix_date)
# the duration only seems to be valid if the video is less than 60 mins
duration = utils.parse_duration_string(item["duration"])
if duration and duration > timedelta(minutes=60):
duration = None
iframe_url = f"https://player.bilibili.com/player.html?aid={video_id}&high_quality=1&autoplay=false&danmaku=0"
results.append(
{
"title": title,
"url": url,
"content": description,
"author": author,
"publishedDate": formatted_date,
"length": duration,
"thumbnail": thumbnail,
"iframe_src": iframe_url,
"template": "videos.html",
}
)
return results

284
searx/engines/bing.py Normal file

@@ -0,0 +1,284 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see the
sections LANGUAGE and COUNTRY/REGION). The language is the language of the UI;
we need it in SearXNG to get the translations of data such as *"published last
week"*.
There is a description of the official search-APIs_; unfortunately this is not
the API we can use, nor the one Bing itself uses. You can look up some things
in the API to get a better picture of Bing, but the value specifications, like
the market codes, are usually outdated or at least no longer used by Bing
itself.
The market codes have been harmonized and are identical for web, video and
images. The news area has also been harmonized with the other categories. Only
political adjustments still seem to be made -- for example, there is no news
category for the Chinese market.
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
"""
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import base64
import re
import time
from urllib.parse import parse_qs, urlencode, urlparse
from lxml import html
import babel
import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
traits: EngineTraits
about = {
"website": 'https://www.bing.com',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 200
"""200 pages maximum (``&first=1991``)"""
time_range_support = True
safesearch = True
"""Bing results are always SFW. To get NSFW links from bing some age
verification by a cookie is needed / thats not possible in SearXNG.
"""
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
def _page_offset(pageno):
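    # e.g. pageno 1 -> first=1, pageno 2 -> first=11, pageno 3 -> first=21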
return (int(pageno) - 1) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region):
params['cookies']['_EDGE_CD'] = f'm={engine_region}&u={engine_language}'
params['cookies']['_EDGE_S'] = f'mkt={engine_region}&ui={engine_language}'
logger.debug("bing cookies: %s", params['cookies'])
def request(query, params):
"""Assemble a Bing-Web request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
page = params.get('pageno', 1)
query_params = {
'q': query,
# if arg 'pq' is missing, sometimes on page 4 we get results from page 1;
# don't ask why it happens only sometimes, it's M$ and they have never
# been deterministic ;)
'pq': query,
}
# To get the correct page, the arg 'first' and the arg 'FORM' are needed; the
# value PERE is used on page 2, on page 3 it's PERE1, on page 4 it's PERE2
# .. and so forth. The 'first' arg should never be sent on page 1.
if page > 1:
query_params['first'] = _page_offset(page) # see also arg FORM
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
params['url'] = f'{base_url}?{urlencode(query_params)}'
if params.get('time_range'):
unix_day = int(time.time() / 86400)
time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'
return params
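# Illustrative sketch (not part of the upstream engine): mirrors the paging
# logic above and shows the 'first'/'FORM' pairs produced for the first pages.
def _bing_page_args(page: int) -> dict:
    # no 'first'/'FORM' on page 1; PERE on page 2, PERE1 on page 3, ...
    args = {}
    if page > 1:
        args['first'] = _page_offset(page)
        args['FORM'] = 'PERE' if page == 2 else 'PERE%s' % (page - 2)
    return args

assert _bing_page_args(1) == {}
assert _bing_page_args(2) == {'first': 11, 'FORM': 'PERE'}
assert _bing_page_args(4) == {'first': 31, 'FORM': 'PERE2'}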
def response(resp):
# pylint: disable=too-many-locals
results = []
result_len = 0
dom = html.fromstring(resp.text)
# parse results again if nothing is found yet
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = eval_xpath(result, './/p')
for p in content:
# Make sure that the element is free of:
# <span class="algoSlug_icon" # data-priority="2">Web</span>
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL
if url.startswith('https://www.bing.com/ck/a?'):
# get the first value of u parameter
url_query = urlparse(url).query
parsed_url_query = parse_qs(url_query)
param_u = parsed_url_query["u"][0]
# remove "a1" in front
encoded_url = param_u[2:]
# add padding
encoded_url = encoded_url + '=' * (-len(encoded_url) % 4)
# decode base64 encoded URL
url = base64.urlsafe_b64decode(encoded_url).decode()
# append result
results.append({'url': url, 'title': title, 'content': content})
# get number_of_results
if results:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
start_str, result_len_container = re.split(r'-\d+', result_len_container)
start = int(start_str)
else:
start = 1
result_len_container = re.sub('[^0-9]', '', result_len_container)
if len(result_len_container) > 0:
result_len = int(result_len_container)
expected_start = _page_offset(resp.search_params.get("pageno", 1))
if expected_start != start:
if expected_start > result_len:
# Avoid reading more results than available.
# For example, if there are 100 results for some search and we try to get results from 120 to 130,
# Bing will send back the results from 0 to 10 and no error.
# If we compare the result count with the 'first' parameter of the request we can avoid these
# "invalid" results.
return []
# Sometimes Bing will send back the first result page instead of the requested page as a rate limiting
# measure.
msg = f"Expected results to start at {expected_start}, but got results starting at {start}"
raise SearxEngineAPIException(msg)
results.append({'number_of_results': result_len})
return results
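# Illustrative sketch (not part of the upstream engine): the base64 round trip
# for '/ck/a' redirects, using a hypothetical redirect URL whose 'u' value
# encodes "https://example.com/".
def _decode_ck_redirect_example():
    redirect = 'https://www.bing.com/ck/a?!&&p=abc&u=a1aHR0cHM6Ly9leGFtcGxlLmNvbS8&ntb=1'
    param_u = parse_qs(urlparse(redirect).query)['u'][0]
    encoded = param_u[2:]                 # drop the 'a1' prefix
    encoded += '=' * (-len(encoded) % 4)  # restore stripped padding
    assert base64.urlsafe_b64decode(encoded).decode() == 'https://example.com/'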
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.utils import gen_useragent
headers = {
"User-Agent": gen_useragent(),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-GPC": "1",
"Cache-Control": "max-age=0",
}
resp = get("https://www.bing.com/account/general", headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from bing is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# languages
engine_traits.languages['zh'] = 'zh-hans'
map_lang = {'prs': 'fa-AF', 'en': 'en-us'}
bing_ui_lang_map = {
# HINT: this list probably needs to be supplemented
'en': 'us', # en --> en-us
'da': 'dk', # da --> da-dk
}
for href in eval_xpath(dom, '//div[@id="language-section-content"]//div[@class="languageItem"]/a/@href'):
eng_lang = parse_qs(urlparse(href).query)['setlang'][0]
babel_lang = map_lang.get(eng_lang, eng_lang)
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace('-', '_')))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (babel_lang))
continue
# Language (e.g. 'en' or 'de') from https://www.bing.com/account/general
# is converted by bing to 'en-us' or 'de-de'. But only if there is not
# already a '-' delimiter in the language. For instance 'pt-PT' -->
# 'pt-pt' and 'pt-br' --> 'pt-br'
bing_ui_lang = eng_lang.lower()
if '-' not in bing_ui_lang:
bing_ui_lang = bing_ui_lang + '-' + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang)
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != bing_ui_lang:
print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}")
continue
engine_traits.languages[sxng_tag] = bing_ui_lang
# regions (aka "market codes")
engine_traits.regions['zh-CN'] = 'zh-cn'
map_market_codes = {
'zh-hk': 'en-hk', # not sure why, but at M$ this is the market code for Hong Kong
}
for href in eval_xpath(dom, '//div[@id="region-section-content"]//div[@class="regionItem"]/a/@href'):
cc_tag = parse_qs(urlparse(href).query)['cc'][0]
if cc_tag == 'clear':
engine_traits.all_locale = cc_tag
continue
# add market codes from official languages of the country ..
for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True):
if lang_tag not in engine_traits.languages.keys():
# print("ignore lang: %s <-- %s" % (cc_tag, lang_tag))
continue
lang_tag = lang_tag.split('_')[0] # zh_Hant --> zh
market_code = f"{lang_tag}-{cc_tag}" # zh-tw
market_code = map_market_codes.get(market_code, market_code)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, cc_tag.upper())))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != market_code:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code))
continue
engine_traits.regions[sxng_tag] = market_code

109
searx/engines/bing_images.py Normal file
View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/images',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
time_map = {
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
}
def request(query, params):
"""Assemble a Bing-Image request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
# - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35
query_params = {
'q': query,
'async': '1',
# to simplify the page count let's use the default of 35 images per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
}
# time range
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
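# Illustrative sketch (not part of the upstream engine): the URL assembled for
# page 2 of a (made-up) 'foo' query with time range 'week' (10080 minutes).
def _example_image_url():
    query_params = {
        'q': 'foo',
        'async': '1',
        'first': (2 - 1) * 35 + 1,  # pageno 2 -> 36
        'count': 35,
        'qft': 'filterui:age-lt%s' % time_map['week'],  # age-lt10080
    }
    return base_url + '?' + urlencode(query_params)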
def response(resp):
"""Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip().split(" · ")
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append(
{
'template': 'images.html',
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata.get('desc'),
'title': title,
'source': source,
'resolution': img_format[0],
'img_format': img_format[1] if len(img_format) >= 2 else None,
}
)
return results

160
searx/engines/bing_news.py Normal file
View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bing-News: description see :py:obj:`searx.engines.bing`.
.. hint::
Bing News is *different* in some ways!
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/news',
"wikidata_id": 'Q2878637',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'RSS',
}
# engine dependent config
categories = ['news']
paging = True
"""If go through the pages and there are actually no new results for another
page, then bing returns the results from the last page again."""
time_range_support = True
time_map = {
'day': 'interval="4"',
'week': 'interval="7"',
'month': 'interval="9"',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
Bing does not have news range ``year`` / we use ``month`` instead."""
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
def request(query, params):
"""Assemble a Bing-News request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
page = int(params.get('pageno', 1)) - 1
query_params = {
'q': query,
'InfiniteScroll': 1,
# to simplify the page count let's use the default of 10 results per page
'first': page * 10 + 1,
'SFX': page,
'form': 'PTFTNR',
'setlang': engine_region.split('-')[0],
'cc': engine_region.split('-')[-1],
}
if params['time_range']:
query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')
params['url'] = base_url + '?' + urlencode(query_params)
return params
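# Illustrative sketch (not part of the upstream engine): the paging arithmetic
# for page 3 in a hypothetical region 'de-de'.
def _example_news_args():
    page = 3 - 1  # zero based
    engine_region = 'de-de'
    return {
        'q': 'london',
        'InfiniteScroll': 1,
        'first': page * 10 + 1,  # 21
        'SFX': page,             # 2
        'form': 'PTFTNR',
        'setlang': engine_region.split('-')[0],   # 'de'
        'cc': engine_region.split('-')[-1],       # 'de'
    }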
def response(resp):
"""Get response from Bing-Video"""
results = []
if not resp.ok or not resp.text:
return results
dom = html.fromstring(resp.text)
for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):
link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))
metadata = []
source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
if source is not None:
for item in (
eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
# eval_xpath_getindex(source, './/a', 0, None),
# eval_xpath_getindex(source, './div/span', 3, None),
link.attrib.get('data-author'),
):
if item is not None:
t = extract_text(item)
if t and t.strip():
metadata.append(t.strip())
metadata = ' | '.join(metadata)
thumbnail = None
imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
if imagelink is not None:
thumbnail = imagelink.attrib.get('src')
if not thumbnail.startswith("https://www.bing.com"):
thumbnail = 'https://www.bing.com/' + thumbnail
results.append(
{
'url': url,
'title': title,
'content': content,
'thumbnail': thumbnail,
'metadata': metadata,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
# pylint: disable=import-outside-toplevel
from searx.engines.bing import fetch_traits as _f
_f(engine_traits)
# fix market codes not known by bing news:
# In bing the market code 'zh-cn' exists, but there is no 'news' category in
# bing for this market. Instead we use the market code from Hong
# Kong. Even if this is not correct, it is better than having no hits at
# all, or sending false queries to bing that could raise the suspicion of a
# bot.
# HINT: 'en-hk' is the region code; it does not indicate the language en!
engine_traits.regions['zh-CN'] = 'en-hk'

98
searx/engines/bing_videos.py Normal file
View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
"""
from typing import TYPE_CHECKING
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
from searx.engines.bing_images import time_map
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com/videos',
"wikidata_id": 'Q4914152',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
def request(query, params):
"""Assemble a Bing-Video request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
query_params = {
'q': query,
'async': 'content',
# to simplify the page count let's use the default of 35 results per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
}
# time range
#
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append(
{
'url': metadata['murl'],
'thumbnail': thumbnail,
'title': metadata.get('vt', ''),
'content': content,
'template': 'videos.html',
}
)
return results

56
searx/engines/bitchute.py Normal file
View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""bitchute (Videos)"""
from json import dumps
from datetime import datetime
from searx.utils import html_to_text
about = {
"website": 'https://bitchute.com',
"wikidata_id": "Q45287179",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
base_url = "https://api.bitchute.com/api/beta/search/videos"
categories = ['videos']
paging = True
results_per_page = 20
def request(query, params):
start_index = (params["pageno"] - 1) * results_per_page
data = {"offset": start_index, "limit": results_per_page, "query": query, "sensitivity_id": "normal", "sort": "new"}
params["url"] = base_url
params["method"] = 'POST'
params['headers']['content-type'] = "application/json"
params['data'] = dumps(data)
return params
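# Illustrative sketch (not part of the upstream engine): the JSON body POSTed
# for page 2 of a made-up query.
def _example_body():
    pageno = 2
    return dumps({
        "offset": (pageno - 1) * results_per_page,  # 20
        "limit": results_per_page,
        "query": "open source",
        "sensitivity_id": "normal",
        "sort": "new",
    })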
def response(resp):
search_res = resp.json()
results = []
for item in search_res.get('videos', []):
results.append(
{
"title": item['video_name'],
"url": 'https://www.bitchute.com/video/' + item['video_id'],
"content": html_to_text(item['description']),
"author": item['channel']['channel_name'],
"publishedDate": datetime.strptime(item["date_published"], "%Y-%m-%dT%H:%M:%S.%fZ"),
"length": item['duration'],
"views": item['view_count'],
"thumbnail": item['thumbnail_url'],
"iframe_src": 'https://www.bitchute.com/embed/' + item['video_id'],
"template": "videos.html",
}
)
return results

67
searx/engines/bpb.py Normal file
View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BPB refers to ``Bundeszentrale für poltische Bildung``, which is a German
governmental institution aiming to reduce misinformation by providing resources
about politics and history.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.bpb.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = ['general']
base_url = "https://www.bpb.de"
def request(query, params):
args = {
'query[term]': query,
'page': params['pageno'] - 1,
'sort[direction]': 'descending',
'payload[nid]': 65350,
}
params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
return params
def response(resp):
results = []
json_resp = resp.json()
for result in json_resp['teaser']:
thumbnail = None
if result['teaser']['image']:
thumbnail = base_url + result['teaser']['image']['sources'][-1]['url']
metadata = result['extension']['overline']
authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
if authors:
metadata += f" | {authors}"
publishedDate = None
if result['extension'].get('publishingDate'):
publishedDate = datetime.fromtimestamp(result['extension']['publishingDate'])
results.append(
{
'url': base_url + result['teaser']['link']['url'],
'title': result['teaser']['title'],
'content': result['teaser']['text'],
'thumbnail': thumbnail,
'publishedDate': publishedDate,
'metadata': metadata,
}
)
return results

505
searx/engines/brave.py Normal file
View File

@@ -0,0 +1,505 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).
Configured ``brave`` engines:
.. code:: yaml
- name: brave
engine: brave
...
brave_category: search
time_range_support: true
paging: true
- name: brave.images
engine: brave
...
brave_category: images
- name: brave.videos
engine: brave
...
brave_category: videos
- name: brave.news
engine: brave
...
brave_category: news
- name: brave.goggles
time_range_support: true
paging: true
...
brave_category: goggles
.. _brave regions:
Brave regions
=============
Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
locales. To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):
.. code:: python
"regions": {
..
"en-CA": "ca",
"fr-CA": "ca",
..
}
.. note::
The language (aka region) support of Brave's index is limited to very basic
languages. The search results for languages like Chinese or Arabic are of
low quality.
.. _brave goggles:
Brave Goggles
=============
.. _list of Goggles: https://search.brave.com/goggles/discover
.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
Goggles allow you to choose, alter, or extend the ranking of Brave Search
results (`Goggles Whitepaper`_). Goggles are openly developed by the community
of Brave Search users.
Select from the `list of Goggles`_ people have published, or create your own
(`Goggles Quickstart`_).
.. _brave languages:
Brave languages
===============
Brave's language support is limited to the UI (menus, area local notations,
etc.). Brave's index only seems to support one locale; it does not seem to
support multiple languages in its index. The choice of available languages is
very small (and it's not clear to me what difference in the UI there is when
switching from en-us to en-ca or en-gb).
In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
UI languages are stored in a custom field named ``ui_lang``:
.. code:: python
"custom": {
"ui_lang": {
"ca": "ca",
"de-DE": "de-de",
"en-CA": "en-ca",
"en-GB": "en-gb",
"en-US": "en-us",
"es": "es",
"fr-CA": "fr-ca",
"fr-FR": "fr-fr",
"ja-JP": "ja-jp",
"pt-BR": "pt-br",
"sq-AL": "sq-al"
}
},
Implementations
===============
"""
from typing import Any, TYPE_CHECKING
from urllib.parse import (
urlencode,
urlparse,
)
from dateutil import parser
from lxml import html
from searx import locales
from searx.utils import (
extr,
extract_text,
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
js_variable_to_python,
get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
from searx.result_types import EngineResults
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://search.brave.com/',
"wikidata_id": 'Q22906900',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
Goggles = Any
"""Brave supports common web-search, videos, images, news, and goggles search.
- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""
brave_spellcheck = False
"""Brave supports some kind of spell checking. When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
the UI of Brave the user gets warned about this; since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""
send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be save max is set to 10. Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
time_range_map = {
'day': 'pd',
'week': 'pw',
'month': 'pm',
'year': 'py',
}
def request(query, params):
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
params['headers']['Accept-Encoding'] = 'gzip, deflate'
args = {
'q': query,
'source': 'web',
}
if brave_spellcheck:
args['spellcheck'] = '1'
if brave_category in ('search', 'goggles'):
if params.get('pageno', 1) - 1:
args['offset'] = params.get('pageno', 1) - 1
if time_range_map.get(params['time_range']):
args['tf'] = time_range_map.get(params['time_range'])
if brave_category == 'goggles':
args['goggles_id'] = Goggles
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
# set properties in the cookies
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
# the useLocation is IP based, we use cookie 'country' for the region
params['cookies']['useLocation'] = '0'
params['cookies']['summarizer'] = '0'
engine_region = traits.get_region(params['searxng_locale'], 'all')
params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
params['cookies']['ui_lang'] = ui_lang
logger.debug("cookies %s", params['cookies'])
params['headers']['Sec-Fetch-Dest'] = "document"
params['headers']['Sec-Fetch-Mode'] = "navigate"
params['headers']['Sec-Fetch-Site'] = "same-origin"
params['headers']['Sec-Fetch-User'] = "?1"
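# Illustrative sketch (not part of the upstream engine): the request arguments
# for brave_category='search', page 3, time range 'week'; the cookie values
# shown here are illustrative.
def _example_brave_request():
    args = {'q': 'foo', 'source': 'web', 'offset': 3 - 1, 'tf': time_range_map['week']}
    cookies = {'safesearch': safesearch_map[1], 'useLocation': '0',
               'summarizer': '0', 'country': 'us', 'ui_lang': 'en-us'}
    return f"{base_url}search?{urlencode(args)}", cookies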
def _extract_published_date(published_date_raw):
if published_date_raw is None:
return None
try:
return parser.parse(published_date_raw)
except parser.ParserError:
return None
def response(resp) -> EngineResults:
if brave_category in ('search', 'goggles'):
return _parse_search(resp)
if brave_category == 'news':
return _parse_news(resp)
# Example script source containing the data:
#
# kit.start(app, element, {
# node_ids: [0, 19],
# data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
json_data = js_variable_to_python(js_object)
# json_data is a list; at the second position (index 1) we find the "response" data we need ..
json_resp = json_data[1]['data']['body']['response']
if brave_category == 'images':
return _parse_images(json_resp)
if brave_category == 'videos':
return _parse_videos(json_resp)
raise ValueError(f"Unsupported brave category: {brave_category}")
def _parse_search(resp) -> EngineResults:
result_list = EngineResults()
dom = html.fromstring(resp.text)
# I doubt that Brave is still providing the "answer" class / I haven't seen
# answers in brave for a long time.
answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
if answer_tag:
url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
answer = extract_text(answer_tag)
if answer is not None:
result_list.add(result_list.types.Answer(answer=answer, url=url))
# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
xpath_results = '//div[contains(@class, "snippet ")]'
for result in eval_xpath_list(dom, xpath_results):
url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
title_tag = eval_xpath_getindex(
result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
)
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
continue
content: str = extract_text(
eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
) # type: ignore
pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
pub_date = _extract_published_date(pub_date_raw)
if pub_date and content.startswith(pub_date_raw):
# drop the date prefix (str.lstrip would strip a character *set*, not the prefix)
content = content[len(pub_date_raw):].strip("- \n\t")
thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
item = {
'url': url,
'title': extract_text(title_tag),
'content': content,
'publishedDate': pub_date,
'thumbnail': thumbnail,
}
video_tag = eval_xpath_getindex(
result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
)
if video_tag is not None:
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
pub_date_raw = extract_text(
eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
)
item['publishedDate'] = _extract_published_date(pub_date_raw)
else:
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
result_list.append(item)
return result_list
def _parse_news(resp) -> EngineResults:
result_list = EngineResults()
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
if url is None:
continue
title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
item = {
"url": url,
"title": title,
"content": content,
"thumbnail": thumbnail,
}
result_list.append(item)
return result_list
def _parse_images(json_resp) -> EngineResults:
result_list = EngineResults()
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
'template': 'images.html',
'resolution': result['properties']['format'],
'source': result['source'],
'img_src': result['properties']['url'],
'thumbnail_src': result['thumbnail']['src'],
}
result_list.append(item)
return result_list
def _parse_videos(json_resp) -> EngineResults:
result_list = EngineResults()
for result in json_resp["results"]:
url = result['url']
item = {
'url': url,
'title': result['title'],
'content': result['description'],
'template': 'videos.html',
'length': result['video']['duration'],
'duration': result['video']['duration'],
'publishedDate': _extract_published_date(result['age']),
}
if result['thumbnail'] is not None:
item['thumbnail'] = result['thumbnail']['src']
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
result_list.append(item)
return result_list
def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
regions>` from Brave."""
# pylint: disable=import-outside-toplevel, too-many-branches
import babel.languages
from searx.locales import region_tag, language_tag
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom["ui_lang"] = {}
headers = {
'Accept-Encoding': 'gzip, deflate',
}
lang_map = {'no': 'nb'} # Norway
# languages (UI)
resp = get('https://search.brave.com/settings', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for option in dom.xpath('//section//option[@value="en-us"]/../option'):
ui_lang = option.get('value')
try:
l = babel.Locale.parse(ui_lang, sep='-')
if l.territory:
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
else:
sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep='-'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
continue
conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
if conflict:
if conflict != ui_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
continue
engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
# search regions of brave
resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore
country_js = country_js[: country_js.index("},k={default")]
country_tags = js_variable_to_python(country_js)
for k, v in country_tags.items():
if k == 'all':
engine_traits.all_locale = 'all'
continue
country_tag = v['value']
# add official languages of the country ..
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
lang_tag = lang_map.get(lang_tag, lang_tag)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
# print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != country_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
continue
engine_traits.regions[sxng_tag] = country_tag

118
searx/engines/bt4g.py Normal file
View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only
collects torrent metadata (such as file names and file sizes) and a magnet link
(torrent identifier).
This engine does not parse the HTML page because there is an API in XML (RSS).
The RSS feed provides fewer data like amount of seeders/leechers and the files
in the torrent file. It's a tradeoff for a "stable" engine as the XML from RSS
content will change way less than the HTML page.
.. _BT4G: https://bt4g.com/
Configuration
=============
The engine has the following additional settings:
- :py:obj:`bt4g_order_by`
- :py:obj:`bt4g_category`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific torrent searches. For example, an engine that searches only
for movies and sorts the result list by the number of seeders.
.. code:: yaml
- name: bt4g.movie
engine: bt4g
shortcut: bt4gv
categories: video
bt4g_order_by: seeders
bt4g_category: 'movie'
Implementations
===============
"""
from datetime import datetime
from urllib.parse import quote
from lxml import etree
# about
about = {
"website": 'https://bt4gprx.com',
"use_official_api": False,
"require_api_key": False,
"results": 'XML',
}
# engine dependent config
categories = ['files']
paging = True
time_range_support = True
# search-url
url = 'https://bt4gprx.com'
search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
bt4g_order_by = 'relevance'
"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders``
or ``time``.
.. hint::
When *time_range* is active, the results are always ordered by ``time``.
"""
bt4g_category = 'all'
"""BT$G offers categories: ``all`` (default), ``audio``, ``movie``, ``doc``,
``app`` and `` other``.
"""
def request(query, params):
order_by = bt4g_order_by
if params['time_range']:
order_by = 'time'
params['url'] = search_url.format(
search_term=quote(query),
order_by=order_by,
category=bt4g_category,
pageno=params['pageno'],
)
return params
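# Illustrative sketch (not part of the upstream engine): the URL an engine
# configured with bt4g_order_by 'seeders' would request for a made-up query.
def _example_bt4g_url():
    return search_url.format(
        search_term=quote('ubuntu iso'),  # -> ubuntu%20iso
        order_by='seeders',
        category='all',
        pageno=1,
    )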
def response(resp):
results = []
search_results = etree.XML(resp.content)
# return empty array if nothing is found
if len(search_results) == 0:
return []
for entry in search_results.xpath('./channel/item'):
title = entry.find("title").text
link = entry.find("guid").text
fullDescription = entry.find("description").text.split('<br>')
magnetlink = entry.find("link").text
pubDate = entry.find("pubDate").text
results.append(
{
'url': link,
'title': title,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'filesize': fullDescription[1],
'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'),
'template': 'torrent.html',
}
)
return results

85
searx/engines/btdigg.py Normal file
View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BTDigg (Videos, Music, Files)
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://btdig.com',
"wikidata_id": 'Q4836698',
"official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://btdig.com'
search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//div[@class="one_result"]')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res:
link = result.xpath('.//div[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
content = content.strip().replace('\n', ' | ')
content = ' '.join(content.split())
filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0]
files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]
# convert files to int if possible
try:
files = int(files)
except: # pylint: disable=bare-except
files = None
magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return results

59
searx/engines/ccc_media.py Normal file
View File

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""media.ccc.de"""
import datetime
from urllib.parse import urlencode
from dateutil import parser
about = {
'website': 'https://media.ccc.de',
'official_api_documentation': 'https://github.com/voc/voctoweb',
'use_official_api': True,
'require_api_key': False,
'results': 'JSON',
}
categories = ['videos']
paging = True
api_url = "https://api.media.ccc.de"
def request(query, params):
args = {'q': query, 'page': params['pageno']}
params['url'] = f"{api_url}/public/events/search?{urlencode(args)}"
return params
def response(resp):
results = []
for item in resp.json()['events']:
publishedDate = None
if item.get('date'):
publishedDate = parser.parse(item['date'])
iframe_src = None
for rec in item['recordings']:
if rec['mime_type'].startswith('video'):
if not iframe_src:
iframe_src = rec['recording_url']
elif rec['mime_type'] == 'video/mp4':
# prefer mp4 (minimal data rates)
iframe_src = rec['recording_url']
results.append(
{
'template': 'videos.html',
'url': item['frontend_link'],
'title': item['title'],
'content': item['description'],
'thumbnail': item['thumb_url'],
'publishedDate': publishedDate,
'length': datetime.timedelta(seconds=item['length']),
'iframe_src': iframe_src,
}
)
return results

68
searx/engines/chefkoch.py Normal file
View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Chefkoch is a German database of recipes.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.chefkoch.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = []
number_of_results = 20
skip_premium = True
base_url = "https://api.chefkoch.de"
thumbnail_format = "crop-240x300"
def request(query, params):
args = {'query': query, 'limit': number_of_results, 'offset': (params['pageno'] - 1) * number_of_results}
params['url'] = f"{base_url}/v2/search-gateway/recipes?{urlencode(args)}"
return params
def response(resp):
results = []
json = resp.json()
for result in json['results']:
recipe = result['recipe']
if skip_premium and (recipe['isPremium'] or recipe['isPlus']):
continue
publishedDate = None
if recipe['submissionDate']:
publishedDate = datetime.strptime(recipe['submissionDate'][:19], "%Y-%m-%dT%H:%M:%S")
content = [
f"Schwierigkeitsstufe (1-3): {recipe['difficulty']}",
f"Zubereitungszeit: {recipe['preparationTime']}min",
f"Anzahl der Zutaten: {recipe['ingredientCount']}",
]
if recipe['subtitle']:
content.insert(0, recipe['subtitle'])
results.append(
{
'url': recipe['siteUrl'],
'title': recipe['title'],
'content': " | ".join(content),
'thumbnail': recipe['previewImageUrlTemplate'].replace("<format>", thumbnail_format),
'publishedDate': publishedDate,
}
)
return results

223
searx/engines/chinaso.py Normal file
View File

@@ -0,0 +1,223 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ChinaSo_, a search engine for the chinese language area.
.. attention::
The ChinaSo engine does not return real URLs; the links from these search
engines violate the privacy of the users!
We are trying to find a solution for this problem; please follow `issue #4694`_.
As long as the problem has not been resolved, these engines are
not active in a standard setup (``inactive: true``).
.. _ChinaSo: https://www.chinaso.com/
.. _issue #4694: https://github.com/searxng/searxng/issues/4694
Configuration
=============
The engine has the following additional settings:
- :py:obj:`chinaso_category` (:py:obj:`ChinasoCategoryType`)
- :py:obj:`chinaso_news_source` (:py:obj:`ChinasoNewsSourceType`)
In the example below, all three ChinaSO engines are using the :ref:`network
<engine network>` from the ``chinaso news`` engine.
.. code:: yaml
- name: chinaso news
engine: chinaso
shortcut: chinaso
categories: [news]
chinaso_category: news
chinaso_news_source: all
- name: chinaso images
engine: chinaso
network: chinaso news
shortcut: chinasoi
categories: [images]
chinaso_category: images
- name: chinaso videos
engine: chinaso
network: chinaso news
shortcut: chinasov
categories: [videos]
chinaso_category: videos
Implementations
===============
"""
import typing
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
about = {
"website": "https://www.chinaso.com/",
"wikidata_id": "Q10846064",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"language": "zh",
}
paging = True
time_range_support = True
results_per_page = 10
categories = []
ChinasoCategoryType = typing.Literal['news', 'videos', 'images']
"""ChinaSo supports news, videos, images search.
- ``news``: search for news
- ``videos``: search for videos
- ``images``: search for images
In the category ``news`` you can additionally filter by option
:py:obj:`chinaso_news_source`.
"""
chinaso_category = 'news'
"""Configure ChinaSo category (:py:obj:`ChinasoCategoryType`)."""
ChinasoNewsSourceType = typing.Literal['CENTRAL', 'LOCAL', 'BUSINESS', 'EPAPER', 'all']
"""Filtering ChinaSo-News results by source:
- ``CENTRAL``: central publication
- ``LOCAL``: local publication
- ``BUSINESS``: business publication
- ``EPAPER``: E-Paper
- ``all``: all sources
"""
chinaso_news_source: ChinasoNewsSourceType = 'all'
"""Configure ChinaSo-News type (:py:obj:`ChinasoNewsSourceType`)."""
time_range_dict = {'day': '24h', 'week': '1w', 'month': '1m', 'year': '1y'}
base_url = "https://www.chinaso.com"
def init(_):
if chinaso_category not in ('news', 'videos', 'images'):
raise ValueError(f"Unsupported category: {chinaso_category}")
if chinaso_category == 'news' and chinaso_news_source not in typing.get_args(ChinasoNewsSourceType):
raise ValueError(f"Unsupported news source: {chinaso_news_source}")
def request(query, params):
query_params = {"q": query}
if time_range_dict.get(params['time_range']):
query_params["stime"] = time_range_dict[params['time_range']]
query_params["etime"] = 'now'
category_config = {
'news': {'endpoint': '/v5/general/v1/web/search', 'params': {'pn': params["pageno"], 'ps': results_per_page}},
'images': {
'endpoint': '/v5/general/v1/search/image',
'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
},
'videos': {
'endpoint': '/v5/general/v1/search/video',
'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
},
}
if chinaso_news_source != 'all':
if chinaso_news_source == 'EPAPER':
category_config['news']['params']["type"] = 'EPAPER'
else:
category_config['news']['params']["cate"] = chinaso_news_source
query_params.update(category_config[chinaso_category]['params'])
params["url"] = f"{base_url}{category_config[chinaso_category]['endpoint']}?{urlencode(query_params)}"
return params
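# Illustrative sketch (not part of the upstream engine): the endpoint selected
# for page 2 of an image search (results_per_page = 10); the query value is
# made up.
def _example_image_search_url():
    query_params = {'q': '北京', 'start_index': (2 - 1) * results_per_page, 'rn': results_per_page}
    return f"{base_url}/v5/general/v1/search/image?{urlencode(query_params)}"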
def response(resp):
try:
data = resp.json()
except Exception as e:
raise SearxEngineAPIException(f"Invalid response: {e}") from e
parsers = {'news': parse_news, 'images': parse_images, 'videos': parse_videos}
return parsers[chinaso_category](data)
def parse_news(data):
results = []
if not data.get("data", {}).get("data"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["data"]:
published_date = None
if entry.get("timestamp"):
try:
published_date = datetime.fromtimestamp(int(entry["timestamp"]))
except (ValueError, TypeError):
pass
results.append(
{
'title': html_to_text(entry["title"]),
'url': entry["url"],
'content': html_to_text(entry["snippet"]),
'publishedDate': published_date,
}
)
return results
def parse_images(data):
results = []
if not data.get("data", {}).get("arrRes"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["arrRes"]:
results.append(
{
'url': entry["web_url"],
'title': html_to_text(entry["title"]),
'content': html_to_text(entry["ImageInfo"]),
'template': 'images.html',
'img_src': entry["url"].replace("http://", "https://"),
'thumbnail_src': entry["largeimage"].replace("http://", "https://"),
}
)
return results
def parse_videos(data):
results = []
if not data.get("data", {}).get("arrRes"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["arrRes"]:
published_date = None
if entry.get("VideoPubDate"):
try:
published_date = datetime.fromtimestamp(int(entry["VideoPubDate"]))
except (ValueError, TypeError):
pass
results.append(
{
'url': entry["url"],
'title': html_to_text(entry["raw_title"]),
'template': 'videos.html',
'publishedDate': published_date,
'thumbnail': entry["image_src"].replace("http://", "https://"),
}
)
return results

68
searx/engines/cloudflareai.py Normal file
View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cloudflare AI engine"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://ai.cloudflare.com',
"wikidata_id": None,
"official_api_documentation": 'https://developers.cloudflare.com/workers-ai',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
cf_account_id = ''
cf_ai_api = ''
cf_ai_gateway = ''
cf_ai_model = ''
cf_ai_model_display_name = 'Cloudflare AI'
# Assistant messages hint to the AI about the desired output format. Not all models support this role.
cf_ai_model_assistant = 'Keep your answers as short and effective as possible.'
# System messages define the AI's personality. You can use them to set rules and how you expect the AI to behave.
cf_ai_model_system = 'You are a self-aware language model who is honest and direct about any question from the user.'
def request(query, params):
params['query'] = query
params['url'] = f'https://gateway.ai.cloudflare.com/v1/{cf_account_id}/{cf_ai_gateway}/workers-ai/{cf_ai_model}'
params['method'] = 'POST'
params['headers']['Authorization'] = f'Bearer {cf_ai_api}'
params['headers']['Content-Type'] = 'application/json'
params['data'] = dumps(
{
'messages': [
{'role': 'assistant', 'content': cf_ai_model_assistant},
{'role': 'system', 'content': cf_ai_model_system},
{'role': 'user', 'content': params['query']},
]
}
).encode('utf-8')
return params
def response(resp):
results = []
json = loads(resp.text)
if 'error' in json:
raise SearxEngineAPIException('Cloudflare AI error: ' + json['error'])
if 'result' in json:
results.append(
{
'content': json['result']['response'],
'infobox': cf_ai_model_display_name,
}
)
return results

243
searx/engines/command.py Normal file
View File

@@ -0,0 +1,243 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""With *command engines* administrators can run engines to integrate arbitrary
shell commands.
.. attention::
When creating and enabling a ``command`` engine on a public instance, you
must be careful to avoid leaking private data.
The easiest solution is to limit the access by setting ``tokens`` as described
in section :ref:`private engines`. The engine base is flexible. Only your
imagination can limit the power of this engine (and maybe security concerns).
Configuration
=============
The following options are available:
``command``:
A comma separated list of the elements of the command. A special token
``{{QUERY}}`` tells where to put the search terms of the user. Example:
.. code:: yaml
['ls', '-l', '-h', '{{QUERY}}']
``delimiter``:
A mapping containing a delimiter ``char`` and the *titles* of each element in
``keys``.
``parse_regex``:
A dict containing the regular expressions for each result key.
``query_type``:
The expected type of user search terms. Possible values: ``path`` and
``enum``.
``path``:
Checks if the user provided path is inside the working directory. If not,
the query is not executed.
``enum``:
Is a list of allowed search terms. If the user submits something which is
not included in the list, the query returns an error.
``query_enum``:
A list containing allowed search terms if ``query_type`` is set to ``enum``.
``working_dir``:
The directory where the command has to be executed. Default: ``./``.
``result_separator``:
The character that separates results. Default: ``\\n``.
Example
=======
The example engine below can be used to find files with a specific name in the
configured working directory:
.. code:: yaml
- name: find
engine: command
command: ['find', '.', '-name', '{{QUERY}}']
query_type: path
shortcut: fnd
delimiter:
chars: ' '
keys: ['line']
Implementations
===============
"""
import re
from os.path import expanduser, isabs, realpath, commonprefix
from shlex import split as shlex_split
from subprocess import Popen, PIPE
from threading import Thread
from searx import logger
from searx.result_types import EngineResults
engine_type = 'offline'
paging = True
command = []
delimiter = {}
parse_regex = {}
query_type = ''
query_enum = []
environment_variables = {}
working_dir = realpath('.')
result_separator = '\n'
timeout = 4.0
_command_logger = logger.getChild('command')
_compiled_parse_regex = {}
def init(engine_settings):
check_parsing_options(engine_settings)
if 'command' not in engine_settings:
raise ValueError('engine command : missing configuration key: command')
global command, working_dir, delimiter, parse_regex, environment_variables # pylint: disable=global-statement
command = engine_settings['command']
if 'working_dir' in engine_settings:
working_dir = engine_settings['working_dir']
if not isabs(engine_settings['working_dir']):
working_dir = realpath(working_dir)
if 'parse_regex' in engine_settings:
parse_regex = engine_settings['parse_regex']
for result_key, regex in parse_regex.items():
_compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)
if 'delimiter' in engine_settings:
delimiter = engine_settings['delimiter']
if 'environment_variables' in engine_settings:
environment_variables = engine_settings['environment_variables']
def search(query, params) -> EngineResults:
res = EngineResults()
cmd = _get_command_to_run(query)
if not cmd:
return res
reader_thread = Thread(target=_get_results_from_process, args=(res, cmd, params['pageno']))
reader_thread.start()
reader_thread.join(timeout=timeout)
return res
def _get_command_to_run(query):
params = shlex_split(query)
__check_query_params(params)
cmd = []
for c in command:
if c == '{{QUERY}}':
cmd.extend(params)
else:
cmd.append(c)
return cmd
def _get_results_from_process(res: EngineResults, cmd, pageno):
leftover = ''
count = 0
start, end = __get_results_limits(pageno)
with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
line = process.stdout.readline()
while line:
buf = leftover + line.decode('utf-8')
raw_results = buf.split(result_separator)
if raw_results[-1]:
leftover = raw_results[-1]
raw_results = raw_results[:-1]
for raw_result in raw_results:
result = __parse_single_result(raw_result)
if result is None:
_command_logger.debug('skipped result: %s', raw_result)
continue
if start <= count <= end:
res.add(res.types.KeyValue(kvmap=result))
count += 1
if end < count:
return res
line = process.stdout.readline()
return_code = process.wait(timeout=timeout)
if return_code != 0:
raise RuntimeError('non-zero return code when running command', cmd, return_code)
return None
def __get_results_limits(pageno):
start = (pageno - 1) * 10
end = start + 9
return start, end
def __check_query_params(params):
if not query_type:
return
if query_type == 'path':
query_path = params[-1]
query_path = expanduser(query_path)
if commonprefix([realpath(query_path), working_dir]) != working_dir:
raise ValueError('requested path is outside of configured working directory')
elif query_type == 'enum' and len(query_enum) > 0:
for param in params:
if param not in query_enum:
raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
def check_parsing_options(engine_settings):
"""Checks if delimiter based parsing or regex parsing is configured correctly"""
if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
raise ValueError('failed to init settings for parsing lines: too many settings')
if 'delimiter' in engine_settings:
if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
raise ValueError
def __parse_single_result(raw_result):
"""Parses command line output based on configuration"""
result = {}
if delimiter:
elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
if len(elements) != len(delimiter['keys']):
return {}
for i in range(len(elements)): # pylint: disable=consider-using-enumerate
result[delimiter['keys'][i]] = elements[i]
if parse_regex:
for result_key, regex in _compiled_parse_regex.items():
found = regex.search(raw_result)
if not found:
return {}
result[result_key] = raw_result[found.start() : found.end()]
return result
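# Illustrative sketch (not part of the upstream engine): the delimiter-based
# branch of __parse_single_result, worked through with a hypothetical
# three-key configuration.
def _example_delimiter_parsing():
    cfg = {'chars': ' ', 'keys': ['user', 'pid', 'command']}
    raw_result = 'alice 4242 /usr/bin/python3 -m http.server'
    elements = raw_result.split(cfg['chars'], maxsplit=len(cfg['keys']) - 1)
    assert dict(zip(cfg['keys'], elements)) == {
        'user': 'alice',
        'pid': '4242',
        'command': '/usr/bin/python3 -m http.server',
    }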

151
searx/engines/core.py Normal file
View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world's scholarly literature, collecting and indexing
research from repositories and journals.
.. _CORE: https://core.ac.uk/about
.. _core engine config:
Configuration
=============
The engine has the following additional settings:
- :py:obj:`api_key`
.. code:: yaml
- name: core.ac.uk
engine: core
categories: science
shortcut: cor
api_key: "..."
timeout: 5
Implementations
===============
"""
# pylint: disable=too-many-branches
from datetime import datetime
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://core.ac.uk',
"wikidata_id": 'Q22661180',
"official_api_documentation": 'https://api.core.ac.uk/docs/v3',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
api_key = 'unset'
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
base_url = 'https://api.core.ac.uk/v3/search/works/'
def request(query, params):
if api_key == 'unset':
raise SearxEngineAPIException('missing CORE API key')
# API v3 uses different parameters
search_params = {
'q': query,
'offset': (params['pageno'] - 1) * nb_per_page,
'limit': nb_per_page,
'sort': 'relevance',
}
params['url'] = base_url + '?' + urlencode(search_params)
params['headers'] = {'Authorization': f'Bearer {api_key}'}
return params
def response(resp):
results = []
json_data = resp.json()
for result in json_data.get('results', []):
# Get title
if not result.get('title'):
continue
# Get URL - try the DOI first, then fall back to other options
url = None
doi = result.get('doi')
if doi:
url = f'https://doi.org/{doi}'
elif result.get('id'):
url = 'https://core.ac.uk/works/' + str(result['id'])
elif result.get('downloadUrl'):
url = result['downloadUrl']
elif result.get('sourceFulltextUrls'):
url = result['sourceFulltextUrls']
else:
continue
# Published date
published_date = None
raw_date = result.get('publishedDate') or result.get('depositedDate')
if raw_date:
try:
published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
except (ValueError, AttributeError):
pass
# Handle journals
journals = []
if result.get('journals'):
journals = [j.get('title') for j in result['journals'] if j.get('title')]
# Handle publisher
publisher = (result.get('publisher') or '').strip("'")
# Handle authors
authors = set()
for i in result.get('authors', []):
name = i.get("name")
if name:
authors.add(name)
results.append(
{
'template': 'paper.html',
'title': result.get('title'),
'url': url,
'content': result.get('fullText', '') or '',
# 'comments': '',
'tags': result.get('fieldOfStudy', []),
'publishedDate': published_date,
'type': result.get('documentType', '') or '',
'authors': authors,
'editor': ', '.join(result.get('contributors', [])),
'publisher': publisher,
'journal': ', '.join(journals),
'doi': result.get('doi'),
# 'issn' : ''
# 'isbn' : ''
'pdf_url': result.get('downloadUrl', {}) or result.get("sourceFulltextUrls", {}),
}
)
return results
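As a sketch of what request() above produces, the URL for page 2 of a query (the query term is made up) would be built like this:

from urllib.parse import urlencode

nb_per_page = 10
pageno = 2
search_params = {
    'q': 'machine learning',
    'offset': (pageno - 1) * nb_per_page,  # skip the first 10 results
    'limit': nb_per_page,
    'sort': 'relevance',
}
print('https://api.core.ac.uk/v3/search/works/?' + urlencode(search_params))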


@@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cppreference
"""
from lxml import html
from searx.utils import eval_xpath
about = {
"website": "https://en.cppreference.com/",
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['it']
url = 'https://en.cppreference.com/'
search_url = url + 'mwiki/index.php?title=Special%3ASearch&search={query}'
def request(query, params):
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, '//div[contains(@class, "mw-search-result-heading")]'):
results.append(
{
'url': url + eval_xpath(result, './/a/@href')[0],
'title': eval_xpath(result, './/a/text()')[0],
}
)
return results

70
searx/engines/crates.py Normal file

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cargo search on crates.io"""
from collections import OrderedDict
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": "https://crates.io/",
"wikidata_id": None,
"official_api_documentation": "https://crates.io/data-access",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ["it", "packages", "cargo"]
# engine dependent config
paging = True
page_size = 10
search_url = "https://crates.io/api/v1/crates"
linked_terms = OrderedDict(
[
("homepage", "Project homepage"),
("documentation", "Documentation"),
("repository", "Source code"),
]
)
def request(query: str, params):
args = urlencode({"page": params["pageno"], "q": query, "per_page": page_size})
params["url"] = f"{search_url}?{args}"
return params
def response(resp):
results = []
for package in resp.json()["crates"]:
published_date = package.get("updated_at")
published_date = parser.parse(published_date)
links = {}
for k, v in linked_terms.items():
link = package.get(k)
if link:
links[v] = link
results.append(
{
"template": "packages.html",
"url": f'https://crates.io/crates/{package["name"]}',
"title": package["name"],
"package_name": package["name"],
"tags": package["keywords"],
"content": package["description"],
"version": package["newest_version"] or package["max_version"] or package["max_stable_version"],
"publishedDate": published_date,
"links": links,
}
)
return results
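A short sketch of how linked_terms maps crates.io API fields to labeled links in response() above (the crate data is made up):

from collections import OrderedDict

linked_terms = OrderedDict(
    [('homepage', 'Project homepage'), ('documentation', 'Documentation'), ('repository', 'Source code')]
)
package = {'homepage': 'https://example.org', 'repository': 'https://github.com/example/demo'}
links = {label: package[field] for field, label in linked_terms.items() if package.get(field)}
print(links)  # {'Project homepage': 'https://example.org', 'Source code': 'https://github.com/example/demo'}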

63
searx/engines/crossref.py Normal file

@@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CrossRef"""
from urllib.parse import urlencode
from datetime import datetime
about = {
"website": "https://www.crossref.org/",
"wikidata_id": "Q5188229",
"official_api_documentation": "https://api.crossref.org",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.crossref.org/works"
def request(query, params):
params["url"] = search_url + "?" + urlencode({"query": query, "offset": 20 * (params["pageno"] - 1)})
return params
def response(resp):
results = []
for record in resp.json()["message"]["items"]:
if record["type"] == "component":
# These seem to be files published along with papers. Not something you'd search for
continue
result = {
"template": "paper.html",
"content": record.get("abstract", ""),
"doi": record.get("DOI"),
"pages": record.get("page"),
"publisher": record.get("publisher"),
"tags": record.get("subject"),
"type": record.get("type"),
"url": record.get("URL"),
"volume": record.get("volume"),
}
if record["type"] == "book-chapter":
result["title"] = record["container-title"][0]
if record["title"][0].lower().strip() != result["title"].lower().strip():
result["title"] += f" ({record['title'][0]})"
else:
result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
result["journal"] = record.get("container-title", [None])[0] if "title" in record else None
if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
result["url"] = record["resource"]["primary"]["URL"]
if "published" in record and "date-parts" in record["published"]:
result["publishedDate"] = datetime(*(record["published"]["date-parts"][0] + [1, 1][:3]))
result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]
# All the links are not PDFs, even if the URL ends with ".pdf"
# result["pdf_url"] = record.get("link", [{"URL": None}])[0]["URL"]
results.append(result)
return results
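Crossref's "date-parts" arrays may contain only a year, or a year and a month. Padding with [1, 1] and slicing to three items fills in the missing month and day, as this small sketch shows:

from datetime import datetime

for date_parts in ([2021], [2021, 7], [2021, 7, 15]):
    padded = (date_parts + [1, 1])[:3]
    print(datetime(*padded).date())  # 2021-01-01, 2021-07-01, 2021-07-15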


@@ -0,0 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Currency convert (DuckDuckGo)
"""
import json
from searx.result_types import EngineResults
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
"require_api_key": False,
"results": 'JSONP',
"description": "Service from DuckDuckGo.",
}
engine_type = 'online_currency'
categories = []
base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
https_support = True
def request(_query, params):
params['url'] = base_url.format(params['from'], params['to'])
return params
def response(resp) -> EngineResults:
res = EngineResults()
# remove first and last lines to get only json
json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]
try:
conversion_rate = float(json.loads(json_resp)["to"][0]["mid"])
except IndexError:
return res
answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
resp.search_params['amount'],
resp.search_params['from'],
resp.search_params['amount'] * conversion_rate,
resp.search_params['to'],
conversion_rate,
resp.search_params['from_name'],
resp.search_params['to_name'],
)
url = f"https://duckduckgo.com/?q={resp.search_params['from']}+to+{resp.search_params['to']}"
res.add(res.types.Answer(answer=answer, url=url))
return res
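The spice endpoint wraps the JSON payload in a JSONP callback; here is a sketch of the slicing used in response() above, assuming the response ends with a trailing newline after ");" (the wrapper name and payload are made up):

import json

jsonp = 'ddg_spice_currency(\n{"to": [{"mid": 0.92}]}\n);\n'
body = jsonp[jsonp.find('\n') + 1 : jsonp.rfind('\n') - 2]
print(json.loads(body)['to'][0]['mid'])  # 0.92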


@@ -0,0 +1,251 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
"""
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import time
import babel
from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762
from searx.utils import html_to_text
from searx.exceptions import SearxEngineAPIException
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.dailymotion.com',
"wikidata_id": 'Q769222',
"official_api_documentation": 'https://www.dailymotion.com/developer',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
family_filter_map = {
2: 'true',
1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
"""`Fields selection`_, by default, a few fields are returned. To request more
specific fields, the ``fields`` parameter is used with the list of fields
SearXNG needs in the response to build a video result list.
"""
search_url = 'https://api.dailymotion.com/videos?'
"""URL to retrieve a list of videos.
- `REST GET`_
- `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
"""URL template to embed video in SearXNG's result list."""
def request(query, params):
if not query:
return False
eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore
eng_lang = traits.get_language(params['searxng_locale'], 'en')
args = {
'search': query,
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
}
args.update(safesearch_params.get(params['safesearch'], {}))
# Don't add localization and country arguments if the user selected only a
# language (:de, :en, ..) and no region
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(args)
params['url'] = search_url + query_str
return params
# get response from search-request
def response(resp):
results = []
search_res = resp.json()
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results
for res in search_res.get('list', []):
title = res['title']
url = res['url']
content = html_to_text(res['description'])
if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None)
length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://")
item = {
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'length': length,
'thumbnail': thumbnail,
}
# HINT: no matter what the value is, without an API token videos can't be
# shown embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch locales & languages from dailymotion.
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *language* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = get('https://api.dailymotion.com/locales')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = get('https://api.dailymotion.com/languages')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag
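A sketch of the time-range handling in request() above: a SearXNG time_range is translated into a Unix timestamp for the API's created_after filter:

from datetime import datetime, timedelta

time_delta_dict = {'day': timedelta(days=1), 'week': timedelta(days=7)}
created_after = datetime.now() - time_delta_dict['week']
print(int(datetime.timestamp(created_after)))  # Unix timestamp of one week ago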

52
searx/engines/deepl.py Normal file

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Deepl translation engine"""
from searx.result_types import EngineResults
about = {
"website": 'https://deepl.com',
"wikidata_id": 'Q43968444',
"official_api_documentation": 'https://www.deepl.com/docs-api',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
url = 'https://api-free.deepl.com/v2/translate'
api_key = None
def request(_query, params):
'''pre-request callback
params<dict>:
- ``method`` : POST/GET
- ``headers``: {}
- ``data``: {} # if method == POST
- ``url``: ''
- ``category``: 'search category'
- ``pageno``: 1 # number of the requested page
'''
params['url'] = url
params['method'] = 'POST'
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
return params
def response(resp) -> EngineResults:
res = EngineResults()
data = resp.json()
if not data.get('translations'):
return res
translations = [res.types.Translations.Item(text=t['text']) for t in data['translations']]
res.add(res.types.Translations(translations=translations))
return res

61
searx/engines/deezer.py Normal file

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Deezer (Music)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://deezer.com',
"wikidata_id": 'Q602243',
"official_api_documentation": 'https://developers.deezer.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.deezer.com/'
search_url = url + 'search?{query}&index={offset}'
iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}"
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 25
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
if result['type'] == 'track':
title = result['title']
url = result['link'] # pylint: disable=redefined-outer-name
if url.startswith('http://'):
url = 'https' + url[4:]
content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title'])
# append result
results.append(
{'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content}
)
# return results
return results


@@ -0,0 +1,86 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Within this module we implement a *demo offline engine*. Do not look to
close to the implementation, its just a simple example. To get in use of this
*demo* engine add the following entry to your engines list in ``settings.yml``:
.. code:: yaml
- name: my offline engine
engine: demo_offline
shortcut: demo
disabled: false
"""
import json
from searx.result_types import EngineResults
from searx.enginelib import EngineCache
engine_type = 'offline'
categories = ['general']
disabled = True
timeout = 2.0
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_offline_engine: str = ""
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
def init(engine_settings):
"""Initialization of the (offline) engine. The origin of this demo engine is a
simple json string which is loaded in this example while the engine is
initialized."""
global _my_offline_engine, CACHE # pylint: disable=global-statement
CACHE = EngineCache(engine_settings["name"]) # type:ignore
_my_offline_engine = (
'[ {"value": "%s"}'
', {"value":"first item"}'
', {"value":"second item"}'
', {"value":"third item"}'
']' % engine_settings.get('name')
)
def search(query, request_params) -> EngineResults:
"""Query (offline) engine and return results. Assemble the list of results
from your local engine. In this demo engine we ignore the 'query' term,
usual you would pass the 'query' term to your local engine to filter out the
results.
"""
res = EngineResults()
count = CACHE.get("count", 0)
for row in json.loads(_my_offline_engine):
count += 1
kvmap = {
'query': query,
'language': request_params['searxng_locale'],
'value': row.get("value"),
}
res.add(
res.types.KeyValue(
caption=f"Demo Offline Engine Result #{count}",
key_title="Name",
value_title="Value",
kvmap=kvmap,
)
)
res.add(res.types.LegacyResult(number_of_results=count))
# cache counter value for 20sec
CACHE.set("count", count, expire=20)
return res


@@ -0,0 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Within this module we implement a *demo online engine*. Do not look to
close to the implementation, its just a simple example which queries `The Art
Institute of Chicago <https://www.artic.edu>`_
To get in use of this *demo* engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: my online engine
engine: demo_online
shortcut: demo
disabled: false
"""
from json import loads
from urllib.parse import urlencode
from searx.result_types import EngineResults
engine_type = 'online'
send_accept_language_header = True
disabled = True
timeout = 2.0
categories = ['images']
paging = True
page_size = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_online_engine = None
def init(engine_settings):
"""Initialization of the (online) engine. If no initialization is needed, drop
this init function.
"""
global _my_online_engine # pylint: disable=global-statement
_my_online_engine = engine_settings.get('name')
def request(query, params):
"""Build up the ``params`` for the online request. In this example we build a
URL to fetch images from `artic.edu <https://artic.edu>`__
"""
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': page_size,
}
)
params['url'] = search_api + args
return params
def response(resp) -> EngineResults:
"""Parse out the result items from the response. In this example we parse the
response from `api.artic.edu <https://artic.edu>`__ and filter out all
images.
"""
res = EngineResults()
json_data = loads(resp.text)
res.add(
res.types.Answer(
answer="this is a dummy answer ..",
url="https://example.org",
)
)
for result in json_data['data']:
if not result['image_id']:
continue
res.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': "%(medium_display)s // %(dimensions)s" % result,
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'template': 'images.html',
}
)
return res

67
searx/engines/destatis.py Normal file

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""DeStatis
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.destatis.de',
'official_api_documentation': 'https://destatis.api.bund.dev/',
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
categories = []
paging = True
base_url = "https://www.destatis.de"
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"
# pylint: disable-next=line-too-long
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
url_xpath = './/a/@href'
title_xpath = './/a/text()'
date_xpath = './/a/span[contains(@class, "c-result__date")]'
content_xpath = './/div[contains(@class, "column")]/p/text()'
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'
def request(query, params):
args = {
'templateQueryString': query,
'gtp': f"474_list%3D{params['pageno']}",
}
params['url'] = f"{search_url}?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# filter out suggested results on further pages because they're the same on each page
extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
res_xpath = results_xpath.format(extra=extra_xpath)
for result in eval_xpath_list(dom, res_xpath):
doctype = extract_text(eval_xpath(result, doctype_xpath))
date = extract_text(eval_xpath(result, date_xpath))
metadata = [meta for meta in (doctype, date) if meta != ""]
results.append(
{
'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': extract_text(eval_xpath(result, content_xpath)),
'metadata': ', '.join(metadata),
}
)
return results
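A sketch of the XPath filtering above, with results_xpath shortened for readability: on pages after the first, the "recommended" results are excluded by formatting an extra predicate into the expression:

results_xpath = '//div[contains(@class, "c-result"){extra}]'
extra = " and not(contains(@class, 'c-result--recommended'))"
print(results_xpath.format(extra=extra))
# //div[contains(@class, "c-result") and not(contains(@class, 'c-result--recommended'))]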


@@ -0,0 +1,87 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Deviantart (Images)
"""
import urllib.parse
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
# about
about = {
"website": 'https://www.deviantart.com/',
"wikidata_id": 'Q46523',
"official_api_documentation": 'https://www.deviantart.com/developers/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
# search-url
base_url = 'https://www.deviantart.com'
results_xpath = '//div[@class="_2pZkk"]/div/div/a'
url_xpath = './@href'
thumbnail_src_xpath = './div/img/@src'
img_src_xpath = './div/img/@srcset'
title_xpath = './@aria-label'
premium_xpath = '../div/div/div/text()'
premium_keytext = 'Watch the artist to view this deviation'
cursor_xpath = '(//a[@class="_1OGeq"]/@href)[last()]'
def request(query, params):
# https://www.deviantart.com/search?q=foo
nextpage_url = params['engine_data'].get('nextpage')
# don't use nextpage when user selected to jump back to page 1
if params['pageno'] > 1 and nextpage_url is not None:
params['url'] = nextpage_url
else:
params['url'] = f"{base_url}/search?{urllib.parse.urlencode({'q': query})}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
# skip images that are blurred
_text = extract_text(eval_xpath(result, premium_xpath))
if _text and premium_keytext in _text:
continue
img_src = extract_text(eval_xpath(result, img_src_xpath))
if img_src:
img_src = img_src.split(' ')[0]
parsed_url = urllib.parse.urlparse(img_src)
img_src = parsed_url._replace(path=parsed_url.path.split('/v1')[0]).geturl()
results.append(
{
'template': 'images.html',
'url': extract_text(eval_xpath(result, url_xpath)),
'img_src': img_src,
'thumbnail_src': extract_text(eval_xpath(result, thumbnail_src_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
}
)
nextpage_url = extract_text(eval_xpath(dom, cursor_xpath))
if nextpage_url:
results.append(
{
'engine_data': nextpage_url.replace("http://", "https://"),
'key': 'nextpage',
}
)
return results
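A sketch of the thumbnail clean-up in response() above: take the first srcset candidate and drop DeviantArt's "/v1/..." resizing suffix from the URL path (the srcset value is made up):

import urllib.parse

img_src = 'https://images.example.net/f/abc/art.jpg/v1/fill/w_300/art.jpg 300w, https://images.example.net/f/abc/art.jpg 1280w'
img_src = img_src.split(' ')[0]
parsed_url = urllib.parse.urlparse(img_src)
print(parsed_url._replace(path=parsed_url.path.split('/v1')[0]).geturl())
# https://images.example.net/f/abc/art.jpg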

105
searx/engines/dictzone.py Normal file

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dictzone
"""
import urllib.parse
from lxml import html
from searx.utils import eval_xpath, extract_text
from searx.result_types import EngineResults
from searx.network import get as http_get # https://github.com/searxng/searxng/issues/762
# about
about = {
"website": 'https://dictzone.com/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
base_url = "https://dictzone.com"
weight = 100
https_support = True
def request(query, params): # pylint: disable=unused-argument
from_lang = params["from_lang"][2] # "english"
to_lang = params["to_lang"][2] # "german"
query = params["query"]
params["url"] = f"{base_url}/{from_lang}-{to_lang}-dictionary/{urllib.parse.quote_plus(query)}"
return params
def _clean_up_node(node):
for x in ["./i", "./span", "./button"]:
for n in node.xpath(x):
n.getparent().remove(n)
def response(resp) -> EngineResults:
results = EngineResults()
item_list = []
if not resp.ok:
return results
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, ".//table[@id='r']//tr"):
# each row is a Translations.Item
td_list = result.xpath("./td")
if len(td_list) != 2:
# ignore header columns "tr/th"
continue
col_from, col_to = td_list
_clean_up_node(col_from)
text = f"{extract_text(col_from)}"
synonyms = []
p_list = col_to.xpath(".//p")
for i, p_item in enumerate(p_list):
smpl: str = extract_text(p_item.xpath("./i[@class='smpl']"))  # type: ignore
_clean_up_node(p_item)
p_text: str = extract_text(p_item) # type: ignore
if smpl:
p_text += " // " + smpl
if i == 0:
text += f" : {p_text}"
continue
synonyms.append(p_text)
item = results.types.Translations.Item(text=text, synonyms=synonyms)
item_list.append(item)
# the "autotranslate" of dictzone is loaded by the JS from URL:
# https://dictzone.com/trans/hello%20world/en_de
from_lang = resp.search_params["from_lang"][1] # "en"
to_lang = resp.search_params["to_lang"][1] # "de"
query = resp.search_params["query"]
# works only sometimes?
autotranslate = http_get(f"{base_url}/trans/{query}/{from_lang}_{to_lang}", timeout=1.0)
if autotranslate.ok and autotranslate.text:
item_list.insert(0, results.types.Translations.Item(text=autotranslate.text))
if item_list:
results.add(results.types.Translations(translations=item_list, url=resp.search_params["url"]))
return results

64
searx/engines/digbt.py Normal file

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DigBT (Videos, Music, Files)
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://digbt.org',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['videos', 'music', 'files']
paging = True
URL = 'https://digbt.org'
SEARCH_URL = URL + '/search/{query}-time-{pageno}'
FILESIZE = 3
FILESIZE_MULTIPLIER = 4
def request(query, params):
params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
return params
def response(resp):
dom = html.fromstring(resp.text)
search_res = dom.xpath('.//td[@class="x-item"]')
if not search_res:
return []
results = []
for result in search_res:
url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
title = extract_text(result.xpath('.//a[@title]'))
content = extract_text(result.xpath('.//div[@class="files"]'))
files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
filesize = f"{files_data[FILESIZE]} {files_data[FILESIZE_MULTIPLIER]}"
magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'filesize': filesize,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'template': 'torrent.html',
}
)
return results

181
searx/engines/discourse.py Normal file

@@ -0,0 +1,181 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
- `builtwith.com Discourse <https://trends.builtwith.com/websitelist/Discourse>`_
Discourse is an open source Internet forum system. To search in a forum this
engine offers some additional settings:
- :py:obj:`base_url`
- :py:obj:`api_order`
- :py:obj:`search_endpoint`
- :py:obj:`show_avatar`
- :py:obj:`api_key`
- :py:obj:`api_username`
Example
=======
To search in your favorite Discourse forum, add a configuration like shown here
for the ``paddling.com`` forum:
.. code:: yaml
- name: paddling
engine: discourse
shortcut: paddle
base_url: 'https://forums.paddling.com/'
api_order: views
categories: ['social media', 'sports']
show_avatar: true
If the forum is private, you need to add an API key and username for the search:
.. code:: yaml
- name: paddling
engine: discourse
shortcut: paddle
base_url: 'https://forums.paddling.com/'
api_order: views
categories: ['social media', 'sports']
show_avatar: true
api_key: '<KEY>'
api_username: 'system'
Implementations
===============
"""
from urllib.parse import urlencode
from datetime import datetime, timedelta
import html
from dateutil import parser
from flask_babel import gettext
about = {
"website": "https://discourse.org/",
"wikidata_id": "Q15054354",
"official_api_documentation": "https://docs.discourse.org/",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
base_url: str = None # type: ignore
"""URL of the Discourse forum."""
search_endpoint = '/search.json'
"""URL path of the `search endpoint`_.
.. _search endpoint: https://docs.discourse.org/#tag/Search
"""
api_order = 'likes'
"""Order method, valid values are: ``latest``, ``likes``, ``views``, ``latest_topic``"""
show_avatar = False
"""Show avatar of the user who send the post."""
api_key = ''
"""API key of the Discourse forum."""
api_username = ''
"""API username of the Discourse forum."""
paging = True
time_range_support = True
AGO_TIMEDELTA = {
'day': timedelta(days=1),
'week': timedelta(days=7),
'month': timedelta(days=31),
'year': timedelta(days=365),
}
def request(query, params):
if len(query) <= 2:
return None
q = [query, f'order:{api_order}']
time_range = params.get('time_range')
if time_range:
after_date = datetime.now() - AGO_TIMEDELTA[time_range]
q.append('after:' + after_date.strftime('%Y-%m-%d'))
args = {
'q': ' '.join(q),
'page': params['pageno'],
}
params['url'] = f'{base_url}{search_endpoint}?{urlencode(args)}'
params['headers'] = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
}
if api_key != '':
params['headers']['Api-Key'] = api_key
if api_username != '':
params['headers']['Api-Username'] = api_username
return params
def response(resp):
results = []
json_data = resp.json()
if 'topics' not in json_data or 'posts' not in json_data:
return []
topics = {}
for item in json_data['topics']:
topics[item['id']] = item
for post in json_data['posts']:
result = topics.get(post['topic_id'], {})
url = f"{base_url}/p/{post['id']}"
status = gettext("closed") if result.get('closed', '') else gettext("open")
comments = result.get('posts_count', 0)
publishedDate = parser.parse(result['created_at'])
metadata = []
metadata.append('@' + post.get('username', ''))
if int(comments) > 1:
metadata.append(f'{gettext("comments")}: {comments}')
if result.get('has_accepted_answer'):
metadata.append(gettext("answered"))
elif int(comments) > 1:
metadata.append(status)
result = {
'url': url,
'title': html.unescape(result['title']),
'content': html.unescape(post.get('blurb', '')),
'metadata': ' | '.join(metadata),
'publishedDate': publishedDate,
'upstream': {'topics': result},
}
avatar = post.get('avatar_template', '').replace('{size}', '96')
if show_avatar and avatar:
result['thumbnail'] = base_url + avatar
results.append(result)
results.append({'number_of_results': len(json_data['topics'])})
return results
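A sketch of the search string request() above sends for a one-week time range (the query and the resulting date are illustrative):

from datetime import datetime, timedelta
from urllib.parse import urlencode

q = ['kayak', 'order:views']
after_date = datetime.now() - timedelta(days=7)
q.append('after:' + after_date.strftime('%Y-%m-%d'))
print(urlencode({'q': ' '.join(q), 'page': 1}))
# e.g. q=kayak+order%3Aviews+after%3A2025-06-19&page=1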


@@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Docker Hub (IT)
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": 'https://hub.docker.com',
"wikidata_id": 'Q100769064',
"official_api_documentation": 'https://docs.docker.com/registry/spec/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['it', 'packages'] # optional
paging = True
base_url = "https://hub.docker.com"
page_size = 10
def request(query, params):
args = {
"query": query,
"from": page_size * (params['pageno'] - 1),
"size": page_size,
}
params['url'] = f"{base_url}/api/search/v3/catalog/search?{urlencode(args)}"
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
json_resp = resp.json()
for item in json_resp.get("results", []):
image_source = item.get("source")
is_official = image_source in ["store", "official"]
popularity_infos = [f"{item.get('star_count', 0)} stars"]
architectures = []
for rate_plan in item.get("rate_plans", []):
pull_count = rate_plan.get("repositories", [{}])[0].get("pull_count")
if pull_count:
popularity_infos.insert(0, f"{pull_count} pulls")
architectures.extend(arch['name'] for arch in rate_plan.get("architectures", []) if arch['name'])
result = {
'template': 'packages.html',
'url': base_url + ("/_/" if is_official else "/r/") + item.get("slug", ""),
'title': item.get("name"),
'content': item.get("short_description"),
'thumbnail': item["logo_url"].get("large") or item["logo_url"].get("small"),
'package_name': item.get("name"),
'maintainer': item["publisher"].get("name"),
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
'popularity': ', '.join(popularity_infos),
'tags': architectures,
}
results.append(result)
return results

87
searx/engines/doku.py Normal file

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Doku Wiki
"""
from urllib.parse import urlencode
from urllib.parse import urljoin
from lxml.html import fromstring
from searx.utils import extract_text, eval_xpath
# about
about = {
"website": 'https://www.dokuwiki.org/',
"wikidata_id": 'Q851864',
"official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general'] # 'images', 'music', 'videos', 'files'
paging = False
number_of_results = 5
# search-url
# Doku is OpenSearch compatible
base_url = 'http://localhost:8090'
search_url = (
# fmt: off
'/?do=search'
'&{query}'
# fmt: on
)
# '&startRecord={offset}'
# '&maximumRecords={limit}'
# do search-request
def request(query, params):
params['url'] = base_url + search_url.format(query=urlencode({'id': query}))
return params
# get response from search-request
def response(resp):
results = []
doc = fromstring(resp.text)
# parse results
# Quickhits
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try:
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except: # pylint: disable=bare-except
continue
if not res_url:
continue
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result
results.append({'title': title, 'content': "", 'url': urljoin(base_url, res_url)})
# Search results
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try:
if r.tag == "dt":
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd":
content = extract_text(eval_xpath(r, '.'))
# append result
results.append({'title': title, 'content': content, 'url': urljoin(base_url, res_url)})
except: # pylint: disable=bare-except
continue
if not res_url:
continue
# return results
return results

496
searx/engines/duckduckgo.py Normal file

@@ -0,0 +1,496 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo WEB
~~~~~~~~~~~~~~
"""
from __future__ import annotations
import json
import re
import typing
from urllib.parse import quote_plus
import babel
import lxml.html
from searx import (
locales,
external_bang,
)
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
extr,
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.enginelib.traits import EngineTraits
from searx.enginelib import EngineCache
from searx.exceptions import SearxEngineCaptchaException
from searx.result_types import EngineResults
if typing.TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's preferred language from the HTTP
``Accept-Language``. Optionally, the user can select a region filter (but not a
language).
"""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
url = "https://html.duckduckgo.com/html/"
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
_CACHE: EngineCache = None # type: ignore
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
def get_cache():
global _CACHE # pylint: disable=global-statement
if _CACHE is None:
_CACHE = EngineCache("duckduckgo") # type:ignore
return _CACHE
def get_vqd(query: str, region: str, force_request: bool = False) -> str:
"""Returns the ``vqd`` that fits to the *query*.
:param query: The query term
:param region: DDG's region code
:param force_request: force a request to get a vqd value from DDG
TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
by all request to DDG:
- DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data)
- DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
- DuckDuckGo Images: ``https://duckduckgo.com/i.js?q=...&vqd=...``
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
- DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
(such as extremely long search terms that are often sent by bots), no ``vqd``
value can be determined.
If SearXNG cannot determine a ``vqd`` value, then no request should go out
to DDG.
.. attention::
A request with a wrong ``vqd`` value leads to DDG temporarily putting
SearXNG's IP on a block list.
Requests from IPs in this block list run into timeouts. Not sure, but it
seems the block list is a sliding window: to get my IP off the bot list
I had to cool it down for 1h (send no requests from that IP to DDG).
"""
cache = get_cache()
key = cache.secret_hash(f"{query}//{region}")
value = cache.get(key=key)
if value is not None and not force_request:
logger.debug("vqd: re-use cached value: %s", value)
return value
logger.debug("vqd: request value from from duckduckgo.com")
resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
if resp.status_code == 200: # type: ignore
value = extr(resp.text, 'vqd="', '"') # type: ignore
if value:
logger.debug("vqd value from duckduckgo.com request: '%s'", value)
else:
logger.error("vqd: can't parse value from ddg response (return empty string)")
return ""
else:
logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
if value:
cache.set(key=key, value=value)
else:
logger.error("vqd value from duckduckgo.com ", resp.status_code)
return value
def set_vqd(query: str, region: str, value: str):
cache = get_cache()
key = cache.secret_hash(f"{query}//{region}")
cache.set(key=key, value=value, expire=3600)
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale.
DuckDuckGo defines its languages by region codes (see
:py:obj:`fetch_traits`).
To get region and language of a DDG service use:
.. code: python
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
It might be confusing, but the ``l`` value of the cookie is what SearXNG calls
the *region*:
.. code:: python
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ and the *no Javascript*
page https://html.duckduckgo.com/html do not offer a language selection
to the user, only a region can be selected by the user (``eng_region``
from the example above). DDG-lite and *no Javascript* store the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
"""
return eng_traits.custom['lang_region'].get( # type: ignore
sxng_locale, eng_traits.get_language(sxng_locale, default)
)
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
# use fr --> rf_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def quote_ddg_bangs(query):
# quote ddg bangs
query_parts = []
for val in re.split(r'(\s+)', query):
if not val.strip():
continue
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
val = f"'{val}'"
query_parts.append(val)
return ' '.join(query_parts)
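# Example: re.split(r'(\s+)', '!w  hello   world') keeps the whitespace
# separators because of the capturing group; the whitespace-only parts are
# skipped above, so the query is rebuilt with single spaces, and '!w' is
# additionally wrapped in quotes if it matches a known external bang:
# "'!w' hello world"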
def request(query, params):
query = quote_ddg_bangs(query)
if len(query) >= 500:
# DDG does not accept queries with more than 499 chars
params["url"] = None
return
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
# Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
# (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
# and breaking changes in the future.
#
# The params['data'] dictionary can have the following key parameters, in this order:
# - q (str): Search query string
# - b (str): Beginning parameter - empty string for first page requests
# - s (int): Search offset for pagination
# - nextParams (str): Continuation parameters from previous page response, typically empty
# - v (str): Typically 'l' for subsequent pages
# - o (str): Output format, typically 'json'
# - dc (int): Display count - value equal to offset (s) + 1
# - api (str): API endpoint identifier, typically 'd.js'
# - vqd (str): Validation query digest
# - kl (str): Keyboard language/region code (e.g., 'en-us')
# - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
params['data']['q'] = query
if params['pageno'] == 1:
params['data']['b'] = ""
elif params['pageno'] >= 2:
offset = 10 + (params['pageno'] - 2) * 15 # Page 2 = 10, Page 3+ = 10 + n*15
params['data']['s'] = offset
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
params['data']['o'] = form_data.get('o', 'json')
params['data']['dc'] = offset + 1
params['data']['api'] = form_data.get('api', 'd.js')
# vqd is required to request other pages after the first one
vqd = get_vqd(query, eng_region, force_request=False)
if vqd:
params['data']['vqd'] = vqd
else:
# Don't try to call follow up pages without a vqd value.
# DDG recognizes this as a request from a bot. This lowers the
# reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
params["url"] = None
return
if params['searxng_locale'].startswith("zh"):
# Some locales (at least China) do not have a "next page" button and DDG
# will return a HTTP/2 403 Forbidden for a request of such a page.
params["url"] = None
return
# Put empty kl in form data if language/region set to all
if eng_region == "wt-wt":
params['data']['kl'] = ""
else:
params['data']['kl'] = eng_region
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
params['cookies']['kl'] = eng_region
params['url'] = url
params['method'] = 'POST'
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['headers']['Referer'] = url
params['headers']['Sec-Fetch-Dest'] = "document"
params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
params['headers']['Sec-Fetch-Site'] = "same-origin"
params['headers']['Sec-Fetch-User'] = "?1"
logger.debug("param headers: %s", params['headers'])
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])
def is_ddg_captcha(dom):
"""In case of CAPTCHA ddg response its own *not a Robot* dialog and is not
redirected to a CAPTCHA page."""
return bool(eval_xpath(dom, "//form[@id='challenge-form']"))
def response(resp) -> EngineResults:
results = EngineResults()
if resp.status_code == 303:
return results
doc = lxml.html.fromstring(resp.text)
if is_ddg_captcha(doc):
# set suspend time to zero is OK --> ddg does not block the IP
raise SearxEngineCaptchaException(suspended_time=0, message=f"CAPTCHA ({resp.search_params['data'].get('kl')})")
form = eval_xpath(doc, '//input[@name="vqd"]/..')
if len(form):
# some locales (at least China) do not have a "next page" button
form = form[0]
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
set_vqd(
query=resp.search_params['data']['q'],
region=resp.search_params['data']['kl'],
value=str(form_vqd),
)
# just select "web-result" and ignore results of class "result--ad result--ad--small"
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
item = {}
title = eval_xpath(div_result, './/h2/a')
if not title:
# this is the "No results." item in the result list
continue
item["title"] = extract_text(title)
item["url"] = eval_xpath(div_result, './/h2/a/@href')[0]
item["content"] = extract_text(
eval_xpath_getindex(div_result, './/a[contains(@class, "result__snippet")]', 0, [])
)
results.append(item)
zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() # type: ignore
if zero_click and (
"Your IP address is" not in zero_click
and "Your user agent:" not in zero_click
and "URL Decoded:" not in zero_click
):
results.add(
results.types.Answer(
answer=zero_click,
url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0), # type: ignore
)
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add a
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
Besides regions, DuckDuckGo also defines its languages by region codes. For
example, these are the English languages in DuckDuckGo:
- en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel
from searx.utils import js_variable_to_python
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u661.js to u.7669f071a13a7daa57cb / should be updated automatically?
resp = get('https://duckduckgo.com/dist/util/u.7669f071a13a7daa57cb.js')
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
js_code = extr(resp.text, 'regions:', ',snippetLengths') # type: ignore
regions = json.loads(js_code)
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
js_code = extr(resp.text, 'languages:', ',regions') # type: ignore
languages = js_variable_to_python(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang


@@ -0,0 +1,264 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
reverse engineering we can see that some services (e.g. instant answers) are
still in use by the DDG search engine.
As far as we can say, the *instant answers* API does not support languages, or at
least we could not find out how language support should work. It seems that
most of the features are based on English terms.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.result_types import EngineResults
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken, the "Related website" may contains some HTML.
The best solution seems to ignore these results.
"""
return text.startswith('http') and ' ' in text
def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
result = None
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
result = extract_text(a[0])
else:
result = text
if not is_broken_text(result):
return result
return None
def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
return params
def response(resp) -> EngineResults:
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = EngineResults()
search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
# * actor / musician / artist
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * company
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer:
answer_type = search_res.get('AnswerType')
logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
results.add(
results.types.Answer(
answer=html_to_text(answer),
url=search_res.get('AbstractURL', ''),
)
)
# add infobox
if 'Definition' in search_res:
content = content + search_res.get('Definition', '')
if 'Abstract' in search_res:
content = content + search_res.get('Abstract', '')
# image
image = search_res.get('Image')
image = None if image == '' else image
if image is not None and urlparse(image).netloc == '':
image = urljoin('https://duckduckgo.com', image)
# urls
# Official website, Wikipedia page
for ddg_result in search_res.get('Results', []):
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if not is_broken_text(text):
suggestion = result_to_text(text, ddg_result.get('Result'))
if suggestion != heading and suggestion is not None:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
if suggestion != heading and suggestion is not None:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
results.append({'url': abstractURL, 'title': heading})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
# to merge with wikidata's infobox
if infobox_id:
infobox_id = replace_http_by_https(infobox_id)
# attributes
# some will be converted to urls
if 'Infobox' in search_res:
infobox = search_res.get('Infobox')
if 'content' in infobox:
osm_zoom = 17
coordinates = None
for info in infobox.get('content'):
data_type = info.get('data_type')
data_label = info.get('label')
data_value = info.get('value')
# Workaround: ddg may return a pair of double quotes as an empty value
if data_value == '""':
continue
# Is it an external URL?
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
# * netflix_id
external_url = get_external_url(data_type, data_value)
if external_url is not None:
urls.append({'title': data_label, 'url': external_url})
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
# ignore wiki_maps_trigger: reference to a javascript
# ignore google_play_artist_id: service shutdown
pass
elif data_type == 'string' and data_label == 'Website':
# There is already a URL for the website
pass
elif data_type == 'area':
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
elif data_type == 'coordinates':
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
# coordinate on Earth
# get the zoom information from the area
coordinates = info
else:
# coordinate NOT on Earth
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
elif data_type == 'string':
attributes.append({'label': data_label, 'value': data_value})
if coordinates:
data_label = coordinates.get('label')
data_value = coordinates.get('value')
latitude = data_value.get('latitude')
longitude = data_value.get('longitude')
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
else:
results.append(
{
'infobox': heading,
'id': infobox_id,
'content': content,
'img_src': image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics,
}
)
return results
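# Hedged sketch, not an actual API response: the subset of the DuckDuckGo
# Instant Answer payload that response() reads, shown with hypothetical values.
#
#   {
#       'Heading': 'Mount Everest',
#       'Answer': '', 'AnswerType': '',
#       'Abstract': "Earth's highest mountain ...",
#       'AbstractURL': 'https://en.wikipedia.org/wiki/Mount_Everest',
#       'AbstractSource': 'Wikipedia',
#       'Image': '/i/abc123.jpg',   # relative, resolved against duckduckgo.com
#       'Results': [{'FirstURL': 'https://example.org/', 'Text': 'Official site'}],
#       'RelatedTopics': [],
#       'Infobox': {'content': [
#           {'data_type': 'string', 'label': 'Elevation', 'value': '8849 m'},
#       ]},
#   }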
def unit_to_str(unit):
for prefix in WIKIDATA_PREFIX:
if unit.startswith(prefix):
wikidata_entity = unit[len(prefix) :]
real_unit = WIKIDATA_UNITS.get(wikidata_entity)
if real_unit is None:
return unit
return real_unit['symbol']
return unit
def area_to_str(area):
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit'))
if unit is not None:
try:
amount = float(area.get('amount'))
return '{} {}'.format(amount, unit)
except ValueError:
pass
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
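# Illustrative usage, assuming WIKIDATA_PREFIX contains the entity URL prefix
# and WIKIDATA_UNITS maps Q712226 (square kilometre) to a symbol such as 'km²':
#   unit_to_str('https://www.wikidata.org/entity/Q712226')             # -> 'km²'
#   area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226',
#                'amount': '+20.99'})                                  # -> '20.99 km²'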

149
searx/engines/duckduckgo_extra.py Normal file

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Extra (images, videos, news)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.utils import get_embeded_stream_url, html_to_text
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
}
# engine dependent config
categories = ['images', 'web']
ddg_category = 'images'
"""The category must be any of ``images``, ``videos`` and ``news``
"""
paging = True
safesearch = True
send_accept_language_header = True
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
def request(query, params):
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
# request needs a vqd argument
vqd = get_vqd(query, eng_region, force_request=True)
if not vqd:
# some search terms do not have results and therefore no vqd value
params['url'] = None
return params
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'f': ',,,,,',
'vqd': vqd,
}
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
logger.debug("cookies: %s", params['cookies'])
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
# sending these two headers prevents rate limiting for the query
params['headers'] = {
'Referer': 'https://duckduckgo.com/',
'X-Requested-With': 'XMLHttpRequest',
}
return params
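# Illustrative final request for an image search, page 2 (the vqd value is
# hypothetical; urlencode preserves the dict insertion order used above):
#   params['url'] == ('https://duckduckgo.com/i.js?'
#                     'q=test&o=json&l=us-en&f=%2C%2C%2C%2C%2C&vqd=4-1234&s=100')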
def _image_result(result):
return {
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': '',
'thumbnail_src': result['thumbnail'],
'img_src': result['image'],
'resolution': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
}
def _video_result(result):
return {
'template': 'videos.html',
'url': result['content'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['images'].get('small') or result['images'].get('medium'),
'iframe_src': get_embeded_stream_url(result['content']),
'source': result['provider'],
'length': result['duration'],
'metadata': result.get('uploader'),
}
def _news_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['excerpt']),
'source': result['source'],
'publishedDate': datetime.fromtimestamp(result['date']),
}
def response(resp):
results = []
res_json = resp.json()
for result in res_json['results']:
if ddg_category == 'images':
results.append(_image_result(result))
elif ddg_category == 'videos':
results.append(_video_result(result))
elif ddg_category == 'news':
results.append(_news_result(result))
else:
raise ValueError(f"Invalid duckduckgo category: {ddg_category}")
return results
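# Design note (a sketch, not part of the engine): the if/elif chain above can
# also be written as a lookup table, which keeps the unknown-category failure
# in one place:
#   _builders = {'images': _image_result, 'videos': _video_result, 'news': _news_result}
#   results = [_builders[ddg_category](r) for r in res_json['results']]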

158
searx/engines/duckduckgo_weather.py Normal file

@@ -0,0 +1,158 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads
from urllib.parse import quote
from dateutil import parser as date_parser
from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
send_accept_language_header = True
# engine dependent config
categories = ["weather"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
res = ""
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['conditionCode']}</b></td></tr>"
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{condition['temperature']}°C / {c_to_f(condition['temperature'])}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{condition['temperatureApparent']}°C / "
f"{c_to_f(condition['temperatureApparent'])}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['windDirection']}° — "
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
)
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} m</td>"
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
return res
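# Illustrative input (hypothetical values, matching the keys read above):
#   generate_condition_table({
#       'conditionCode': 'Clear', 'temperature': 20, 'temperatureApparent': 19,
#       'windDirection': 270, 'windSpeed': 10, 'visibility': 16000, 'humidity': 0.45,
#   })
# renders rows such as '<td><b>20°C / 68.00°F</b></td>' and '<td>45.0%</td>'.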
def generate_day_table(day):
res = ""
res += (
f"<tr><td>{gettext('Min temp.')}</td><td>{day['temperatureMin']}°C / "
f"{c_to_f(day['temperatureMin'])}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Max temp.')}</td><td>{day['temperatureMax']}°C / "
f"{c_to_f(day['temperatureMax'])}°F</td></tr>"
)
res += f"<tr><td>{gettext('UV index')}</td><td>{day['maxUvIndex']}</td></tr>"
res += f"<tr><td>{gettext('Sunrise')}</td><td>{date_parser.parse(day['sunrise']).strftime('%H:%M')}</td></tr>"
res += f"<tr><td>{gettext('Sunset')}</td><td>{date_parser.parse(day['sunset']).strftime('%H:%M')}</td></tr>"
return res
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params
def c_to_f(temperature):
return "%.2f" % ((temperature * 1.8) + 32)
def response(resp):
results = []
if resp.text.strip() == "ddg_spice_forecast();":
return []
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
current = result["currentWeather"]
title = result['location']
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current)
infobox += "</tbody></table>"
last_date = None
for time in result['forecastHourly']['hours']:
current_time = date_parser.parse(time['forecastStart'])
if last_date != current_time.date():
if last_date is not None:
infobox += "</tbody></table>"
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
infobox += "<table><tbody>"
for day in result['forecastDaily']['days']:
if date_parser.parse(day['forecastStart']).date() == current_time.date():
infobox += generate_day_table(day)
infobox += "</tbody></table><table><tbody>"
last_date = current_time.date()
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
infobox += generate_condition_table(time)
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results
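# Illustrative unwrapping of the JSONP-style "spice" payload (hypothetical
# body shape, assuming it ends with ');' plus a trailing newline):
#   resp.text == 'ddg_spice_forecast(\n{"currentWeather": {...}});\n'
# find('\n') + 1 skips the opening call line and rfind('\n') - 2 trims the
# closing ');', leaving bare JSON for loads().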

71
searx/engines/duden.py Normal file

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Duden
"""
import re
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror
# about
about = {
"website": 'https://www.duden.de',
"wikidata_id": 'Q73624591',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'de',
}
categories = ['dictionaries']
paging = True
# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
def request(query, params):
offset = params['pageno'] - 1
if offset == 0:
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
params['url'] = search_url_fmt.format(query=quote(query))
else:
params['url'] = search_url.format(offset=offset, query=quote(query))
# after the last page of results, spelling corrections are returned after an HTTP redirect,
# regardless of the page number
params['soft_max_redirects'] = 1
params['raise_for_httperror'] = False
return params
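# Illustrative URLs built by request() for the query 'Haus':
#   pageno 1 -> 'https://www.duden.de/suchen/dudenonline/Haus'
#   pageno 3 -> 'https://www.duden.de/suchen/dudenonline/Haus?search_api_fulltext=&page=2'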
def response(resp):
results = []
if resp.status_code == 404:
return results
raise_for_httperror(resp)
dom = html.fromstring(resp.text)
number_of_results_element = eval_xpath_getindex(
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
)
if number_of_results_element is not None:
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
results.append({'number_of_results': int(number_of_results_string)})
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
url = urljoin(base_url, url)
title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url, 'title': title, 'content': content})
return results

23
searx/engines/dummy-offline.py Normal file

@@ -0,0 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Dummy Offline
"""
# about
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
def search(query, request_params): # pylint: disable=unused-argument
return [
{
'result': 'this is what you get',
}
]
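# Note: offline engines implement search(query, request_params) directly and
# return result dicts; there is no HTTP round trip, hence no request()/response()
# pair as in the network engines above.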

24
searx/engines/dummy.py Normal file

@@ -0,0 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Dummy
"""
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'empty array',
}
# build the search request
def request(query, params): # pylint: disable=unused-argument
return params
# parse the response of the search request
def response(resp): # pylint: disable=unused-argument
return []

Some files were not shown because too many files have changed in this diff.