first commit

Iyas Altawil
2025-06-26 15:38:10 +03:30
commit e928faf6d2
899 changed files with 403713 additions and 0 deletions

searx/engines/1337x.py (new file, 56 lines)

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""1337x
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://1337x.to/',
"wikidata_id": 'Q28134166',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
categories = ['files']
paging = True
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
filesize = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
results.append(
{
'url': href,
'title': title,
'seed': seed,
'leech': leech,
'filesize': filesize,
'template': 'torrent.html',
}
)
return results
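A minimal standalone sketch of the URL that request() above builds, using the same quote() call and search_url template as the code; the example query is made up:

from urllib.parse import quote

url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'

def build_url(query, pageno=1):
    # mirrors request(): the query is percent-encoded and inserted into the URL path
    return search_url.format(search_term=quote(query), pageno=pageno)

# build_url('ubuntu iso', 2) -> 'https://1337x.to/search/ubuntu%20iso/2/'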


@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search search engine for searxng"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# Metadata
about = {
"website": "https://www.so.com/",
"wikidata_id": "Q10846064",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
"language": "zh",
}
# Engine Configuration
categories = ["general"]
paging = True
time_range_support = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# Base URL
base_url = "https://www.so.com"
def request(query, params):
query_params = {
"pn": params["pageno"],
"q": query,
}
if time_range_dict.get(params['time_range']):
query_params["adv_t"] = time_range_dict.get(params['time_range'])
params["url"] = f"{base_url}/s?{urlencode(query_params)}"
return params
def response(resp):
dom = html.fromstring(resp.text)
results = []
for item in dom.xpath('//li[contains(@class, "res-list")]'):
title = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a'))
url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@data-mdurl'))
if not url:
url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@href'))
content = extract_text(item.xpath('.//p[@class="res-desc"]'))
if not content:
content = extract_text(item.xpath('.//span[@class="res-list-summary"]'))
if title and url:
results.append(
{
"title": title,
"url": url,
"content": content,
}
)
return results


@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search-Videos: A search engine for retrieving videos from 360Search."""
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text, get_embeded_stream_url
about = {
"website": "https://tv.360kan.com/",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
paging = True
results_per_page = 10
categories = ["videos"]
base_url = "https://tv.360kan.com"
def request(query, params):
query_params = {"count": 10, "q": query, "start": params["pageno"] * 10}
params["url"] = f"{base_url}/v1/video/list?{urlencode(query_params)}"
return params
def response(resp):
try:
data = resp.json()
except Exception as e:
raise SearxEngineAPIException(f"Invalid response: {e}") from e
results = []
if "data" not in data or "result" not in data["data"]:
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["result"]:
if not entry.get("title") or not entry.get("play_url"):
continue
published_date = None
if entry.get("publish_time"):
try:
published_date = datetime.fromtimestamp(int(entry["publish_time"]))
except (ValueError, TypeError):
published_date = None
results.append(
{
'url': entry["play_url"],
'title': html_to_text(entry["title"]),
'content': html_to_text(entry["description"]),
'template': 'videos.html',
'publishedDate': published_date,
'thumbnail': entry["cover_img"],
"iframe_src": get_embeded_stream_url(entry["play_url"]),
}
)
return results

searx/engines/9gag.py (new file, 76 lines)

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""9GAG (social media)"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
about = {
"website": 'https://9gag.com/',
"wikidata_id": 'Q277421',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
paging = True
search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10
def request(query, params):
query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
json_results = loads(resp.text)['data']
for result in json_results['posts']:
result_type = result['type']
# Use the uncropped image700 version as thumbnail unless the image is too tall (height > 400),
# otherwise fall back to the cropped Facebook thumbnail
if result['images']['image700']['height'] > 400:
thumbnail = result['images']['imageFbThumbnail']['url']
else:
thumbnail = result['images']['image700']['url']
if result_type == 'Photo':
results.append(
{
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.fromtimestamp(result['creationTs']),
'img_src': result['images']['image700']['url'],
'thumbnail_src': thumbnail,
}
)
elif result_type == 'Animated':
results.append(
{
'template': 'videos.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.fromtimestamp(result['creationTs']),
'thumbnail': thumbnail,
'iframe_src': result['images'].get('image460sv', {}).get('url'),
}
)
if 'tags' in json_results:
for suggestion in json_results['tags']:
results.append({'suggestion': suggestion['key']})
return results
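A short sketch of how the pagination in request() maps page numbers onto the zero-based 'c' cursor parameter; the example query is made up:

from urllib.parse import urlencode

page_size = 10
search_url = "https://9gag.com/v1/search-posts?{query}"

def build_url(query, pageno):
    # mirrors request(): 'c' is a zero-based result cursor, not a page number
    return search_url.format(query=urlencode({'query': query, 'c': (pageno - 1) * page_size}))

# build_url('cats', 3) -> 'https://9gag.com/v1/search-posts?query=cats&c=20'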

searx/engines/__init__.py (new file, 253 lines)

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Load and initialize the ``engines``, see :py:func:`load_engines` and register
:py:obj:`engine_shortcuts`.
usage::
load_engines( settings['engines'] )
"""
from __future__ import annotations
import sys
import copy
from os.path import realpath, dirname
from typing import TYPE_CHECKING, Dict
import types
import inspect
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DEFAULT_ARGS = {
# Common options in the engine module
"engine_type": "online",
"paging": False,
"time_range_support": False,
"safesearch": False,
# settings.yml
"categories": ["general"],
"enable_http": False,
"shortcut": "-",
"timeout": settings["outgoing"]["request_timeout"],
"display_error_messages": True,
"disabled": False,
"inactive": False,
"about": {},
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
"max_page": 0,
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []}
engines: Dict[str, Engine | types.ModuleType] = {}
engine_shortcuts = {}
"""Simple map of registered *shortcuts* to name of the engine (or ``None``).
::
engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
"""
def check_engine_module(module: types.ModuleType):
# probe unintentional name collisions / for example name collisions caused
# by import statements in the engine module ..
# network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
obj = getattr(module, 'network', None)
if obj and inspect.ismodule(obj):
msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string'
# logger.error(msg)
raise TypeError(msg)
def load_engine(engine_data: dict) -> Engine | types.ModuleType | None:
"""Load engine from ``engine_data``.
:param dict engine_data: Attributes from YAML ``settings:engines/<engine>``
:return: initialized namespace of the ``<engine>``.
1. create a namespace and load module of the ``<engine>``
2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
3. update namespace with values from ``engine_data``
If engine *is active*, return namespace of the engine, otherwise return
``None``.
This function also returns ``None`` if initialization of the namespace fails
for one of the following reasons:
- engine name contains underscore
- engine name is not lowercase
- required attribute is not set :py:func:`is_missing_required_attributes`
"""
# pylint: disable=too-many-return-statements
engine_name = engine_data.get('name')
if engine_name is None:
logger.error('An engine does not have a "name" field')
return None
if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None
if engine_name.lower() != engine_name:
logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
engine_name = engine_name.lower()
engine_data['name'] = engine_name
# load_module
module_name = engine_data.get('engine')
if module_name is None:
logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
return None
try:
engine = load_module(module_name + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
logger.exception('Fatal exception in engine "{}"'.format(module_name))
sys.exit(1)
except BaseException:
logger.exception('Cannot load engine "{}"'.format(module_name))
return None
check_engine_module(engine)
update_engine_attributes(engine, engine_data)
update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine):
return None
if is_missing_required_attributes(engine):
return None
set_loggers(engine, engine_name)
if not any(cat in settings['categories_as_tabs'] for cat in engine.categories):
engine.categories.append(DEFAULT_CATEGORY)
return engine
def set_loggers(engine, engine_name):
# set the logger for engine
engine.logger = logger.getChild(engine_name)
# the engine may have loaded some other engines
# make sure their logger is initialized
# use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
# see https://github.com/python/cpython/issues/89516
# and https://docs.python.org/3.10/library/sys.html#sys.modules
modules = sys.modules.copy()
for module_name, module in modules.items():
if (
module_name.startswith("searx.engines")
and module_name != "searx.engines.__init__"
and not hasattr(module, "logger")
):
module_engine_name = module_name.split(".")[-1]
module.logger = logger.getChild(module_engine_name) # type: ignore
def update_engine_attributes(engine: Engine | types.ModuleType, engine_data):
# set engine attributes from engine_data
for param_name, param_value in engine_data.items():
if param_name == 'categories':
if isinstance(param_value, str):
param_value = list(map(str.strip, param_value.split(',')))
engine.categories = param_value # type: ignore
elif hasattr(engine, 'about') and param_name == 'about':
engine.about = {**engine.about, **engine_data['about']} # type: ignore
else:
setattr(engine, param_name, param_value)
# set default attributes
for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items():
if not hasattr(engine, arg_name):
setattr(engine, arg_name, copy.deepcopy(arg_value))
def update_attributes_for_tor(engine: Engine | types.ModuleType):
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore
engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore
def is_missing_required_attributes(engine):
"""An attribute is required when its name doesn't start with ``_`` (underline).
Required attributes must not be ``None``.
"""
missing = False
for engine_attr in dir(engine):
if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
missing = True
return missing
def using_tor_proxy(engine: Engine | types.ModuleType):
"""Return True if the engine configuration declares to use Tor."""
return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False)
def is_engine_active(engine: Engine | types.ModuleType):
# check if engine is inactive
if engine.inactive is True:
return False
# exclude onion engines if not using tor
if 'onions' in engine.categories and not using_tor_proxy(engine):
return False
return True
def register_engine(engine: Engine | types.ModuleType):
if engine.name in engines:
logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
sys.exit(1)
engines[engine.name] = engine
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
def load_engines(engine_list):
"""usage: ``engine_list = settings['engines']``"""
engines.clear()
engine_shortcuts.clear()
categories.clear()
categories['general'] = []
for engine_data in engine_list:
engine = load_engine(engine_data)
if engine:
register_engine(engine)
return engines
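A minimal usage sketch of load_engines(), assuming a working SearXNG environment where settings.yml has already been loaded; the two engine entries below are hypothetical examples, in production the list comes from settings['engines']:

from searx.engines import load_engines, engines, engine_shortcuts

engine_list = [
    {'name': 'arxiv', 'engine': 'arxiv', 'shortcut': 'arx'},
    {'name': 'ask', 'engine': 'ask', 'shortcut': 'ask'},
]
load_engines(engine_list)

print(sorted(engines))          # ['arxiv', 'ask']
print(engine_shortcuts['arx'])  # 'arxiv'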

searx/engines/acfun.py (new file, 109 lines)

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""
from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html
from searx.utils import extract_text
# Metadata
about = {
"website": "https://www.acfun.cn/",
"wikidata_id": "Q3077675",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
"language": "zh",
}
# Engine Configuration
categories = ["videos"]
paging = True
# Base URL
base_url = "https://www.acfun.cn"
def request(query, params):
query_params = {"keyword": query, "pCursor": params["pageno"]}
params["url"] = f"{base_url}/search?{urlencode(query_params)}"
return params
def response(resp):
results = []
matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
if not matches:
return results
for match in matches:
try:
json_data = json.loads(match)
raw_html = json_data.get("html", "")
if not raw_html:
continue
tree = html.fromstring(raw_html)
video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
if not video_blocks:
continue
for video_block in video_blocks:
video_info = extract_video_data(video_block)
if video_info and video_info["title"] and video_info["url"]:
results.append(video_info)
except json.JSONDecodeError:
continue
return results
def extract_video_data(video_block):
try:
data_exposure_log = video_block.get('data-exposure-log')
video_data = json.loads(data_exposure_log)
content_id = video_data.get("content_id", "")
title = video_data.get("title", "")
url = f"{base_url}/v/ac{content_id}"
iframe_src = f"{base_url}/player/ac{content_id}"
create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')[0])
video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))
published_date = None
if create_time:
try:
published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
except (ValueError, TypeError):
pass
length = None
if video_duration:
try:
timediff = datetime.strptime(video_duration.strip(), "%M:%S")
length = timedelta(minutes=timediff.minute, seconds=timediff.second)
except (ValueError, TypeError):
pass
return {
"title": title,
"url": url,
"content": video_intro,
"thumbnail": video_cover,
"length": length,
"publishedDate": published_date,
"iframe_src": iframe_src,
}
except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
return None
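The duration handling in extract_video_data() isolated as a small sketch, assuming durations are always "MM:SS" strings as the code above expects:

from datetime import datetime, timedelta

def parse_duration(text):
    # mirrors extract_video_data(): "MM:SS" strings become a timedelta,
    # anything else is silently dropped
    try:
        t = datetime.strptime(text.strip(), "%M:%S")
        return timedelta(minutes=t.minute, seconds=t.second)
    except (ValueError, TypeError):
        return None

# parse_duration('07:42') -> timedelta(seconds=462)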


@@ -0,0 +1,229 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
assets. Assets types include photos, vectors, illustrations, templates, 3D
assets, videos, motion graphics templates and audio tracks.
.. Adobe Stock: https://stock.adobe.com/
Configuration
=============
The engine has the following mandatory setting:
- SearXNG's :ref:`engine categories`
- Adobe-Stock's :py:obj:`adobe_order`
- Adobe-Stock's :py:obj:`adobe_content_types`
.. code:: yaml
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: [images]
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
- name: adobe stock video
engine: adobe_stock
network: adobe stock
shortcut: asi
categories: [videos]
adobe_order: relevance
adobe_content_types: ["video"]
Implementation
==============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import isodate
if TYPE_CHECKING:
import logging
logger: logging.Logger
about = {
"website": "https://stock.adobe.com/",
"wikidata_id": "Q5977430",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
categories = []
paging = True
send_accept_language_header = True
results_per_page = 10
base_url = "https://stock.adobe.com"
adobe_order: str = ""
"""Sort order, can be one of:
- ``relevance`` or
- ``featured`` or
- ``creation`` (most recent) or
- ``nb_downloads`` (number of downloads)
"""
ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types: list = []
"""A list of of content types. The following content types are offered:
- Images: ``image``
- Videos: ``video``
- Templates: ``template``
- 3D: ``3d``
- Audio ``audio``
Additional subcategories:
- Photos: ``photo``
- Illustrations: ``illustration``
- Vectors: ``zip_vector``
"""
# Do we need support for "free_collection" and "include_stock_enterprise"?
def init(_):
if not categories:
raise ValueError("adobe_stock engine: categories is unset")
# adobe_order
if not adobe_order:
raise ValueError("adobe_stock engine: adobe_order is unset")
if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
raise ValueError(f"unsupported adobe_order: {adobe_order}")
# adobe_content_types
if not adobe_content_types:
raise ValueError("adobe_stock engine: adobe_content_types is unset")
if isinstance(adobe_content_types, list):
for t in adobe_content_types:
if t not in ADOBE_VALID_TYPES:
raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
else:
raise ValueError(
"adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
)
def request(query, params):
args = {
"k": query,
"limit": results_per_page,
"order": adobe_order,
"search_page": params["pageno"],
"search_type": "pagination",
}
for content_type in ADOBE_VALID_TYPES:
args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0
params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
# headers required to bypass bot-detection
if params["searxng_locale"] == "all":
params["headers"]["Accept-Language"] = "en-US,en;q=0.5"
return params
def parse_image_item(item):
return {
"template": "images.html",
"url": item["content_url"],
"title": item["title"],
"content": item["asset_type"],
"img_src": item["content_thumb_extra_large_url"],
"thumbnail_src": item["thumbnail_url"],
"resolution": f"{item['content_original_width']}x{item['content_original_height']}",
"img_format": item["format"],
"author": item["author"],
}
def parse_video_item(item):
# in video items, the title is more or less a "content description", we try
# to reduce the length of the title ..
title = item["title"]
content = ""
if "." in title.strip()[:-1]:
content = title
title = title.split(".", 1)[0]
elif "," in title:
content = title
title = title.split(",", 1)[0]
elif len(title) > 50:
content = title
title = ""
for w in content.split(" "):
title += f" {w}"
if len(title) > 50:
title = title.strip() + "\u2026"
break
return {
"template": "videos.html",
"url": item["content_url"],
"title": title,
"content": content,
# https://en.wikipedia.org/wiki/ISO_8601#Durations
"length": isodate.parse_duration(item["time_duration"]),
"publishedDate": datetime.fromisoformat(item["creation_date"]),
"thumbnail": item["thumbnail_url"],
"iframe_src": item["video_small_preview_url"],
"metadata": item["asset_type"],
}
def parse_audio_item(item):
audio_data = item["audio_data"]
content = audio_data.get("description") or ""
if audio_data.get("album"):
content = audio_data["album"] + " - " + content
return {
"url": item["content_url"],
"title": item["title"],
"content": content,
# "thumbnail": base_url + item["thumbnail_url"],
"iframe_src": audio_data["preview"]["url"],
"publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
"length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
"author": item.get("artist_name"),
}
def response(resp):
results = []
json_resp = resp.json()
if isinstance(json_resp["items"], list):
return None
for item in json_resp["items"].values():
if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
result = parse_image_item(item)
elif item["asset_type"].lower() == "video":
result = parse_video_item(item)
elif item["asset_type"].lower() == "audio":
result = parse_audio_item(item)
else:
logger.error("no handle for %s --> %s", item["asset_type"], item)
continue
results.append(result)
return results
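A sketch of how request() turns adobe_content_types into the filters[...] arguments; the content-type list below is an example configuration, not a recommendation:

ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types = ["photo", "illustration", "image"]

# every valid type gets an explicit 0/1 flag, exactly as in request()
args = {
    f"filters[content_type:{t}]": 1 if t in adobe_content_types else 0
    for t in ADOBE_VALID_TYPES
}
# args['filters[content_type:photo]'] == 1, args['filters[content_type:video]'] == 0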

searx/engines/ahmia.py (new file, 80 lines)

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
"""
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
# about
about = {
"website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
"wikidata_id": 'Q18693938',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine config
categories = ['onions']
paging = True
page_size = 10
# search url
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}
# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
if params['time_range'] in time_range_dict:
params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
return params
def response(resp):
results = []
dom = fromstring(resp.text)
# trim results so there are not too many shown at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index : first_result_index + page_size]
# get results
for result in trimmed_results:
# remove ahmia url and extract the actual url for the result
raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})
# get spelling corrections
for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)})
# get number of results
number_of_results = eval_xpath(dom, number_of_results_xpath)
if number_of_results:
try:
results.append({'number_of_results': int(extract_text(number_of_results))})
except: # pylint: disable=bare-except
pass
return results
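The URL cleaning in response() shown in isolation; the redirect URL below is a made-up example of the pattern Ahmia uses:

from urllib.parse import urlparse, parse_qs

def clean_redirect(raw_url):
    # mirrors response(): Ahmia links go through a redirector, the target
    # onion address is carried in the 'redirect_url' query parameter
    return parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

# clean_redirect('http://search.example/redirect?redirect_url=http%3A%2F%2Fexample.onion%2F')
# -> 'http://example.onion/'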


@@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Alpine Linux binary packages`_. `Alpine Linux`_ is a Linux-based operation
system designed to be small, simple and secure. Contrary to many other Linux
distributions, it uses musl, BusyBox and OpenRC. Alpine is mostly used on
servers and for Docker images.
.. _Alpine Linux binary packages: https://pkgs.alpinelinux.org
.. _Alpine Linux: https://www.alpinelinux.org
"""
import re
from urllib.parse import urlencode
from lxml import html
from dateutil import parser
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.alpinelinux.org',
'wikidata_id': 'Q4033826',
'use_official_api': False,
'official_api_documentation': None,
'require_api_key': False,
'results': 'HTML',
}
paging = True
categories = ['packages', 'it']
base_url = "https://pkgs.alpinelinux.org"
alpine_arch = 'x86_64'
"""Kernel architecture: ``x86_64``, ``x86``, ``aarch64``, ``armhf``,
``ppc64le``, ``s390x``, ``armv7`` or ``riscv64``"""
ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")
"""Regular expression to match supported architectures in the query string."""
def request(query, params):
query_arch = ARCH_RE.search(query)
if query_arch:
query_arch = query_arch.group(0)
query = query.replace(query_arch, '').strip()
args = {
# use wildcards to match more than just packages with the exact same
# name as the query
'name': f"*{query}*",
'page': params['pageno'],
'arch': query_arch or alpine_arch,
}
params['url'] = f"{base_url}/packages?{urlencode(args)}"
return params
def response(resp):
results = []
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, "//table/tbody/tr"):
if len(result.xpath("./td")) < 9:
# skip non valid entries in the result table
# e.g the "No item found..." message
continue
results.append(
{
'template': 'packages.html',
'url': base_url + extract_text(eval_xpath(result, './td[contains(@class, "package")]/a/@href')),
'title': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'package_name': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'publishedDate': parser.parse(extract_text(eval_xpath(result, './td[contains(@class, "bdate")]'))),
'version': extract_text(eval_xpath(result, './td[contains(@class, "version")]')),
'homepage': extract_text(eval_xpath(result, './td[contains(@class, "url")]/a/@href')),
'maintainer': extract_text(eval_xpath(result, './td[contains(@class, "maintainer")]')),
'license_name': extract_text(eval_xpath(result, './td[contains(@class, "license")]')),
'tags': [extract_text(eval_xpath(result, './td[contains(@class, "repo")]'))],
}
)
return results
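A small sketch of the architecture handling in request(): a recognised architecture token inside the query selects the arch parameter and is stripped from the name wildcard; the example queries are made up:

import re

ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")
alpine_arch = 'x86_64'

def split_arch(query):
    # mirrors request(): returns (package name part, architecture)
    m = ARCH_RE.search(query)
    if m:
        return query.replace(m.group(0), '').strip(), m.group(0)
    return query, alpine_arch

# split_arch('curl aarch64') -> ('curl', 'aarch64')
# split_arch('curl')         -> ('curl', 'x86_64')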


@@ -0,0 +1,202 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).
.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
Configuration
=============
The engine has the following additional settings:
- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive. For example, an engine to search
for the *newest* articles and journals (PDF) via the shortcut ``!aaa <search-term>``.
.. code:: yaml
- name: annas articles
engine: annas_archive
shortcut: aaa
aa_content: 'magazine'
aa_ext: 'pdf'
aa_sort: 'newest'
Implementations
===============
"""
from typing import List, Dict, Any, Optional
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
# about
about: Dict[str, Any] = {
"website": "https://annas-archive.org/",
"wikidata_id": "Q115288326",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# engine dependent config
categories: List[str] = ["files"]
paging: bool = True
# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anan's search form field **Content** / possible values::
book_fiction, book_unknown, book_nonfiction,
book_comic, magazine, standards_document
To not filter use an empty string (default).
"""
aa_sort: str = ''
"""Sort Anna's results, possible values::
newest, oldest, largest, smallest
To sort by *most relevant* use an empty string (default)."""
aa_ext: str = ''
"""Filter Anna's results by a file ending. Common filters for example are
``pdf`` and ``epub``.
.. note::
Anna's Archive is a beta release: Filter results by file extension does not
really work on Anna's Archive.
"""
def init(engine_settings=None): # pylint: disable=unused-argument
"""Check of engine's settings."""
traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
if aa_content and aa_content not in traits.custom['content']:
raise ValueError(f'invalid setting content: {aa_content}')
if aa_sort and aa_sort not in traits.custom['sort']:
raise ValueError(f'invalid setting sort: {aa_sort}')
if aa_ext and aa_ext not in traits.custom['ext']:
raise ValueError(f'invalid setting ext: {aa_ext}')
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
lang = traits.get_language(params["language"], traits.all_locale) # type: ignore
args = {
'lang': lang,
'content': aa_content,
'ext': aa_ext,
'sort': aa_sort,
'q': query,
'page': params['pageno'],
}
# filter out None and empty values
filtered_args = dict((k, v) for k, v in args.items() if v)
params["url"] = f"{base_url}/search?{urlencode(filtered_args)}"
return params
def response(resp) -> List[Dict[str, Optional[str]]]:
results: List[Dict[str, Optional[str]]] = []
dom = html.fromstring(resp.text)
for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
results.append(_get_result(item))
# The rendering of the WEB page is very strange; except the first position
# all other positions of Anna's result page are enclosed in SGML comments.
# These comments are *uncommented* by some JS code, see query of class
# '.js-scroll-hidden' in Anna's HTML template:
# https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
item = html.fromstring(item.xpath('./comment()')[0].text)
results.append(_get_result(item))
return results
def _get_result(item):
return {
'template': 'paper.html',
'url': base_url + extract_text(eval_xpath_getindex(item, './@href', 0)),
'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
'thumbnail': extract_text(eval_xpath_getindex(item, './/img/@src', 0, default=None), allow_none=True),
}
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and other search arguments from Anna's search form."""
# pylint: disable=import-outside-toplevel
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
engine_traits.all_locale = ''
engine_traits.custom['content'] = []
engine_traits.custom['ext'] = []
engine_traits.custom['sort'] = []
resp = get(base_url + '/search')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Anna's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# supported language codes
lang_map = {}
for x in eval_xpath_list(dom, "//form//input[@name='lang']"):
eng_lang = x.get("value")
if eng_lang in ('', '_empty', 'nl-BE', 'und') or eng_lang.startswith('anti__'):
continue
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
# silently ignore unknown languages
# print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = eng_lang
for x in eval_xpath_list(dom, "//form//input[@name='content']"):
if not x.get("value").startswith("anti__"):
engine_traits.custom['content'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//input[@name='ext']"):
if not x.get("value").startswith("anti__"):
engine_traits.custom['ext'].append(x.get("value"))
for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
engine_traits.custom['sort'].append(x.get("value"))
# for better diff; sort the persistence of these traits
engine_traits.custom['content'].sort()
engine_traits.custom['ext'].sort()
engine_traits.custom['sort'].sort()

searx/engines/ansa.py (new file, 81 lines)

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine for Ansa, Italy's oldest news agency.
To use this engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: ansa
engine: ansa
shortcut: ans
disabled: false
"""
from urllib.parse import urlencode
from lxml import html
from searx.result_types import EngineResults, MainResult
from searx.utils import eval_xpath, eval_xpath_list, extract_text
engine_type = 'online'
language_support = False
categories = ['news']
paging = True
page_size = 12
base_url = 'https://www.ansa.it'
time_range_support = True
time_range_args = {
'day': 1,
'week': 7,
'month': 31,
'year': 365,
}
# https://www.ansa.it/ricerca/ansait/search.shtml?start=0&any=houthi&periodo=&sort=data%3Adesc
search_api = 'https://www.ansa.it/ricerca/ansait/search.shtml?'
about = {
'website': 'https://www.ansa.it',
'wikidata_id': 'Q392934',
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'it',
}
def request(query, params):
query_params = {
'any': query,
'start': (params['pageno'] - 1) * page_size,
'sort': "data:desc",
}
if params['time_range']:
query_params['periodo'] = time_range_args.get(params['time_range'])
params['url'] = search_api + urlencode(query_params)
return params
def response(resp) -> EngineResults:
res = EngineResults()
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, "//div[@class='article']"):
res_obj = MainResult(
title=extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a")),
content=extract_text(eval_xpath(result, "./div[@class='content']/div[@class='text']")),
url=base_url + extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a/@href")),
)
thumbnail = extract_text(eval_xpath(result, "./div[@class='image']/a/img/@src"))
if thumbnail:
res_obj.thumbnail = base_url + thumbnail
res.append(res_obj)
return res


@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""APKMirror
"""
# pylint: disable=invalid-name
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://www.apkmirror.com',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
time_range_support = False
# search-url
base_url = 'https://www.apkmirror.com'
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'
def request(query, params):
params['url'] = search_url.format(
pageno=params['pageno'],
query=urlencode({'s': query}),
)
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):
link = eval_xpath_getindex(result, './/h5/a', 0)
url = base_url + link.attrib.get('href') + '#downloads'
title = extract_text(link)
thumbnail = base_url + eval_xpath_getindex(result, './/img/@src', 0)
res = {'url': url, 'title': title, 'thumbnail': thumbnail}
results.append(res)
return results


@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple App Store
"""
from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse
about = {
"website": 'https://www.apple.com/app-store/',
"wikidata_id": 'Q368215',
"official_api_documentation": (
'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
),
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['files', 'apps']
safesearch = True
search_url = 'https://itunes.apple.com/search?{query}'
def request(query, params):
explicit = "Yes"
if params['safesearch'] > 0:
explicit = "No"
params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))
return params
def response(resp):
results = []
json_result = loads(resp.text)
for result in json_result['results']:
results.append(
{
'url': result['trackViewUrl'],
'title': result['trackName'],
'content': result['description'],
'thumbnail': result['artworkUrl100'],
'publishedDate': parse(result['currentVersionReleaseDate']),
'author': result['sellerName'],
}
)
return results

searx/engines/apple_maps.py (new file, 112 lines)

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple Maps"""
from json import loads
from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
about = {
"website": 'https://www.apple.com/maps/',
"wikidata_id": 'Q276101',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
token = {'value': '', 'last_updated': None}
categories = ['map']
paging = False
search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"
def obtain_token():
update_time = time() - (time() % 1800)
try:
# use duckduckgo's mapkit token
token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
actual_token = http_get(
'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
timeout=2.0,
headers={'Authorization': 'Bearer ' + token_response.text},
)
token['value'] = loads(actual_token.text)['authInfo']['access_token']
token['last_updated'] = update_time
# pylint: disable=bare-except
except:
pass
return token
def request(query, params):
if time() - (token['last_updated'] or 0) > 1800:
obtain_token()
params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))
params['headers'] = {'Authorization': 'Bearer ' + token['value']}
return params
def response(resp):
results = []
resp_json = loads(resp.text)
user_language = resp.search_params['language']
for result in resp_json['results']:
boundingbox = None
if 'displayMapRegion' in result:
box = result['displayMapRegion']
boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]
links = []
if 'telephone' in result:
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
)
if result.get('urls'):
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'url': url,
'url_label': url,
}
)
results.append(
{
'template': 'map.html',
'type': result.get('poiCategory'),
'title': result['name'],
'links': links,
'latitude': result['center']['lat'],
'longitude': result['center']['lng'],
'url': result['placecardUrl'],
'boundingbox': boundingbox,
'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
'address': {
'name': result['name'],
'house_number': result.get('subThoroughfare'),
'road': result.get('thoroughfare'),
'locality': result.get('locality'),
'postcode': result.get('postCode'),
'country': result.get('country'),
},
}
)
return results
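A reduced sketch of the token lifetime check used by request() and obtain_token(); 1800 seconds matches the 30-minute window in the code above:

from time import time

TOKEN_TTL = 1800  # seconds

def needs_refresh(last_updated, ttl=TOKEN_TTL):
    # mirrors request(): refresh the MapKit token obtained via DuckDuckGo when
    # the stored timestamp is older than ttl, or when no token was fetched yet
    return time() - (last_updated or 0) > ttl

# obtain_token() then stores time() - (time() % 1800), i.e. the start of the
# current 30-minute window, as 'last_updated'.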

searx/engines/archlinux.py (new file, 154 lines)

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use an official API: MediaWiki provides an API, but
the Arch Wiki blocks access to it.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
main_wiki = 'wiki.archlinux.org'
def request(query, params):
sxng_lang = params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore
base_url = 'https://' + netloc + '/index.php?'
offset = (params['pageno'] - 1) * 20
if netloc == main_wiki:
eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore
query += ' (' + eng_lang + ')'
# wiki.archlinux.org is protected by anubis
# - https://github.com/searxng/searxng/issues/4646#issuecomment-2817848019
params['headers']['User-Agent'] = "SearXNG"
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params
def response(resp):
results = []
dom = lxml.html.fromstring(resp.text) # type: ignore
# get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore
base_url = 'https://' + netloc + '/index.php?'
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
results.append(
{
'url': urljoin(base_url, link.get('href')), # type: ignore
'title': extract_text(link),
'content': content,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Archlinux-Wiki. The location of the Wiki address of a
language is mapped in a :py:obj:`custom field
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
on the location, the ``title`` argument in the request is translated.
.. code:: python
"custom": {
"wiki_netloc": {
"de": "wiki.archlinux.de",
# ...
"zh": "wiki.archlinuxcn.org"
}
"title": {
"de": "Spezial:Suche",
# ...
"zh": "Special:\u641c\u7d22"
},
},
"""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['title'] = {}
title_map = {
'de': 'Spezial:Suche',
'fa': 'ویژه:جستجو',
'ja': '特別:検索',
'zh': 'Special:搜索',
}
resp = get('https://wiki.archlinux.org/')
if not resp.ok: # type: ignore
print("ERROR: response from wiki.archlinux.org is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
# zh_Hans --> zh
sxng_tag = sxng_tag.split('_')[0]
netloc = urlparse(a.get('href')).netloc
if netloc != 'wiki.archlinux.org':
title = title_map.get(sxng_tag)
if not title:
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
continue
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
engine_traits.custom['title'][sxng_tag] = title # type: ignore
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
engine_traits.languages[sxng_tag] = eng_tag # type: ignore
engine_traits.languages['en'] = 'English'
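A sketch of how request() picks the wiki host for the UI language; the dict below is a hypothetical excerpt of traits.custom['wiki_netloc'] (the 'de' and 'zh' entries appear in the fetch_traits() docstring above):

main_wiki = 'wiki.archlinux.org'
wiki_netloc = {'de': 'wiki.archlinux.de', 'zh': 'wiki.archlinuxcn.org'}

def pick_netloc(searxng_locale):
    # mirrors request(): only the language part of the locale is used,
    # unknown languages fall back to the main (English) wiki
    sxng_lang = searxng_locale.split('-')[0]
    return wiki_netloc.get(sxng_lang, main_wiki)

# pick_netloc('zh-CN') -> 'wiki.archlinuxcn.org'
# pick_netloc('fr')    -> 'wiki.archlinux.org'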

searx/engines/artic.py (new file, 67 lines)

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The Art Institute of Chicago
Explore thousands of artworks from The Art Institute of Chicago.
* https://artic.edu
"""
from json import loads
from urllib.parse import urlencode
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
nb_per_page = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
def request(query, params):
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': nb_per_page,
}
)
params['url'] = search_api + args
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
json_data = loads(resp.text)
for result in json_data['data']:
if not result['image_id']:
continue
results.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': "%(medium_display)s // %(dimensions)s" % result,
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'template': 'images.html',
}
)
return results
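The image URL construction in response() isolated as a sketch; the image id below is a placeholder:

image_api = 'https://www.artic.edu/iiif/2/'

def iiif_url(image_id):
    # mirrors response(): ask the museum's IIIF 2.0 endpoint for a rendition
    # scaled to 843 px width ("843," = fixed width, proportional height)
    return image_api + '/%(image_id)s/full/843,/0/default.jpg' % {'image_id': image_id}

# iiif_url('some-image-id')
# -> 'https://www.artic.edu/iiif/2//some-image-id/full/843,/0/default.jpg'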

searx/engines/arxiv.py (new file, 110 lines)

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ArXiV (Scientific preprints)
"""
from datetime import datetime
from lxml import etree
from lxml.etree import XPath
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://arxiv.org',
"wikidata_id": 'Q118398',
"official_api_documentation": 'https://arxiv.org/help/api',
"use_official_api": True,
"require_api_key": False,
"results": 'XML-RSS',
}
categories = ['science', 'scientific publications']
paging = True
base_url = (
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)
# engine dependent config
number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
params['url'] = base_url.format(**string_args)
return params
def response(resp):
results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict)
return results
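A standalone sketch of the export API URL built in request(); the sample query is made up and, like the code above, is not URL-encoded:

number_of_results = 10
base_url = (
    'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)

def build_url(query, pageno):
    # mirrors request(): paging is expressed as a zero-based 'start' offset
    offset = (pageno - 1) * number_of_results
    return base_url.format(query=query, offset=offset, number_of_results=number_of_results)

# build_url('electron', 2)
# -> 'https://export.arxiv.org/api/query?search_query=all:electron&start=10&max_results=10'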

searx/engines/ask.py (new file, 75 lines)

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Ask.com"""
from urllib.parse import urlencode
import dateutil
from lxml import html
from searx import utils
# Metadata
about = {
"website": "https://www.ask.com/",
"wikidata_id": 'Q847564',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
# Engine Configuration
categories = ['general']
paging = True
max_page = 5
"""Ask.com has at max 5 pages."""
# Base URL
base_url = "https://www.ask.com/web"
def request(query, params):
query_params = {
"q": query,
"page": params["pageno"],
}
params["url"] = f"{base_url}?{urlencode(query_params)}"
return params
def response(resp):
start_tag = 'window.MESON.initialState = {'
end_tag = '}};'
dom = html.fromstring(resp.text)
script = utils.eval_xpath_getindex(dom, '//script', 0, default=None).text
pos = script.index(start_tag) + len(start_tag) - 1
script = script[pos:]
pos = script.index(end_tag) + len(end_tag) - 1
script = script[:pos]
json_resp = utils.js_variable_to_python(script)
results = []
for item in json_resp['search']['webResults']['results']:
pubdate_original = item.get('pubdate_original')
if pubdate_original:
pubdate_original = dateutil.parser.parse(pubdate_original)
metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]
results.append(
{
"url": item['url'].split('&ueid')[0],
"title": item['title'],
"content": item['abstract'],
"publishedDate": pubdate_original,
# "thumbnail": item.get('image_url') or None, # these are not thumbs / to large
"metadata": ' | '.join(metadata),
}
)
return results
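The string slicing used by response() to cut the embedded state object out of the first script tag, shown on a made-up script text; the real code parses the result with searx.utils.js_variable_to_python instead of json.loads, since the page embeds a JS object rather than strict JSON:

import json

script = 'window.MESON.initialState = {"a": {"b": 1}}; console.log(1);'

start_tag = 'window.MESON.initialState = {'
end_tag = '}};'

pos = script.index(start_tag) + len(start_tag) - 1  # keep the opening brace
script = script[pos:]
pos = script.index(end_tag) + len(end_tag) - 1      # drop the trailing ';'
script = script[:pos]

print(json.loads(script))  # {'a': {'b': 1}}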


@@ -0,0 +1,93 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
The Astrophysics Data System (ADS) is a digital library portal for researchers in astronomy and physics,
operated by the Smithsonian Astrophysical Observatory (SAO) under a NASA grant.
The engine is adapted from the solr engine.
"""
# pylint: disable=global-statement
from datetime import datetime
from json import loads
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://ui.adsabs.harvard.edu/',
"wikidata_id": 'Q752099',
"official_api_documentation": 'https://ui.adsabs.harvard.edu/help/api/api-docs.html',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
base_url = 'https://api.adsabs.harvard.edu/v1/search'
result_base_url = 'https://ui.adsabs.harvard.edu/abs/'
rows = 10
sort = '' # sorting: asc or desc
field_list = ['bibcode', 'author', 'title', 'abstract', 'doi', 'date'] # list of field names to display on the UI
default_fields = '' # default field to query
query_fields = '' # query fields
paging = True
api_key = 'unset'
def init(_):
if api_key == 'unset':
raise SearxEngineAPIException('missing ADS API key')
def request(query, params):
query_params = {'q': query, 'rows': rows}
if field_list:
query_params['fl'] = ','.join(field_list)
if query_fields:
query_params['qf'] = ','.join(query_fields)
if default_fields:
query_params['df'] = default_fields
if sort:
query_params['sort'] = sort
query_params['start'] = rows * (params['pageno'] - 1)
params['headers']['Authorization'] = f'Bearer {api_key}'
params['url'] = f"{base_url}/query?{urlencode(query_params)}"
return params
def response(resp):
try:
resp_json = loads(resp.text)
except Exception as e:
raise SearxEngineAPIException("failed to parse response") from e
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json['error']['msg'])
resp_json = resp_json["response"]
result_len = resp_json["numFound"]
results = []
for res in resp_json["docs"]:
author = res.get("author")
if author:
author = author[0] + ' et al.'
results.append(
{
'url': result_base_url + res.get("bibcode") + "/",
'title': res.get("title")[0],
'author': author,
'content': res.get("abstract"),
'doi': res.get("doi"),
'publishedDate': datetime.fromisoformat(res.get("date")),
}
)
results.append({'number_of_results': result_len})
return results

searx/engines/baidu.py (new file, 182 lines)

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Baidu_
.. _Baidu: https://www.baidu.com
"""
# There exists a https://github.com/ohblue/baidu-serp-api/
# but we don't use it here (maybe we can learn from it).
from urllib.parse import urlencode
from datetime import datetime
from html import unescape
import time
import json
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
about = {
"website": "https://www.baidu.com",
"wikidata_id": "Q14772",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"language": "zh",
}
paging = True
categories = []
results_per_page = 10
baidu_category = 'general'
time_range_support = True
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
def init(_):
if baidu_category not in ('general', 'images', 'it'):
raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
def request(query, params):
page_num = params["pageno"]
category_config = {
'general': {
'endpoint': 'https://www.baidu.com/s',
'params': {
"wd": query,
"rn": results_per_page,
"pn": (page_num - 1) * results_per_page,
"tn": "json",
},
},
'images': {
'endpoint': 'https://image.baidu.com/search/acjson',
'params': {
"word": query,
"rn": results_per_page,
"pn": (page_num - 1) * results_per_page,
"tn": "resultjson_com",
},
},
'it': {
'endpoint': 'https://kaifa.baidu.com/rest/v1/search',
'params': {
"wd": query,
"pageSize": results_per_page,
"pageNum": page_num,
"paramList": f"page_num={page_num},page_size={results_per_page}",
"position": 0,
},
},
}
query_params = category_config[baidu_category]['params']
query_url = category_config[baidu_category]['endpoint']
if params.get("time_range") in time_range_dict:
now = int(time.time())
past = now - time_range_dict[params["time_range"]]
if baidu_category == 'general':
query_params["gpc"] = f"stf={past},{now}|stftype=1"
if baidu_category == 'it':
query_params["paramList"] += f",timestamp_range={past}-{now}"
params["url"] = f"{query_url}?{urlencode(query_params)}"
return params
def response(resp):
text = resp.text
if baidu_category == 'images':
# Baidu's JSON encoder wrongly escapes the / and ' characters as \/ and \'
text = text.replace(r"\/", "/").replace(r"\'", "'")
data = json.loads(text, strict=False)
parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it}
return parsers[baidu_category](data)
def parse_general(data):
results = []
if not data.get("feed", {}).get("entry"):
raise SearxEngineAPIException("Invalid response")
for entry in data["feed"]["entry"]:
if not entry.get("title") or not entry.get("url"):
continue
published_date = None
if entry.get("time"):
try:
published_date = datetime.fromtimestamp(entry["time"])
except (ValueError, TypeError):
published_date = None
        # title and content sometimes contain HTML entities such as &amp; &#39; &quot; etc.
title = unescape(entry["title"])
content = unescape(entry.get("abs", ""))
results.append(
{
"title": title,
"url": entry["url"],
"content": content,
"publishedDate": published_date,
}
)
return results
def parse_images(data):
results = []
if "data" in data:
for item in data["data"]:
if not item:
# the last item in the JSON list is empty, the JSON string ends with "}, {}]"
continue
replace_url = item.get("replaceUrl", [{}])[0]
width = item.get("width")
height = item.get("height")
img_date = item.get("bdImgnewsDate")
publishedDate = None
if img_date:
publishedDate = datetime.strptime(img_date, "%Y-%m-%d %H:%M")
results.append(
{
"template": "images.html",
"url": replace_url.get("FromURL"),
"thumbnail_src": item.get("thumbURL"),
"img_src": replace_url.get("ObjURL"),
"title": html_to_text(item.get("fromPageTitle")),
"source": item.get("fromURLHost"),
"resolution": f"{width} x {height}",
"img_format": item.get("type"),
"filesize": item.get("filesize"),
"publishedDate": publishedDate,
}
)
return results
def parse_it(data):
results = []
if not data.get("data", {}).get("documents", {}).get("data"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["documents"]["data"]:
results.append(
{
'title': entry["techDocDigest"]["title"],
'url': entry["techDocDigest"]["url"],
'content': entry["techDocDigest"]["summary"],
}
)
return results

81
searx/engines/bandcamp.py Normal file
View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bandcamp (Music)
@website https://bandcamp.com/
@provide-api no
@results HTML
@parse url, title, content, publishedDate, iframe_src, thumbnail
"""
from urllib.parse import urlencode, urlparse, parse_qs
from dateutil.parser import parse as dateparse
from lxml import html
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
"website": 'https://bandcamp.com/',
"wikidata_id": 'Q545966',
"official_api_documentation": 'https://bandcamp.com/developer',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['music']
paging = True
base_url = "https://bandcamp.com/"
search_string = 'search?{query}&page={page}'
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
params['url'] = base_url + search_path
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'):
link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None)
if link is None:
continue
title = result.xpath('.//div[@class="heading"]/a/text()')
content = result.xpath('.//div[@class="subhead"]/text()')
new_result = {
"url": extract_text(link),
"title": extract_text(title),
"content": extract_text(content),
}
        date = eval_xpath_getindex(result, './/div[@class="released"]/text()', 0, default=None)
if date:
new_result["publishedDate"] = dateparse(date.replace("released ", ""))
thumbnail = result.xpath('.//div[@class="art"]/img/@src')
if thumbnail:
new_result['thumbnail'] = thumbnail[0]
result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
itemtype = extract_text(result.xpath('.//div[@class="itemtype"]')).lower()
if "album" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id)
elif "track" == itemtype:
new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id)
results.append(new_result)
return results

118
searx/engines/base.py Executable file
View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BASE (Scholar publications)
"""
from datetime import datetime
import re
from urllib.parse import urlencode
from lxml import etree
from searx.utils import searx_useragent
# about
about = {
"website": 'https://base-search.net',
"wikidata_id": 'Q448335',
"official_api_documentation": 'https://api.base-search.net/',
"use_official_api": True,
"require_api_key": False,
"results": 'XML',
}
categories = ['science']
base_url = (
'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'
+ '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
)
# engine dependent config
paging = True
number_of_results = 10
# shortcuts for advanced search
shortcut_dict = {
# user-friendly keywords
'format:': 'dcformat:',
'author:': 'dccreator:',
'collection:': 'dccollection:',
'hdate:': 'dchdate:',
'contributor:': 'dccontributor:',
'coverage:': 'dccoverage:',
'date:': 'dcdate:',
'abstract:': 'dcdescription:',
'urls:': 'dcidentifier:',
'language:': 'dclanguage:',
'publisher:': 'dcpublisher:',
'relation:': 'dcrelation:',
'rights:': 'dcrights:',
'source:': 'dcsource:',
'subject:': 'dcsubject:',
'title:': 'dctitle:',
'type:': 'dcdctype:',
}
def request(query, params):
# replace shortcuts with API advanced search keywords
for key, val in shortcut_dict.items():
query = re.sub(key, val, query)
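    # e.g. 'author:doe title:foo' becomes 'dccreator:doe dctitle:foo'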
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = {
'query': urlencode({'query': query}),
'offset': offset,
'hits': number_of_results,
}
params['url'] = base_url.format(**string_args)
params['headers']['User-Agent'] = searx_useragent()
return params
def response(resp):
results = []
search_results = etree.XML(resp.content)
for entry in search_results.xpath('./result/doc'):
content = "No description available"
url = ""
title = ""
date = datetime.now() # needed in case no dcdate is available for an item
for item in entry:
if item.attrib["name"] == "dcdate":
date = item.text
elif item.attrib["name"] == "dctitle":
title = item.text
elif item.attrib["name"] == "dclink":
url = item.text
elif item.attrib["name"] == "dcdescription":
content = item.text[:300]
if len(item.text) > 300:
content += "..."
        # dates returned by the BASE API come in several formats
publishedDate = None
for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
try:
publishedDate = datetime.strptime(date, date_format)
break
except: # pylint: disable=bare-except
pass
if publishedDate is not None:
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
else:
res_dict = {'url': url, 'title': title, 'content': content}
results.append(res_dict)
return results

96
searx/engines/bilibili.py Normal file
View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bilibili is a Chinese video sharing website.
.. _Bilibili: https://www.bilibili.com
"""
import random
import string
from urllib.parse import urlencode
from datetime import datetime, timedelta
from searx import utils
# Engine metadata
about = {
"website": "https://www.bilibili.com",
"wikidata_id": "Q3077586",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
# Engine configuration
paging = True
results_per_page = 20
categories = ["videos"]
# Search URL
base_url = "https://api.bilibili.com/x/web-interface/search/type"
cookie = {
"innersign": "0",
"buvid3": "".join(random.choice(string.hexdigits) for _ in range(16)) + "infoc",
"i-wanna-go-back": "-1",
"b_ut": "7",
"FEED_LIVE_VERSION": "V8",
"header_theme_version": "undefined",
"home_feed_column": "4",
}
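# NOTE: these values mimic a fresh browser session; ``buvid3`` in particular is
# filled with a random hex string (suffixed with "infoc") so that it looks like
# a device identifier.  Which cookies are strictly required is an assumption --
# the API is simply happier with a plausible set of cookies.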
def request(query, params):
query_params = {
"__refresh__": "true",
"page": params["pageno"],
"page_size": results_per_page,
"single_column": "0",
"keyword": query,
"search_type": "video",
}
params["url"] = f"{base_url}?{urlencode(query_params)}"
params["cookies"] = cookie
return params
def response(resp):
search_res = resp.json()
results = []
for item in search_res.get("data", {}).get("result", []):
title = utils.html_to_text(item["title"])
url = item["arcurl"]
thumbnail = item["pic"]
description = item["description"]
author = item["author"]
video_id = item["aid"]
unix_date = item["pubdate"]
formatted_date = datetime.fromtimestamp(unix_date)
# the duration only seems to be valid if the video is less than 60 mins
duration = utils.parse_duration_string(item["duration"])
if duration and duration > timedelta(minutes=60):
duration = None
iframe_url = f"https://player.bilibili.com/player.html?aid={video_id}&high_quality=1&autoplay=false&danmaku=0"
results.append(
{
"title": title,
"url": url,
"content": description,
"author": author,
"publishedDate": formatted_date,
"length": duration,
"thumbnail": thumbnail,
"iframe_src": iframe_url,
"template": "videos.html",
}
)
return results

284
searx/engines/bing.py Normal file
View File

@@ -0,0 +1,284 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see section
LANGUAGE and COUNTRY/REGION). The language is the language of the UI; we need it
in SearXNG to get the translations of data such as *"published last week"*.
There is a description of the official search-APIs_, unfortunately this is not
the API we can use or that bing itself would use. You can look up some things
in the API to get a better picture of bing, but the value specifications like
the market codes are usually outdated or at least no longer used by bing itself.
The market codes have been harmonized and are identical for web, video and
images. The news area has also been harmonized with the other categories. Only
political adjustments still seem to be made -- for example, there is no news
category for the Chinese market.
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
"""
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import base64
import re
import time
from urllib.parse import parse_qs, urlencode, urlparse
from lxml import html
import babel
import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
traits: EngineTraits
about = {
"website": 'https://www.bing.com',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 200
"""200 pages maximum (``&first=1991``)"""
time_range_support = True
safesearch = True
"""Bing results are always SFW. To get NSFW links from bing some age
verification by a cookie is needed / thats not possible in SearXNG.
"""
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
def _page_offset(pageno):
return (int(pageno) - 1) * 10 + 1
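    # e.g. pageno=1 -> first=1, pageno=3 -> first=21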
def set_bing_cookies(params, engine_language, engine_region):
params['cookies']['_EDGE_CD'] = f'm={engine_region}&u={engine_language}'
params['cookies']['_EDGE_S'] = f'mkt={engine_region}&ui={engine_language}'
logger.debug("bing cookies: %s", params['cookies'])
def request(query, params):
"""Assemble a Bing-Web request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
page = params.get('pageno', 1)
query_params = {
'q': query,
        # if the 'pq' arg is missing, sometimes on page 4 we get results from page 1
        # -- don't ask why it is only sometimes; it's M$ and they have never been
        # deterministic ;)
'pq': query,
}
    # To get the correct page, the arg 'first' and this arg 'FORM' are needed; the
    # value PERE is used on page 2, on page 3 it's PERE1, on page 4 it's PERE2 .. and
    # so forth.  The 'first' arg should never be sent on page 1.
if page > 1:
query_params['first'] = _page_offset(page) # see also arg FORM
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
params['url'] = f'{base_url}?{urlencode(query_params)}'
if params.get('time_range'):
unix_day = int(time.time() / 86400)
time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
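        # 'year' has no predefined interval at Bing; an explicit span of unix
        # days (the last 365 days) is used instead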
params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'
return params
def response(resp):
# pylint: disable=too-many-locals
results = []
result_len = 0
dom = html.fromstring(resp.text)
# parse results again if nothing is found yet
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = eval_xpath(result, './/p')
for p in content:
# Make sure that the element is free of:
# <span class="algoSlug_icon" # data-priority="2">Web</span>
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL
if url.startswith('https://www.bing.com/ck/a?'):
# get the first value of u parameter
url_query = urlparse(url).query
parsed_url_query = parse_qs(url_query)
param_u = parsed_url_query["u"][0]
# remove "a1" in front
encoded_url = param_u[2:]
# add padding
encoded_url = encoded_url + '=' * (-len(encoded_url) % 4)
# decode base64 encoded URL
url = base64.urlsafe_b64decode(encoded_url).decode()
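            # e.g. ".../ck/a?...&u=a1aHR0cHM6Ly9leGFtcGxlLmNvbS8&..." decodes to
            # "https://example.com/" (illustrative values)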
# append result
results.append({'url': url, 'title': title, 'content': content})
# get number_of_results
if results:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
start_str, result_len_container = re.split(r'-\d+', result_len_container)
start = int(start_str)
else:
start = 1
result_len_container = re.sub('[^0-9]', '', result_len_container)
if len(result_len_container) > 0:
result_len = int(result_len_container)
expected_start = _page_offset(resp.search_params.get("pageno", 1))
if expected_start != start:
if expected_start > result_len:
# Avoid reading more results than available.
# For example, if there is 100 results from some search and we try to get results from 120 to 130,
# Bing will send back the results from 0 to 10 and no error.
# If we compare results count with the first parameter of the request we can avoid this "invalid"
# results.
return []
# Sometimes Bing will send back the first result page instead of the requested page as a rate limiting
# measure.
msg = f"Expected results to start at {expected_start}, but got results starting at {start}"
raise SearxEngineAPIException(msg)
results.append({'number_of_results': result_len})
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
# pylint: disable=import-outside-toplevel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.utils import gen_useragent
headers = {
"User-Agent": gen_useragent(),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-GPC": "1",
"Cache-Control": "max-age=0",
}
resp = get("https://www.bing.com/account/general", headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from bing is not OK.")
dom = html.fromstring(resp.text) # type: ignore
# languages
engine_traits.languages['zh'] = 'zh-hans'
map_lang = {'prs': 'fa-AF', 'en': 'en-us'}
bing_ui_lang_map = {
# HINT: this list probably needs to be supplemented
'en': 'us', # en --> en-us
'da': 'dk', # da --> da-dk
}
for href in eval_xpath(dom, '//div[@id="language-section-content"]//div[@class="languageItem"]/a/@href'):
eng_lang = parse_qs(urlparse(href).query)['setlang'][0]
babel_lang = map_lang.get(eng_lang, eng_lang)
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace('-', '_')))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (babel_lang))
continue
# Language (e.g. 'en' or 'de') from https://www.bing.com/account/general
# is converted by bing to 'en-us' or 'de-de'. But only if there is not
        # already a '-' delimiter in the language. For instance 'pt-PT' -->
# 'pt-pt' and 'pt-br' --> 'pt-br'
bing_ui_lang = eng_lang.lower()
if '-' not in bing_ui_lang:
bing_ui_lang = bing_ui_lang + '-' + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang)
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != bing_ui_lang:
print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}")
continue
engine_traits.languages[sxng_tag] = bing_ui_lang
# regions (aka "market codes")
engine_traits.regions['zh-CN'] = 'zh-cn'
map_market_codes = {
        'zh-hk': 'en-hk',  # not sure why, but at M$ this is the market code for Hong Kong
}
for href in eval_xpath(dom, '//div[@id="region-section-content"]//div[@class="regionItem"]/a/@href'):
cc_tag = parse_qs(urlparse(href).query)['cc'][0]
if cc_tag == 'clear':
engine_traits.all_locale = cc_tag
continue
# add market codes from official languages of the country ..
for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True):
if lang_tag not in engine_traits.languages.keys():
# print("ignore lang: %s <-- %s" % (cc_tag, lang_tag))
continue
lang_tag = lang_tag.split('_')[0] # zh_Hant --> zh
market_code = f"{lang_tag}-{cc_tag}" # zh-tw
market_code = map_market_codes.get(market_code, market_code)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, cc_tag.upper())))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != market_code:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code))
continue
engine_traits.regions[sxng_tag] = market_code

View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/images',
"wikidata_id": 'Q182496',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
time_map = {
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
}
def request(query, params):
"""Assemble a Bing-Image request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
# - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35
query_params = {
'q': query,
'async': '1',
        # to simplify the page count, let's use the default of 35 images per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
}
# time range
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
        metadata = json.loads(metadata[0])
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip().split(" · ")
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append(
{
'template': 'images.html',
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata.get('desc'),
'title': title,
'source': source,
'resolution': img_format[0],
'img_format': img_format[1] if len(img_format) >= 2 else None,
}
)
return results

160
searx/engines/bing_news.py Normal file
View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bing-News: description see :py:obj:`searx.engines.bing`.
.. hint::
Bing News is *different* in some ways!
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.bing.com/news',
"wikidata_id": 'Q2878637',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'RSS',
}
# engine dependent config
categories = ['news']
paging = True
"""If go through the pages and there are actually no new results for another
page, then bing returns the results from the last page again."""
time_range_support = True
time_map = {
'day': 'interval="4"',
'week': 'interval="7"',
'month': 'interval="9"',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
Bing does not have news range ``year`` / we use ``month`` instead."""
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
def request(query, params):
"""Assemble a Bing-News request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
page = int(params.get('pageno', 1)) - 1
query_params = {
'q': query,
'InfiniteScroll': 1,
        # to simplify the page count, let's use the default of 10 results per page
'first': page * 10 + 1,
'SFX': page,
'form': 'PTFTNR',
'setlang': engine_region.split('-')[0],
'cc': engine_region.split('-')[-1],
}
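    # e.g. for engine_region "en-gb": setlang=en and cc=gb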
if params['time_range']:
query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
if not resp.ok or not resp.text:
return results
dom = html.fromstring(resp.text)
for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):
link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))
metadata = []
source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
if source is not None:
for item in (
eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
# eval_xpath_getindex(source, './/a', 0, None),
# eval_xpath_getindex(source, './div/span', 3, None),
link.attrib.get('data-author'),
):
if item is not None:
t = extract_text(item)
if t and t.strip():
metadata.append(t.strip())
metadata = ' | '.join(metadata)
thumbnail = None
imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
if imagelink is not None:
thumbnail = imagelink.attrib.get('src')
if not thumbnail.startswith("https://www.bing.com"):
thumbnail = 'https://www.bing.com/' + thumbnail
results.append(
{
'url': url,
'title': title,
'content': content,
'thumbnail': thumbnail,
'metadata': metadata,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
# pylint: disable=import-outside-toplevel
from searx.engines.bing import fetch_traits as _f
_f(engine_traits)
# fix market codes not known by bing news:
# In bing the market code 'zh-cn' exists, but there is no 'news' category in
    # bing for this market.  Instead we use the market code from Hong Kong.
    # Even if this is not correct, it is better than having no hits at
    # all, or sending false queries to bing that could raise the suspicion of a
    # bot.
    # HINT: 'en-hk' is the region code; it does not indicate the language en!!
engine_traits.regions['zh-CN'] = 'en-hk'

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
"""
from typing import TYPE_CHECKING
import json
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import set_bing_cookies
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
from searx.engines.bing_images import time_map
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com/videos',
"wikidata_id": 'Q4914152',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
def request(query, params):
"""Assemble a Bing-Video request."""
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
set_bing_cookies(params, engine_language, engine_region)
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
query_params = {
'q': query,
'async': 'content',
        # to simplify the page count, let's use the default of 35 results per page
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count': 35,
}
# time range
#
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append(
{
'url': metadata['murl'],
'thumbnail': thumbnail,
'title': metadata.get('vt', ''),
'content': content,
'template': 'videos.html',
}
)
return results

56
searx/engines/bitchute.py Normal file
View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""bitchute (Videos)"""
from json import dumps
from datetime import datetime
from searx.utils import html_to_text
about = {
"website": 'https://bitchute.com',
"wikidata_id": "Q45287179",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
base_url = "https://api.bitchute.com/api/beta/search/videos"
categories = ['videos']
paging = True
results_per_page = 20
def request(query, params):
start_index = (params["pageno"] - 1) * results_per_page
data = {"offset": start_index, "limit": results_per_page, "query": query, "sensitivity_id": "normal", "sort": "new"}
params["url"] = base_url
params["method"] = 'POST'
params['headers']['content-type'] = "application/json"
params['data'] = dumps(data)
return params
def response(resp):
search_res = resp.json()
results = []
for item in search_res.get('videos', []):
results.append(
{
"title": item['video_name'],
"url": 'https://www.bitchute.com/video/' + item['video_id'],
"content": html_to_text(item['description']),
"author": item['channel']['channel_name'],
"publishedDate": datetime.strptime(item["date_published"], "%Y-%m-%dT%H:%M:%S.%fZ"),
"length": item['duration'],
"views": item['view_count'],
"thumbnail": item['thumbnail_url'],
"iframe_src": 'https://www.bitchute.com/embed/' + item['video_id'],
"template": "videos.html",
}
)
return results

67
searx/engines/bpb.py Normal file
View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BPB refers to ``Bundeszentrale für poltische Bildung``, which is a German
governmental institution aiming to reduce misinformation by providing resources
about politics and history.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.bpb.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = ['general']
base_url = "https://www.bpb.de"
def request(query, params):
args = {
'query[term]': query,
'page': params['pageno'] - 1,
'sort[direction]': 'descending',
'payload[nid]': 65350,
}
params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
return params
def response(resp):
results = []
json_resp = resp.json()
for result in json_resp['teaser']:
thumbnail = None
if result['teaser']['image']:
thumbnail = base_url + result['teaser']['image']['sources'][-1]['url']
metadata = result['extension']['overline']
authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
if authors:
metadata += f" | {authors}"
publishedDate = None
if result['extension'].get('publishingDate'):
publishedDate = datetime.fromtimestamp(result['extension']['publishingDate'])
results.append(
{
'url': base_url + result['teaser']['link']['url'],
'title': result['teaser']['title'],
'content': result['teaser']['text'],
'thumbnail': thumbnail,
'publishedDate': publishedDate,
'metadata': metadata,
}
)
return results

505
searx/engines/brave.py Normal file
View File

@@ -0,0 +1,505 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).
Configured ``brave`` engines:
.. code:: yaml
- name: brave
engine: brave
...
brave_category: search
time_range_support: true
paging: true
- name: brave.images
engine: brave
...
brave_category: images
- name: brave.videos
engine: brave
...
brave_category: videos
- name: brave.news
engine: brave
...
brave_category: news
- name: brave.goggles
time_range_support: true
paging: true
...
brave_category: goggles
.. _brave regions:
Brave regions
=============
Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
locales. To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):
.. code:: python
"regions": {
..
"en-CA": "ca",
"fr-CA": "ca",
..
}
.. note::
The language (aka region) support of Brave's index is limited to very basic
languages. The search results for languages like Chinese or Arabic are of
low quality.
.. _brave goggles:
Brave Goggles
=============
.. _list of Goggles: https://search.brave.com/goggles/discover
.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
Goggles allow you to choose, alter, or extend the ranking of Brave Search
results (`Goggles Whitepaper`_). Goggles are openly developed by the community
of Brave Search users.
Select from the `list of Goggles`_ people have published, or create your own
(`Goggles Quickstart`_).
.. _brave languages:
Brave languages
===============
Brave's language support is limited to the UI (menus, local notations of areas,
etc.).  Brave's index seems to support only locales, not individual languages.
The choice of available languages is very small (and it's not clear to me what
the difference in the UI is when switching from en-us to en-ca or en-gb).
In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
UI languages are stored in a custom field named ``ui_lang``:
.. code:: python
"custom": {
"ui_lang": {
"ca": "ca",
"de-DE": "de-de",
"en-CA": "en-ca",
"en-GB": "en-gb",
"en-US": "en-us",
"es": "es",
"fr-CA": "fr-ca",
"fr-FR": "fr-fr",
"ja-JP": "ja-jp",
"pt-BR": "pt-br",
"sq-AL": "sq-al"
}
},
Implementations
===============
"""
from typing import Any, TYPE_CHECKING
from urllib.parse import (
urlencode,
urlparse,
)
from dateutil import parser
from lxml import html
from searx import locales
from searx.utils import (
extr,
extract_text,
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
js_variable_to_python,
get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
from searx.result_types import EngineResults
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://search.brave.com/',
"wikidata_id": 'Q22906900',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
Goggles = Any
"""Brave supports common web-search, videos, images, news, and goggles search.
- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""
brave_spellcheck = False
"""Brave supports some kind of spell checking. When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
the UI of Brave the user gets warned about this; since we cannot warn the user
in SearXNG, spellchecking is disabled by default.
"""
send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be save max is set to 10. Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
time_range_map = {
'day': 'pd',
'week': 'pw',
'month': 'pm',
'year': 'py',
}
def request(query, params):
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
params['headers']['Accept-Encoding'] = 'gzip, deflate'
args = {
'q': query,
'source': 'web',
}
if brave_spellcheck:
args['spellcheck'] = '1'
if brave_category in ('search', 'goggles'):
if params.get('pageno', 1) - 1:
args['offset'] = params.get('pageno', 1) - 1
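            # Brave's offset is a zero based page index, e.g. pageno=3 -> offset=2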
if time_range_map.get(params['time_range']):
args['tf'] = time_range_map.get(params['time_range'])
if brave_category == 'goggles':
args['goggles_id'] = Goggles
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
# set properties in the cookies
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
# the useLocation is IP based, we use cookie 'country' for the region
params['cookies']['useLocation'] = '0'
params['cookies']['summarizer'] = '0'
engine_region = traits.get_region(params['searxng_locale'], 'all')
params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
params['cookies']['ui_lang'] = ui_lang
logger.debug("cookies %s", params['cookies'])
params['headers']['Sec-Fetch-Dest'] = "document"
params['headers']['Sec-Fetch-Mode'] = "navigate"
params['headers']['Sec-Fetch-Site'] = "same-origin"
params['headers']['Sec-Fetch-User'] = "?1"
def _extract_published_date(published_date_raw):
if published_date_raw is None:
return None
try:
return parser.parse(published_date_raw)
except parser.ParserError:
return None
def response(resp) -> EngineResults:
if brave_category in ('search', 'goggles'):
return _parse_search(resp)
    if brave_category == 'news':
return _parse_news(resp)
# Example script source containing the data:
#
# kit.start(app, element, {
# node_ids: [0, 19],
# data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
json_data = js_variable_to_python(js_object)
# json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
json_resp = json_data[1]['data']['body']['response']
if brave_category == 'images':
return _parse_images(json_resp)
if brave_category == 'videos':
return _parse_videos(json_resp)
raise ValueError(f"Unsupported brave category: {brave_category}")
def _parse_search(resp) -> EngineResults:
result_list = EngineResults()
dom = html.fromstring(resp.text)
# I doubt that Brave is still providing the "answer" class / I haven't seen
# answers in brave for a long time.
answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
if answer_tag:
url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
answer = extract_text(answer_tag)
if answer is not None:
result_list.add(result_list.types.Answer(answer=answer, url=url))
# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
xpath_results = '//div[contains(@class, "snippet ")]'
for result in eval_xpath_list(dom, xpath_results):
url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
title_tag = eval_xpath_getindex(
result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
)
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
continue
content: str = extract_text(
eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
) # type: ignore
pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
pub_date = _extract_published_date(pub_date_raw)
if pub_date and content.startswith(pub_date_raw):
            # str.lstrip() strips a character set, not a prefix; slice the prefix off instead
            content = content[len(pub_date_raw) :].strip("- \n\t")
thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
item = {
'url': url,
'title': extract_text(title_tag),
'content': content,
'publishedDate': pub_date,
'thumbnail': thumbnail,
}
video_tag = eval_xpath_getindex(
result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
)
if video_tag is not None:
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
pub_date_raw = extract_text(
eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
)
item['publishedDate'] = _extract_published_date(pub_date_raw)
else:
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
result_list.append(item)
return result_list
def _parse_news(resp) -> EngineResults:
result_list = EngineResults()
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
# import pdb
# pdb.set_trace()
url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
if url is None:
continue
title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
item = {
"url": url,
"title": title,
"content": content,
"thumbnail": thumbnail,
}
result_list.append(item)
return result_list
def _parse_images(json_resp) -> EngineResults:
result_list = EngineResults()
for result in json_resp["results"]:
item = {
'url': result['url'],
'title': result['title'],
'content': result['description'],
'template': 'images.html',
'resolution': result['properties']['format'],
'source': result['source'],
'img_src': result['properties']['url'],
'thumbnail_src': result['thumbnail']['src'],
}
result_list.append(item)
return result_list
def _parse_videos(json_resp) -> EngineResults:
result_list = EngineResults()
for result in json_resp["results"]:
url = result['url']
item = {
'url': url,
'title': result['title'],
'content': result['description'],
'template': 'videos.html',
'length': result['video']['duration'],
'duration': result['video']['duration'],
'publishedDate': _extract_published_date(result['age']),
}
if result['thumbnail'] is not None:
item['thumbnail'] = result['thumbnail']['src']
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
result_list.append(item)
return result_list
def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
regions>` from Brave."""
# pylint: disable=import-outside-toplevel, too-many-branches
import babel.languages
from searx.locales import region_tag, language_tag
from searx.network import get # see https://github.com/searxng/searxng/issues/762
engine_traits.custom["ui_lang"] = {}
headers = {
'Accept-Encoding': 'gzip, deflate',
}
lang_map = {'no': 'nb'} # norway
# languages (UI)
resp = get('https://search.brave.com/settings', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for option in dom.xpath('//section//option[@value="en-us"]/../option'):
ui_lang = option.get('value')
try:
l = babel.Locale.parse(ui_lang, sep='-')
if l.territory:
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
else:
sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep='-'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
continue
conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
if conflict:
if conflict != ui_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
continue
engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
# search regions of brave
resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore
country_js = country_js[: country_js.index("},k={default")]
country_tags = js_variable_to_python(country_js)
for k, v in country_tags.items():
if k == 'all':
engine_traits.all_locale = 'all'
continue
country_tag = v['value']
# add official languages of the country ..
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
lang_tag = lang_map.get(lang_tag, lang_tag)
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
# print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != country_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
continue
engine_traits.regions[sxng_tag] = country_tag

118
searx/engines/bt4g.py Normal file
View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content and only
collects torrent metadata (such as file names and file sizes) and a magnet link
(torrent identifier).
This engine does not parse the HTML page because there is an API in XML (RSS).
The RSS feed provides less data than the HTML page, e.g. it lacks the number of
seeders/leechers and the list of files in the torrent.  It's a tradeoff for a
"stable" engine, as the XML from the RSS feed will change far less than the HTML page.
.. _BT4G: https://bt4g.com/
Configuration
=============
The engine has the following additional settings:
- :py:obj:`bt4g_order_by`
- :py:obj:`bt4g_category`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific torrent searches.  For example, an engine to search only for
movies and sort the result list by the count of seeders.
.. code:: yaml
- name: bt4g.movie
engine: bt4g
shortcut: bt4gv
categories: video
bt4g_order_by: seeders
bt4g_category: 'movie'
Implementations
===============
"""
from datetime import datetime
from urllib.parse import quote
from lxml import etree
# about
about = {
"website": 'https://bt4gprx.com',
"use_official_api": False,
"require_api_key": False,
"results": 'XML',
}
# engine dependent config
categories = ['files']
paging = True
time_range_support = True
# search-url
url = 'https://bt4gprx.com'
search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
bt4g_order_by = 'relevance'
"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders``
or ``time``.
.. hint::
   When *time_range* is active, the results are always ordered by ``time``.
"""
bt4g_category = 'all'
"""BT$G offers categories: ``all`` (default), ``audio``, ``movie``, ``doc``,
``app`` and `` other``.
"""
def request(query, params):
order_by = bt4g_order_by
if params['time_range']:
order_by = 'time'
params['url'] = search_url.format(
search_term=quote(query),
order_by=order_by,
category=bt4g_category,
pageno=params['pageno'],
)
return params
def response(resp):
results = []
search_results = etree.XML(resp.content)
# return empty array if nothing is found
if len(search_results) == 0:
return []
for entry in search_results.xpath('./channel/item'):
title = entry.find("title").text
link = entry.find("guid").text
fullDescription = entry.find("description").text.split('<br>')
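        # the RSS <description> concatenates several fields separated by <br>;
        # the second field (index 1) is used as the file size below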
magnetlink = entry.find("link").text
pubDate = entry.find("pubDate").text
results.append(
{
'url': link,
'title': title,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'filesize': fullDescription[1],
'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'),
'template': 'torrent.html',
}
)
return results

85
searx/engines/btdigg.py Normal file
View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BTDigg (Videos, Music, Files)
"""
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://btdig.com',
"wikidata_id": 'Q4836698',
"official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files']
paging = True
# search-url
url = 'https://btdig.com'
search_url = url + '/search?q={search_term}&p={pageno}'
# do search-request
def request(query, params):
params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = dom.xpath('//div[@class="one_result"]')
# return empty array if nothing is found
if not search_res:
return []
# parse results
for result in search_res:
link = result.xpath('.//div[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
content = content.strip().replace('\n', ' | ')
content = ' '.join(content.split())
filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0]
files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]
# convert files to int if possible
try:
files = int(files)
except: # pylint: disable=bare-except
files = None
magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']
# append result
results.append(
{
'url': href,
'title': title,
'content': content,
'filesize': filesize,
'files': files,
'magnetlink': magnetlink,
'template': 'torrent.html',
}
)
# return results sorted by seeder
return results

View File

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""media.ccc.de"""
import datetime
from urllib.parse import urlencode
from dateutil import parser
about = {
'website': 'https://media.ccc.de',
'official_api_documentation': 'https://github.com/voc/voctoweb',
'use_official_api': True,
'require_api_key': False,
'results': 'JSON',
}
categories = ['videos']
paging = True
api_url = "https://api.media.ccc.de"
def request(query, params):
args = {'q': query, 'page': params['pageno']}
params['url'] = f"{api_url}/public/events/search?{urlencode(args)}"
return params
def response(resp):
results = []
for item in resp.json()['events']:
publishedDate = None
if item.get('date'):
publishedDate = parser.parse(item['date'])
iframe_src = None
for rec in item['recordings']:
if rec['mime_type'].startswith('video'):
if not iframe_src:
iframe_src = rec['recording_url']
elif rec['mime_type'] == 'video/mp4':
# prefer mp4 (minimal data rates)
iframe_src = rec['recording_url']
results.append(
{
'template': 'videos.html',
'url': item['frontend_link'],
'title': item['title'],
'content': item['description'],
'thumbnail': item['thumb_url'],
'publishedDate': publishedDate,
'length': datetime.timedelta(seconds=item['length']),
'iframe_src': iframe_src,
}
)
return results

68
searx/engines/chefkoch.py Normal file
View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Chefkoch is a German database of recipes.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.chefkoch.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = []
number_of_results = 20
skip_premium = True
base_url = "https://api.chefkoch.de"
thumbnail_format = "crop-240x300"
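# ``previewImageUrlTemplate`` of a recipe contains a literal "<format>"
# placeholder which is replaced with ``thumbnail_format`` in response() below.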
def request(query, params):
args = {'query': query, 'limit': number_of_results, 'offset': (params['pageno'] - 1) * number_of_results}
params['url'] = f"{base_url}/v2/search-gateway/recipes?{urlencode(args)}"
return params
def response(resp):
results = []
json = resp.json()
for result in json['results']:
recipe = result['recipe']
if skip_premium and (recipe['isPremium'] or recipe['isPlus']):
continue
publishedDate = None
if recipe['submissionDate']:
publishedDate = datetime.strptime(result['recipe']['submissionDate'][:19], "%Y-%m-%dT%H:%M:%S")
content = [
f"Schwierigkeitsstufe (1-3): {recipe['difficulty']}",
f"Zubereitungszeit: {recipe['preparationTime']}min",
f"Anzahl der Zutaten: {recipe['ingredientCount']}",
]
if recipe['subtitle']:
content.insert(0, recipe['subtitle'])
results.append(
{
'url': recipe['siteUrl'],
'title': recipe['title'],
'content': " | ".join(content),
'thumbnail': recipe['previewImageUrlTemplate'].replace("<format>", thumbnail_format),
'publishedDate': publishedDate,
}
)
return results

223
searx/engines/chinaso.py Normal file
View File

@@ -0,0 +1,223 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ChinaSo_, a search engine for the chinese language area.
.. attention::
   The ChinaSo engines do not return real URLs; the links from these search
   engines violate the privacy of the users!!
We try to find a solution for this problem, please follow `issue #4694`_.
As long as the problem has not been resolved, these engines are
not active in a standard setup (``inactive: true``).
.. _ChinaSo: https://www.chinaso.com/
.. _issue #4694: https://github.com/searxng/searxng/issues/4694
Configuration
=============
The engine has the following additional settings:
- :py:obj:`chinaso_category` (:py:obj:`ChinasoCategoryType`)
- :py:obj:`chinaso_news_source` (:py:obj:`ChinasoNewsSourceType`)
In the example below, all three ChinaSO engines are using the :ref:`network
<engine network>` from the ``chinaso news`` engine.
.. code:: yaml
- name: chinaso news
engine: chinaso
shortcut: chinaso
categories: [news]
chinaso_category: news
chinaso_news_source: all
- name: chinaso images
engine: chinaso
network: chinaso news
shortcut: chinasoi
categories: [images]
chinaso_category: images
- name: chinaso videos
engine: chinaso
network: chinaso news
shortcut: chinasov
categories: [videos]
chinaso_category: videos
Implementations
===============
"""
import typing
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
about = {
"website": "https://www.chinaso.com/",
"wikidata_id": "Q10846064",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"language": "zh",
}
paging = True
time_range_support = True
results_per_page = 10
categories = []
ChinasoCategoryType = typing.Literal['news', 'videos', 'images']
"""ChinaSo supports news, videos, images search.
- ``news``: search for news
- ``videos``: search for videos
- ``images``: search for images
In the category ``news`` you can additionally filter by option
:py:obj:`chinaso_news_source`.
"""
chinaso_category = 'news'
"""Configure ChinaSo category (:py:obj:`ChinasoCategoryType`)."""
ChinasoNewsSourceType = typing.Literal['CENTRAL', 'LOCAL', 'BUSINESS', 'EPAPER', 'all']
"""Filtering ChinaSo-News results by source:
- ``CENTRAL``: central publication
- ``LOCAL``: local publication
- ``BUSINESS``: business publication
- ``EPAPER``: E-Paper
- ``all``: all sources
"""
chinaso_news_source: ChinasoNewsSourceType = 'all'
"""Configure ChinaSo-News type (:py:obj:`ChinasoNewsSourceType`)."""
time_range_dict = {'day': '24h', 'week': '1w', 'month': '1m', 'year': '1y'}
base_url = "https://www.chinaso.com"
def init(_):
if chinaso_category not in ('news', 'videos', 'images'):
raise ValueError(f"Unsupported category: {chinaso_category}")
if chinaso_category == 'news' and chinaso_news_source not in typing.get_args(ChinasoNewsSourceType):
raise ValueError(f"Unsupported news source: {chinaso_news_source}")
def request(query, params):
query_params = {"q": query}
if time_range_dict.get(params['time_range']):
query_params["stime"] = time_range_dict[params['time_range']]
query_params["etime"] = 'now'
category_config = {
'news': {'endpoint': '/v5/general/v1/web/search', 'params': {'pn': params["pageno"], 'ps': results_per_page}},
'images': {
'endpoint': '/v5/general/v1/search/image',
'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
},
'videos': {
'endpoint': '/v5/general/v1/search/video',
'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
},
}
if chinaso_news_source != 'all':
if chinaso_news_source == 'EPAPER':
category_config['news']['params']["type"] = 'EPAPER'
else:
category_config['news']['params']["cate"] = chinaso_news_source
query_params.update(category_config[chinaso_category]['params'])
params["url"] = f"{base_url}{category_config[chinaso_category]['endpoint']}?{urlencode(query_params)}"
return params
def response(resp):
try:
data = resp.json()
except Exception as e:
raise SearxEngineAPIException(f"Invalid response: {e}") from e
parsers = {'news': parse_news, 'images': parse_images, 'videos': parse_videos}
return parsers[chinaso_category](data)
def parse_news(data):
results = []
if not data.get("data", {}).get("data"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["data"]:
published_date = None
if entry.get("timestamp"):
try:
published_date = datetime.fromtimestamp(int(entry["timestamp"]))
except (ValueError, TypeError):
pass
results.append(
{
'title': html_to_text(entry["title"]),
'url': entry["url"],
'content': html_to_text(entry["snippet"]),
'publishedDate': published_date,
}
)
return results
def parse_images(data):
results = []
if not data.get("data", {}).get("arrRes"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["arrRes"]:
results.append(
{
'url': entry["web_url"],
'title': html_to_text(entry["title"]),
'content': html_to_text(entry["ImageInfo"]),
'template': 'images.html',
'img_src': entry["url"].replace("http://", "https://"),
'thumbnail_src': entry["largeimage"].replace("http://", "https://"),
}
)
return results
def parse_videos(data):
results = []
if not data.get("data", {}).get("arrRes"):
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["arrRes"]:
published_date = None
if entry.get("VideoPubDate"):
try:
published_date = datetime.fromtimestamp(int(entry["VideoPubDate"]))
except (ValueError, TypeError):
pass
results.append(
{
'url': entry["url"],
'title': html_to_text(entry["raw_title"]),
'template': 'videos.html',
'publishedDate': published_date,
'thumbnail': entry["image_src"].replace("http://", "https://"),
}
)
return results

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cloudflare AI engine"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://ai.cloudflare.com',
"wikidata_id": None,
"official_api_documentation": 'https://developers.cloudflare.com/workers-ai',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
cf_account_id = ''
cf_ai_api = ''
cf_ai_gateway = ''
cf_ai_model = ''
cf_ai_model_display_name = 'Cloudflare AI'
# Assistant messages hint to the AI about the desired output format. Not all models support this role.
cf_ai_model_assistant = 'Keep your answers as short and effective as possible.'
# System messages define the AI's personality. You can use them to set rules and how you expect the AI to behave.
cf_ai_model_system = 'You are a self-aware language model who is honest and direct about any question from the user.'
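# A minimal settings.yml sketch for enabling this engine. The engine/module
# name, the shortcut and all values below are placeholders, not defaults
# defined in this file:
#
#   - name: cloudflare ai
#     engine: cloudflare_ai
#     shortcut: cfai
#     cf_account_id: 'CF_ACCOUNT_ID'
#     cf_ai_api: 'CF_API_TOKEN'
#     cf_ai_gateway: 'CF_GATEWAY_NAME'
#     cf_ai_model: 'CF_MODEL_ID'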
def request(query, params):
params['query'] = query
params['url'] = f'https://gateway.ai.cloudflare.com/v1/{cf_account_id}/{cf_ai_gateway}/workers-ai/{cf_ai_model}'
params['method'] = 'POST'
params['headers']['Authorization'] = f'Bearer {cf_ai_api}'
params['headers']['Content-Type'] = 'application/json'
params['data'] = dumps(
{
'messages': [
{'role': 'assistant', 'content': cf_ai_model_assistant},
{'role': 'system', 'content': cf_ai_model_system},
{'role': 'user', 'content': params['query']},
]
}
).encode('utf-8')
return params
def response(resp):
results = []
json = loads(resp.text)
if 'error' in json:
raise SearxEngineAPIException('Cloudflare AI error: ' + json['error'])
if 'result' in json:
results.append(
{
'content': json['result']['response'],
'infobox': cf_ai_model_display_name,
}
)
return results

243
searx/engines/command.py Normal file
View File

@@ -0,0 +1,243 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""With *command engines* administrators can run engines to integrate arbitrary
shell commands.
.. attention::
When creating and enabling a ``command`` engine on a public instance, you
must be careful to avoid leaking private data.
The easiest solution is to limit the access by setting ``tokens`` as described
in section :ref:`private engines`. The engine base is flexible. Only your
imagination can limit the power of this engine (and maybe security concerns).
Configuration
=============
The following options are available:
``command``:
A comma separated list of the elements of the command. A special token
``{{QUERY}}`` tells where to put the search terms of the user. Example:
.. code:: yaml
['ls', '-l', '-h', '{{QUERY}}']
``delimiter``:
  A mapping containing the delimiter string ``chars`` and the *titles* of each
  element in ``keys``.

``parse_regex``:
  A dict containing a regular expression for each result key (see the second
  example below).
``query_type``:
The expected type of user search terms. Possible values: ``path`` and
``enum``.
``path``:
Checks if the user provided path is inside the working directory. If not,
the query is not executed.
``enum``:
Is a list of allowed search terms. If the user submits something which is
not included in the list, the query returns an error.
``query_enum``:
A list containing allowed search terms if ``query_type`` is set to ``enum``.
``working_dir``:
The directory where the command has to be executed. Default: ``./``.
``result_separator``:
The character that separates results. Default: ``\\n``.
Example
=======
The example engine below can be used to find files with a specific name in the
configured working directory:
.. code:: yaml
- name: find
engine: command
command: ['find', '.', '-name', '{{QUERY}}']
query_type: path
shortcut: fnd
delimiter:
chars: ' '
keys: ['line']
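
The ``parse_regex`` option can be used as an alternative to ``delimiter``.  A
minimal sketch (the command and the regular expressions below are illustrative
only; each key of ``parse_regex`` becomes a key of the key/value result):

.. code:: yaml

  - name: disk usage
    engine: command
    command: ['du', '-sh', '{{QUERY}}']
    query_type: path
    shortcut: du
    parse_regex:
      size: '^[0-9,.]+[KMGT]?'
      path: '[^\s]+$'
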
Implementations
===============
"""
import re
from os.path import expanduser, isabs, realpath, commonprefix
from shlex import split as shlex_split
from subprocess import Popen, PIPE
from threading import Thread
from searx import logger
from searx.result_types import EngineResults
engine_type = 'offline'
paging = True
command = []
delimiter = {}
parse_regex = {}
query_type = ''
query_enum = []
environment_variables = {}
working_dir = realpath('.')
result_separator = '\n'
timeout = 4.0
_command_logger = logger.getChild('command')
_compiled_parse_regex = {}
def init(engine_settings):
check_parsing_options(engine_settings)
if 'command' not in engine_settings:
raise ValueError('engine command : missing configuration key: command')
global command, working_dir, delimiter, parse_regex, environment_variables # pylint: disable=global-statement
command = engine_settings['command']
if 'working_dir' in engine_settings:
working_dir = engine_settings['working_dir']
if not isabs(engine_settings['working_dir']):
working_dir = realpath(working_dir)
if 'parse_regex' in engine_settings:
parse_regex = engine_settings['parse_regex']
for result_key, regex in parse_regex.items():
_compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)
if 'delimiter' in engine_settings:
delimiter = engine_settings['delimiter']
if 'environment_variables' in engine_settings:
environment_variables = engine_settings['environment_variables']
def search(query, params) -> EngineResults:
res = EngineResults()
cmd = _get_command_to_run(query)
if not cmd:
return res
reader_thread = Thread(target=_get_results_from_process, args=(res, cmd, params['pageno']))
reader_thread.start()
reader_thread.join(timeout=timeout)
return res
def _get_command_to_run(query):
params = shlex_split(query)
__check_query_params(params)
cmd = []
for c in command:
if c == '{{QUERY}}':
cmd.extend(params)
else:
cmd.append(c)
return cmd
def _get_results_from_process(res: EngineResults, cmd, pageno):
leftover = ''
count = 0
start, end = __get_results_limits(pageno)
with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
line = process.stdout.readline()
while line:
buf = leftover + line.decode('utf-8')
raw_results = buf.split(result_separator)
if raw_results[-1]:
leftover = raw_results[-1]
raw_results = raw_results[:-1]
for raw_result in raw_results:
result = __parse_single_result(raw_result)
if result is None:
                    _command_logger.debug('skipped result: %s', raw_result)
continue
if start <= count and count <= end: # pylint: disable=chained-comparison
res.add(res.types.KeyValue(kvmap=result))
count += 1
if end < count:
return res
line = process.stdout.readline()
return_code = process.wait(timeout=timeout)
if return_code != 0:
raise RuntimeError('non-zero return code when running command', cmd, return_code)
return None
def __get_results_limits(pageno):
start = (pageno - 1) * 10
end = start + 9
return start, end
def __check_query_params(params):
if not query_type:
return
if query_type == 'path':
query_path = params[-1]
query_path = expanduser(query_path)
if commonprefix([realpath(query_path), working_dir]) != working_dir:
raise ValueError('requested path is outside of configured working directory')
elif query_type == 'enum' and len(query_enum) > 0:
for param in params:
if param not in query_enum:
raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
def check_parsing_options(engine_settings):
"""Checks if delimiter based parsing or regex parsing is configured correctly"""
if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
raise ValueError('failed to init settings for parsing lines: too many settings')
if 'delimiter' in engine_settings:
if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
raise ValueError
def __parse_single_result(raw_result):
"""Parses command line output based on configuration"""
result = {}
if delimiter:
elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
if len(elements) != len(delimiter['keys']):
return {}
for i in range(len(elements)): # pylint: disable=consider-using-enumerate
result[delimiter['keys'][i]] = elements[i]
if parse_regex:
for result_key, regex in _compiled_parse_regex.items():
found = regex.search(raw_result)
if not found:
return {}
result[result_key] = raw_result[found.start() : found.end()]
return result

151
searx/engines/core.py Normal file
View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world's scholarly literature, collecting and indexing
research from repositories and journals.
.. _CORE: https://core.ac.uk/about
.. _core engine config:
Configuration
=============
The engine has the following additional settings:
- :py:obj:`api_key`
.. code:: yaml
- name: core.ac.uk
engine: core
categories: science
shortcut: cor
api_key: "..."
timeout: 5
Implementations
===============
"""
# pylint: disable=too-many-branches
from datetime import datetime
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://core.ac.uk',
"wikidata_id": 'Q22661180',
"official_api_documentation": 'https://api.core.ac.uk/docs/v3',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
api_key = 'unset'
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
base_url = 'https://api.core.ac.uk/v3/search/works/'
def request(query, params):
if api_key == 'unset':
raise SearxEngineAPIException('missing CORE API key')
# API v3 uses different parameters
search_params = {
'q': query,
'offset': (params['pageno'] - 1) * nb_per_page,
'limit': nb_per_page,
'sort': 'relevance',
}
params['url'] = base_url + '?' + urlencode(search_params)
params['headers'] = {'Authorization': f'Bearer {api_key}'}
return params
def response(resp):
results = []
json_data = resp.json()
for result in json_data.get('results', []):
# Get title
if not result.get('title'):
continue
        # Get URL - try the DOI first, then the CORE id, then fulltext links
        url = None
        doi = result.get('doi')
        if doi:
            url = f'https://doi.org/{doi}'
        elif result.get('id'):
            url = 'https://core.ac.uk/works/' + str(result['id'])
        elif result.get('downloadUrl'):
            url = result['downloadUrl']
        elif result.get('sourceFulltextUrls'):
            url = result['sourceFulltextUrls']
        else:
            continue
# Published date
published_date = None
raw_date = result.get('publishedDate') or result.get('depositedDate')
if raw_date:
try:
                published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
except (ValueError, AttributeError):
pass
# Handle journals
journals = []
if result.get('journals'):
journals = [j.get('title') for j in result['journals'] if j.get('title')]
        # Handle publisher (some entries wrap the name in single quotes)
        publisher = (result.get('publisher') or '').strip("'")
# Handle authors
authors = set()
for i in result.get('authors', []):
name = i.get("name")
if name:
authors.add(name)
results.append(
{
'template': 'paper.html',
'title': result.get('title'),
'url': url,
'content': result.get('fullText', '') or '',
# 'comments': '',
'tags': result.get('fieldOfStudy', []),
'publishedDate': published_date,
'type': result.get('documentType', '') or '',
'authors': authors,
'editor': ', '.join(result.get('contributors', [])),
'publisher': publisher,
'journal': ', '.join(journals),
'doi': result.get('doi'),
# 'issn' : ''
# 'isbn' : ''
'pdf_url': result.get('downloadUrl', {}) or result.get("sourceFulltextUrls", {}),
}
)
return results

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cppreference
"""
from lxml import html
from searx.utils import eval_xpath
about = {
"website": "https://en.cppreference.com/",
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['it']
url = 'https://en.cppreference.com/'
search_url = url + 'mwiki/index.php?title=Special%3ASearch&search={query}'
def request(query, params):
params['url'] = search_url.format(query=query)
    return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, '//div[contains(@class, "mw-search-result-heading")]'):
results.append(
{
'url': url + eval_xpath(result, './/a/@href')[0],
'title': eval_xpath(result, './/a/text()')[0],
}
)
return results

70
searx/engines/crates.py Normal file
View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cargo search on crates.io"""
from collections import OrderedDict
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": "https://crates.io/",
"wikidata_id": None,
"official_api_documentation": "https://crates.io/data-access",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ["it", "packages", "cargo"]
# engine dependent config
paging = True
page_size = 10
search_url = "https://crates.io/api/v1/crates"
linked_terms = OrderedDict(
[
("homepage", "Project homepage"),
("documentation", "Documentation"),
("repository", "Source code"),
]
)
def request(query: str, params):
args = urlencode({"page": params["pageno"], "q": query, "per_page": page_size})
params["url"] = f"{search_url}?{args}"
return params
def response(resp):
results = []
for package in resp.json()["crates"]:
published_date = package.get("updated_at")
published_date = parser.parse(published_date)
links = {}
for k, v in linked_terms.items():
l = package.get(k)
if l:
links[v] = l
results.append(
{
"template": "packages.html",
"url": f'https://crates.io/crates/{package["name"]}',
"title": package["name"],
"package_name": package["name"],
"tags": package["keywords"],
"content": package["description"],
"version": package["newest_version"] or package["max_version"] or package["max_stable_version"],
"publishedDate": published_date,
"links": links,
}
)
return results

63
searx/engines/crossref.py Normal file
View File

@@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CrossRef"""
from urllib.parse import urlencode
from datetime import datetime
about = {
"website": "https://www.crossref.org/",
"wikidata_id": "Q5188229",
"official_api_documentation": "https://api.crossref.org",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.crossref.org/works"
def request(query, params):
params["url"] = search_url + "?" + urlencode({"query": query, "offset": 20 * (params["pageno"] - 1)})
return params
def response(resp):
results = []
for record in resp.json()["message"]["items"]:
if record["type"] == "component":
# These seem to be files published along with papers. Not something you'd search for
continue
result = {
"template": "paper.html",
"content": record.get("abstract", ""),
"doi": record.get("DOI"),
"pages": record.get("page"),
"publisher": record.get("publisher"),
"tags": record.get("subject"),
"type": record.get("type"),
"url": record.get("URL"),
"volume": record.get("volume"),
}
if record["type"] == "book-chapter":
result["title"] = record["container-title"][0]
if record["title"][0].lower().strip() != result["title"].lower().strip():
result["title"] += f" ({record['title'][0]})"
else:
result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
result["journal"] = record.get("container-title", [None])[0] if "title" in record else None
if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
result["url"] = record["resource"]["primary"]["URL"]
if "published" in record and "date-parts" in record["published"]:
result["publishedDate"] = datetime(*(record["published"]["date-parts"][0] + [1, 1][:3]))
result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]
        # Not all the links are PDFs, even if the URL ends with ".pdf"
# result["pdf_url"] = record.get("link", [{"URL": None}])[0]["URL"]
results.append(result)
return results

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Currency convert (DuckDuckGo)
"""
import json
from searx.result_types import EngineResults
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
"require_api_key": False,
"results": 'JSONP',
"description": "Service from DuckDuckGo.",
}
engine_type = 'online_currency'
categories = []
base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
https_support = True
def request(_query, params):
params['url'] = base_url.format(params['from'], params['to'])
return params
def response(resp) -> EngineResults:
res = EngineResults()
# remove first and last lines to get only json
json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]
try:
conversion_rate = float(json.loads(json_resp)["to"][0]["mid"])
except IndexError:
return res
answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
resp.search_params['amount'],
resp.search_params['from'],
resp.search_params['amount'] * conversion_rate,
resp.search_params['to'],
conversion_rate,
resp.search_params['from_name'],
resp.search_params['to_name'],
)
url = f"https://duckduckgo.com/?q={resp.search_params['from']}+to+{resp.search_params['to']}"
res.add(res.types.Answer(answer=answer, url=url))
return res

View File

@@ -0,0 +1,251 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
"""
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import time
import babel
from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762
from searx.utils import html_to_text
from searx.exceptions import SearxEngineAPIException
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.dailymotion.com',
"wikidata_id": 'Q769222',
"official_api_documentation": 'https://www.dailymotion.com/developer',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
family_filter_map = {
2: 'true',
1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
"""`Fields selection`_, by default, a few fields are returned. To request more
specific fields, the ``fields`` parameter is used with the list of fields
SearXNG needs in the response to build a video result list.
"""
search_url = 'https://api.dailymotion.com/videos?'
"""URL to retrieve a list of videos.
- `REST GET`_
- `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
"""URL template to embed video in SearXNG's result list."""
def request(query, params):
if not query:
return False
eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore
eng_lang = traits.get_language(params['searxng_locale'], 'en')
args = {
'search': query,
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
}
args.update(safesearch_params.get(params['safesearch'], {}))
    # Don't add localization and country arguments if the user selected only a
    # language (:de, :en, ..) without a region
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(args)
params['url'] = search_url + query_str
return params
# get response from search-request
def response(resp):
results = []
search_res = resp.json()
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results
for res in search_res.get('list', []):
title = res['title']
url = res['url']
content = html_to_text(res['description'])
if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None)
length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://")
item = {
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'length': length,
'thumbnail': thumbnail,
}
        # HINT: no matter what the value is, without an API token videos can't
        # be shown embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch locales & languages from dailymotion.
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *languages* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = get('https://api.dailymotion.com/locales')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = get('https://api.dailymotion.com/languages')
if not resp.ok: # type: ignore
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']: # type: ignore
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag

52
searx/engines/deepl.py Normal file
View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Deepl translation engine"""
from searx.result_types import EngineResults
about = {
"website": 'https://deepl.com',
"wikidata_id": 'Q43968444',
"official_api_documentation": 'https://www.deepl.com/docs-api',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
url = 'https://api-free.deepl.com/v2/translate'
api_key = None
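# A minimal settings.yml sketch for enabling this engine; the shortcut and the
# key below are placeholders (``url`` above points at DeepL's free-tier
# endpoint):
#
#   - name: deepl
#     engine: deepl
#     shortcut: dpl
#     api_key: 'YOUR_DEEPL_API_KEY'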
def request(_query, params):
'''pre-request callback
params<dict>:
- ``method`` : POST/GET
- ``headers``: {}
- ``data``: {} # if method == POST
- ``url``: ''
- ``category``: 'search category'
- ``pageno``: 1 # number of the requested page
'''
params['url'] = url
params['method'] = 'POST'
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
return params
def response(resp) -> EngineResults:
res = EngineResults()
data = resp.json()
if not data.get('translations'):
return res
translations = [res.types.Translations.Item(text=t['text']) for t in data['translations']]
res.add(res.types.Translations(translations=translations))
return res

61
searx/engines/deezer.py Normal file
View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Deezer (Music)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://deezer.com',
"wikidata_id": 'Q602243',
"official_api_documentation": 'https://developers.deezer.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music']
paging = True
# search-url
url = 'https://api.deezer.com/'
search_url = url + 'search?{query}&index={offset}'
iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}"
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 25
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
return params
# get response from search-request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get('data', []):
if result['type'] == 'track':
title = result['title']
url = result['link'] # pylint: disable=redefined-outer-name
if url.startswith('http://'):
url = 'https' + url[4:]
content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title'])
# append result
results.append(
{'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content}
)
# return results
return results

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Within this module we implement a *demo offline engine*. Do not look to
close to the implementation, its just a simple example. To get in use of this
*demo* engine add the following entry to your engines list in ``settings.yml``:
.. code:: yaml
- name: my offline engine
engine: demo_offline
shortcut: demo
disabled: false
"""
import json
from searx.result_types import EngineResults
from searx.enginelib import EngineCache
engine_type = 'offline'
categories = ['general']
disabled = True
timeout = 2.0
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_offline_engine: str = ""
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
def init(engine_settings):
"""Initialization of the (offline) engine. The origin of this demo engine is a
simple json string which is loaded in this example while the engine is
initialized."""
global _my_offline_engine, CACHE # pylint: disable=global-statement
CACHE = EngineCache(engine_settings["name"]) # type:ignore
_my_offline_engine = (
'[ {"value": "%s"}'
', {"value":"first item"}'
', {"value":"second item"}'
', {"value":"third item"}'
']' % engine_settings.get('name')
)
def search(query, request_params) -> EngineResults:
"""Query (offline) engine and return results. Assemble the list of results
    from your local engine. In this demo engine we ignore the 'query' term;
    usually you would pass the 'query' term to your local engine to filter the
    results.
"""
res = EngineResults()
count = CACHE.get("count", 0)
for row in json.loads(_my_offline_engine):
count += 1
kvmap = {
'query': query,
'language': request_params['searxng_locale'],
'value': row.get("value"),
}
res.add(
res.types.KeyValue(
caption=f"Demo Offline Engine Result #{count}",
key_title="Name",
value_title="Value",
kvmap=kvmap,
)
)
res.add(res.types.LegacyResult(number_of_results=count))
# cache counter value for 20sec
CACHE.set("count", count, expire=20)
return res

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Within this module we implement a *demo online engine*. Do not look to
close to the implementation, its just a simple example which queries `The Art
Institute of Chicago <https://www.artic.edu>`_
To get in use of this *demo* engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: my online engine
engine: demo_online
shortcut: demo
disabled: false
"""
from json import loads
from urllib.parse import urlencode
from searx.result_types import EngineResults
engine_type = 'online'
send_accept_language_header = True
disabled = True
timeout = 2.0
categories = ['images']
paging = True
page_size = 20
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
image_api = 'https://www.artic.edu/iiif/2/'
about = {
"website": 'https://www.artic.edu',
"wikidata_id": 'Q239303',
"official_api_documentation": 'http://api.artic.edu/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# if there is a need for globals, use a leading underline
_my_online_engine = None
def init(engine_settings):
"""Initialization of the (online) engine. If no initialization is needed, drop
this init function.
"""
global _my_online_engine # pylint: disable=global-statement
_my_online_engine = engine_settings.get('name')
def request(query, params):
"""Build up the ``params`` for the online request. In this example we build a
URL to fetch images from `artic.edu <https://artic.edu>`__
"""
args = urlencode(
{
'q': query,
'page': params['pageno'],
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
'limit': page_size,
}
)
params['url'] = search_api + args
return params
def response(resp) -> EngineResults:
"""Parse out the result items from the response. In this example we parse the
    response from `api.artic.edu <https://artic.edu>`__ and collect the image
    results.
"""
res = EngineResults()
json_data = loads(resp.text)
res.add(
res.types.Answer(
answer="this is a dummy answer ..",
url="https://example.org",
)
)
for result in json_data['data']:
if not result['image_id']:
continue
res.append(
{
'url': 'https://artic.edu/artworks/%(id)s' % result,
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
'content': "%(medium_display)s // %(dimensions)s" % result,
'author': ', '.join(result['artist_titles']),
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
'template': 'images.html',
}
)
return res

67
searx/engines/destatis.py Normal file
View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""DeStatis
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.destatis.de',
'official_api_documentation': 'https://destatis.api.bund.dev/',
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
categories = []
paging = True
base_url = "https://www.destatis.de"
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"
# pylint: disable-next=line-too-long
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
url_xpath = './/a/@href'
title_xpath = './/a/text()'
date_xpath = './/a/span[contains(@class, "c-result__date")]'
content_xpath = './/div[contains(@class, "column")]/p/text()'
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'
def request(query, params):
args = {
'templateQueryString': query,
'gtp': f"474_list%3D{params['pageno']}",
}
params['url'] = f"{search_url}?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# filter out suggested results on further page because they're the same on each page
extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
res_xpath = results_xpath.format(extra=extra_xpath)
for result in eval_xpath_list(dom, res_xpath):
doctype = extract_text(eval_xpath(result, doctype_xpath))
date = extract_text(eval_xpath(result, date_xpath))
metadata = [meta for meta in (doctype, date) if meta != ""]
results.append(
{
'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': extract_text(eval_xpath(result, content_xpath)),
'metadata': ', '.join(metadata),
}
)
return results

View File

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Deviantart (Images)
"""
import urllib.parse
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
# about
about = {
"website": 'https://www.deviantart.com/',
"wikidata_id": 'Q46523',
"official_api_documentation": 'https://www.deviantart.com/developers/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
# search-url
base_url = 'https://www.deviantart.com'
results_xpath = '//div[@class="_2pZkk"]/div/div/a'
url_xpath = './@href'
thumbnail_src_xpath = './div/img/@src'
img_src_xpath = './div/img/@srcset'
title_xpath = './@aria-label'
premium_xpath = '../div/div/div/text()'
premium_keytext = 'Watch the artist to view this deviation'
cursor_xpath = '(//a[@class="_1OGeq"]/@href)[last()]'
def request(query, params):
# https://www.deviantart.com/search?q=foo
nextpage_url = params['engine_data'].get('nextpage')
# don't use nextpage when user selected to jump back to page 1
if params['pageno'] > 1 and nextpage_url is not None:
params['url'] = nextpage_url
else:
params['url'] = f"{base_url}/search?{urllib.parse.urlencode({'q': query})}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
# skip images that are blurred
_text = extract_text(eval_xpath(result, premium_xpath))
if _text and premium_keytext in _text:
continue
img_src = extract_text(eval_xpath(result, img_src_xpath))
if img_src:
img_src = img_src.split(' ')[0]
parsed_url = urllib.parse.urlparse(img_src)
img_src = parsed_url._replace(path=parsed_url.path.split('/v1')[0]).geturl()
results.append(
{
'template': 'images.html',
'url': extract_text(eval_xpath(result, url_xpath)),
'img_src': img_src,
'thumbnail_src': extract_text(eval_xpath(result, thumbnail_src_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
}
)
nextpage_url = extract_text(eval_xpath(dom, cursor_xpath))
if nextpage_url:
results.append(
{
'engine_data': nextpage_url.replace("http://", "https://"),
'key': 'nextpage',
}
)
return results

105
searx/engines/dictzone.py Normal file
View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dictzone
"""
import urllib.parse
from lxml import html
from searx.utils import eval_xpath, extract_text
from searx.result_types import EngineResults
from searx.network import get as http_get # https://github.com/searxng/searxng/issues/762
# about
about = {
"website": 'https://dictzone.com/',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
base_url = "https://dictzone.com"
weight = 100
https_support = True
def request(query, params): # pylint: disable=unused-argument
from_lang = params["from_lang"][2] # "english"
to_lang = params["to_lang"][2] # "german"
query = params["query"]
params["url"] = f"{base_url}/{from_lang}-{to_lang}-dictionary/{urllib.parse.quote_plus(query)}"
return params
def _clean_up_node(node):
for x in ["./i", "./span", "./button"]:
for n in node.xpath(x):
n.getparent().remove(n)
def response(resp) -> EngineResults:
results = EngineResults()
item_list = []
if not resp.ok:
return results
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, ".//table[@id='r']//tr"):
        # each row is a Translations.Item
td_list = result.xpath("./td")
if len(td_list) != 2:
# ignore header columns "tr/th"
continue
col_from, col_to = td_list
_clean_up_node(col_from)
text = f"{extract_text(col_from)}"
synonyms = []
p_list = col_to.xpath(".//p")
for i, p_item in enumerate(p_list):
            smpl: str = extract_text(p_item.xpath("./i[@class='smpl']"))  # type: ignore
_clean_up_node(p_item)
p_text: str = extract_text(p_item) # type: ignore
if smpl:
p_text += " // " + smpl
if i == 0:
text += f" : {p_text}"
continue
synonyms.append(p_text)
item = results.types.Translations.Item(text=text, synonyms=synonyms)
item_list.append(item)
# the "autotranslate" of dictzone is loaded by the JS from URL:
# https://dictzone.com/trans/hello%20world/en_de
from_lang = resp.search_params["from_lang"][1] # "en"
to_lang = resp.search_params["to_lang"][1] # "de"
query = resp.search_params["query"]
# works only sometimes?
autotranslate = http_get(f"{base_url}/trans/{query}/{from_lang}_{to_lang}", timeout=1.0)
if autotranslate.ok and autotranslate.text:
item_list.insert(0, results.types.Translations.Item(text=autotranslate.text))
if item_list:
results.add(results.types.Translations(translations=item_list, url=resp.search_params["url"]))
return results

64
searx/engines/digbt.py Normal file
View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DigBT (Videos, Music, Files)
"""
from urllib.parse import urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://digbt.org',
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['videos', 'music', 'files']
paging = True
URL = 'https://digbt.org'
SEARCH_URL = URL + '/search/{query}-time-{pageno}'
FILESIZE = 3
FILESIZE_MULTIPLIER = 4
def request(query, params):
params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
return params
def response(resp):
dom = html.fromstring(resp.text)
search_res = dom.xpath('.//td[@class="x-item"]')
if not search_res:
return []
results = []
for result in search_res:
url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
title = extract_text(result.xpath('.//a[@title]'))
content = extract_text(result.xpath('.//div[@class="files"]'))
files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
filesize = f"{files_data[FILESIZE]} {files_data[FILESIZE_MULTIPLIER]}"
magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'filesize': filesize,
'magnetlink': magnetlink,
'seed': 'N/A',
'leech': 'N/A',
'template': 'torrent.html',
}
)
return results

181
searx/engines/discourse.py Normal file
View File

@@ -0,0 +1,181 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
- `builtwith.com Discourse <https://trends.builtwith.com/websitelist/Discourse>`_
Discourse is an open source Internet forum system. To search in a forum this
engine offers some additional settings:
- :py:obj:`base_url`
- :py:obj:`api_order`
- :py:obj:`search_endpoint`
- :py:obj:`show_avatar`
- :py:obj:`api_key`
- :py:obj:`api_username`
Example
=======
To search in your favorite Discourse forum, add a configuration like shown here
for the ``paddling.com`` forum:
.. code:: yaml
- name: paddling
engine: discourse
shortcut: paddle
base_url: 'https://forums.paddling.com/'
api_order: views
categories: ['social media', 'sports']
show_avatar: true
If the forum is private, you need to add an API key and username for the search:
.. code:: yaml
- name: paddling
engine: discourse
shortcut: paddle
base_url: 'https://forums.paddling.com/'
api_order: views
categories: ['social media', 'sports']
show_avatar: true
api_key: '<KEY>'
api_username: 'system'
Implementations
===============
"""
from urllib.parse import urlencode
from datetime import datetime, timedelta
import html
from dateutil import parser
from flask_babel import gettext
about = {
"website": "https://discourse.org/",
"wikidata_id": "Q15054354",
"official_api_documentation": "https://docs.discourse.org/",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
base_url: str = None # type: ignore
"""URL of the Discourse forum."""
search_endpoint = '/search.json'
"""URL path of the `search endpoint`_.
.. _search endpoint: https://docs.discourse.org/#tag/Search
"""
api_order = 'likes'
"""Order method, valid values are: ``latest``, ``likes``, ``views``, ``latest_topic``"""
show_avatar = False
"""Show avatar of the user who send the post."""
api_key = ''
"""API key of the Discourse forum."""
api_username = ''
"""API username of the Discourse forum."""
paging = True
time_range_support = True
AGO_TIMEDELTA = {
'day': timedelta(days=1),
'week': timedelta(days=7),
'month': timedelta(days=31),
'year': timedelta(days=365),
}
def request(query, params):
if len(query) <= 2:
return None
q = [query, f'order:{api_order}']
time_range = params.get('time_range')
if time_range:
after_date = datetime.now() - AGO_TIMEDELTA[time_range]
q.append('after:' + after_date.strftime('%Y-%m-%d'))
args = {
'q': ' '.join(q),
'page': params['pageno'],
}
params['url'] = f'{base_url}{search_endpoint}?{urlencode(args)}'
params['headers'] = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
}
if api_key != '':
params['headers']['Api-Key'] = api_key
if api_username != '':
params['headers']['Api-Username'] = api_username
return params
def response(resp):
results = []
json_data = resp.json()
    if 'topics' not in json_data or 'posts' not in json_data:
return []
topics = {}
for item in json_data['topics']:
topics[item['id']] = item
for post in json_data['posts']:
result = topics.get(post['topic_id'], {})
url = f"{base_url}/p/{post['id']}"
status = gettext("closed") if result.get('closed', '') else gettext("open")
comments = result.get('posts_count', 0)
publishedDate = parser.parse(result['created_at'])
metadata = []
metadata.append('@' + post.get('username', ''))
if int(comments) > 1:
metadata.append(f'{gettext("comments")}: {comments}')
if result.get('has_accepted_answer'):
metadata.append(gettext("answered"))
elif int(comments) > 1:
metadata.append(status)
result = {
'url': url,
'title': html.unescape(result['title']),
'content': html.unescape(post.get('blurb', '')),
'metadata': ' | '.join(metadata),
'publishedDate': publishedDate,
'upstream': {'topics': result},
}
avatar = post.get('avatar_template', '').replace('{size}', '96')
if show_avatar and avatar:
result['thumbnail'] = base_url + avatar
results.append(result)
results.append({'number_of_results': len(json_data['topics'])})
return results

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Docker Hub (IT)
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": 'https://hub.docker.com',
"wikidata_id": 'Q100769064',
"official_api_documentation": 'https://docs.docker.com/registry/spec/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['it', 'packages'] # optional
paging = True
base_url = "https://hub.docker.com"
page_size = 10
def request(query, params):
args = {
"query": query,
"from": page_size * (params['pageno'] - 1),
"size": page_size,
}
params['url'] = f"{base_url}/api/search/v3/catalog/search?{urlencode(args)}"
return params
def response(resp):
'''post-response callback
resp: requests response object
'''
results = []
json_resp = resp.json()
for item in json_resp.get("results", []):
image_source = item.get("source")
is_official = image_source in ["store", "official"]
popularity_infos = [f"{item.get('star_count', 0)} stars"]
architectures = []
for rate_plan in item.get("rate_plans", []):
pull_count = rate_plan.get("repositories", [{}])[0].get("pull_count")
if pull_count:
popularity_infos.insert(0, f"{pull_count} pulls")
architectures.extend(arch['name'] for arch in rate_plan.get("architectures", []) if arch['name'])
result = {
'template': 'packages.html',
'url': base_url + ("/_/" if is_official else "/r/") + item.get("slug", ""),
'title': item.get("name"),
'content': item.get("short_description"),
'thumbnail': item["logo_url"].get("large") or item["logo_url"].get("small"),
'package_name': item.get("name"),
'maintainer': item["publisher"].get("name"),
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
'popularity': ', '.join(popularity_infos),
'tags': architectures,
}
results.append(result)
return results

87
searx/engines/doku.py Normal file
View File

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Doku Wiki
"""
from urllib.parse import urlencode
from urllib.parse import urljoin
from lxml.html import fromstring
from searx.utils import extract_text, eval_xpath
# about
about = {
"website": 'https://www.dokuwiki.org/',
"wikidata_id": 'Q851864',
"official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general'] # 'images', 'music', 'videos', 'files'
paging = False
number_of_results = 5
# search-url
# Doku is OpenSearch compatible
base_url = 'http://localhost:8090'
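# A minimal settings.yml sketch; the name, shortcut and base_url below are
# placeholders and have to point at your own DokuWiki installation:
#
#   - name: my dokuwiki
#     engine: doku
#     shortcut: dw
#     base_url: 'https://wiki.example.org'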
search_url = (
# fmt: off
'/?do=search'
'&{query}'
# fmt: on
)
# '&startRecord={offset}'
# '&maximumRecords={limit}'
# do search-request
def request(query, params):
params['url'] = base_url + search_url.format(query=urlencode({'id': query}))
return params
# get response from search-request
def response(resp):
results = []
doc = fromstring(resp.text)
# parse results
# Quickhits
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try:
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except: # pylint: disable=bare-except
continue
if not res_url:
continue
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result
results.append({'title': title, 'content': "", 'url': urljoin(base_url, res_url)})
# Search results
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try:
if r.tag == "dt":
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd":
content = extract_text(eval_xpath(r, '.'))
# append result
results.append({'title': title, 'content': content, 'url': urljoin(base_url, res_url)})
except: # pylint: disable=bare-except
continue
# return results
return results

496
searx/engines/duckduckgo.py Normal file
View File

@@ -0,0 +1,496 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo WEB
~~~~~~~~~~~~~~
"""
from __future__ import annotations
import json
import re
import typing
from urllib.parse import quote_plus
import babel
import lxml.html
from searx import (
locales,
external_bang,
)
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
extr,
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.enginelib.traits import EngineTraits
from searx.enginelib import EngineCache
from searx.exceptions import SearxEngineCaptchaException
from searx.result_types import EngineResults
if typing.TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's preferred language from the HTTP
``Accept-Language``. Optionally the user can select a region filter (but not a
language).
"""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
url = "https://html.duckduckgo.com/html/"
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
_CACHE: EngineCache = None # type: ignore
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
def get_cache():
global _CACHE # pylint: disable=global-statement
if _CACHE is None:
_CACHE = EngineCache("duckduckgo") # type:ignore
return _CACHE
def get_vqd(query: str, region: str, force_request: bool = False) -> str:
"""Returns the ``vqd`` that fits to the *query*.
:param query: The query term
:param region: DDG's region code
:param force_request: force a request to get a vqd value from DDG
    TL;DR: the ``vqd`` value is needed to pass DDG's bot protection and is used
    by all requests to DDG:
- DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data)
- DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
- DuckDuckGo Images: ``https://duckduckgo.com/i.js??q=...&vqd=...``
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...``
- DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
(such as extremely long search terms that are often sent by bots), no ``vqd``
value can be determined.
If SearXNG cannot determine a ``vqd`` value, then no request should go out
to DDG.
.. attention::
A request with a wrong ``vqd`` value leads to DDG temporarily putting
SearXNG's IP on a block list.
Requests from IPs in this block list run into timeouts. Not sure, but it
       seems the block list is a sliding window: to get my IP off the block list
       I had to cool it down for 1h (send no requests from that IP to DDG).
"""
cache = get_cache()
key = cache.secret_hash(f"{query}//{region}")
value = cache.get(key=key)
if value is not None and not force_request:
logger.debug("vqd: re-use cached value: %s", value)
return value
logger.debug("vqd: request value from from duckduckgo.com")
resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
if resp.status_code == 200: # type: ignore
value = extr(resp.text, 'vqd="', '"') # type: ignore
if value:
logger.debug("vqd value from duckduckgo.com request: '%s'", value)
else:
logger.error("vqd: can't parse value from ddg response (return empty string)")
return ""
else:
logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
if value:
cache.set(key=key, value=value)
else:
logger.error("vqd value from duckduckgo.com ", resp.status_code)
return value
def set_vqd(query: str, region: str, value: str):
cache = get_cache()
key = cache.secret_hash(f"{query}//{region}")
cache.set(key=key, value=value, expire=3600)
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale.
DuckDuckGo defines its languages by region codes (see
:py:obj:`fetch_traits`).
To get region and language of a DDG service use:
.. code: python
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
    It might be confusing, but the ``l`` value of the cookie is what SearXNG calls
the *region*:
.. code:: python
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ and the *no Javascript*
page https://html.duckduckgo.com/html do not offer a language selection
to the user, only a region can be selected by the user (``eng_region``
from the example above). DDG-lite and *no Javascript* store the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
"""
return eng_traits.custom['lang_region'].get( # type: ignore
sxng_locale, eng_traits.get_language(sxng_locale, default)
)
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
    # use fr --> fr_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def quote_ddg_bangs(query):
# quote ddg bangs
query_parts = []
for val in re.split(r'(\s+)', query):
if not val.strip():
continue
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
val = f"'{val}'"
query_parts.append(val)
return ' '.join(query_parts)
def request(query, params):
query = quote_ddg_bangs(query)
if len(query) >= 500:
# DDG does not accept queries with more than 499 chars
params["url"] = None
return
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
# Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
# (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
# and breaking changes in the future.
#
# The params['data'] dictionary can have the following key parameters, in this order:
# - q (str): Search query string
# - b (str): Beginning parameter - empty string for first page requests
# - s (int): Search offset for pagination
# - nextParams (str): Continuation parameters from previous page response, typically empty
# - v (str): Typically 'l' for subsequent pages
# - o (str): Output format, typically 'json'
# - dc (int): Display count - value equal to offset (s) + 1
# - api (str): API endpoint identifier, typically 'd.js'
# - vqd (str): Validation query digest
# - kl (str): Keyboard language/region code (e.g., 'en-us')
# - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
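    #
    # For illustration only (values other than the offsets depend on the first
    # response, the region and the time range), a page-3 request would carry
    # form data roughly like:
    #   {'q': 'searxng', 's': 25, 'nextParams': '', 'v': 'l', 'o': 'json',
    #    'dc': 26, 'api': 'd.js', 'vqd': '<cached vqd>', 'kl': 'en-us', 'df': ''}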
params['data']['q'] = query
if params['pageno'] == 1:
params['data']['b'] = ""
elif params['pageno'] >= 2:
offset = 10 + (params['pageno'] - 2) * 15 # Page 2 = 10, Page 3+ = 10 + n*15
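# e.g. pageno=2 -> s=10 (dc=11), pageno=3 -> s=25 (dc=26), pageno=4 -> s=40 (dc=41)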
params['data']['s'] = offset
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
params['data']['o'] = form_data.get('o', 'json')
params['data']['dc'] = offset + 1
params['data']['api'] = form_data.get('api', 'd.js')
# vqd is required to request other pages after the first one
vqd = get_vqd(query, eng_region, force_request=False)
if vqd:
params['data']['vqd'] = vqd
else:
# Don't try to call follow up pages without a vqd value.
# DDG recognizes this as a request from a bot. This lowers the
# reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
params["url"] = None
return
if params['searxng_locale'].startswith("zh"):
# Some locales (at least China) do not have a "next page" button and DDG
# will return a HTTP/2 403 Forbidden for a request of such a page.
params["url"] = None
return
# Put empty kl in form data if language/region set to all
if eng_region == "wt-wt":
params['data']['kl'] = ""
else:
params['data']['kl'] = eng_region
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
params['cookies']['kl'] = eng_region
params['url'] = url
params['method'] = 'POST'
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['headers']['Referer'] = url
params['headers']['Sec-Fetch-Dest'] = "document"
params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
params['headers']['Sec-Fetch-Site'] = "same-origin"
params['headers']['Sec-Fetch-User'] = "?1"
logger.debug("param headers: %s", params['headers'])
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])
def is_ddg_captcha(dom):
"""In case of CAPTCHA ddg response its own *not a Robot* dialog and is not
redirected to a CAPTCHA page."""
return bool(eval_xpath(dom, "//form[@id='challenge-form']"))
def response(resp) -> EngineResults:
results = EngineResults()
if resp.status_code == 303:
return results
doc = lxml.html.fromstring(resp.text)
if is_ddg_captcha(doc):
# set suspend time to zero is OK --> ddg does not block the IP
raise SearxEngineCaptchaException(suspended_time=0, message=f"CAPTCHA ({resp.search_params['data'].get('kl')})")
form = eval_xpath(doc, '//input[@name="vqd"]/..')
if len(form):
# some locales (at least China) do not have a "next page" button
form = form[0]
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
set_vqd(
query=resp.search_params['data']['q'],
region=resp.search_params['data']['kl'],
value=str(form_vqd),
)
# just select "web-result" and ignore results of class "result--ad result--ad--small"
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
item = {}
title = eval_xpath(div_result, './/h2/a')
if not title:
# this is the "No results." item in the result list
continue
item["title"] = extract_text(title)
item["url"] = eval_xpath(div_result, './/h2/a/@href')[0]
item["content"] = extract_text(
eval_xpath_getindex(div_result, './/a[contains(@class, "result__snippet")]', 0, [])
)
results.append(item)
zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() # type: ignore
if zero_click and (
"Your IP address is" not in zero_click
and "Your user agent:" not in zero_click
and "URL Decoded:" not in zero_click
):
results.add(
results.types.Answer(
answer=zero_click,
url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0), # type: ignore
)
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
SearXNG's ``all`` locale maps to DuckDuckGo's "All Regions" (``wt-wt``).
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add a
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
Besides regions, DuckDuckGo also defines its languages by region codes. For
example, these are the English languages in DuckDuckGo:
- en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel
from searx.utils import js_variable_to_python
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u661.js to u.7669f071a13a7daa57cb / should be updated automatically?
resp = get('https://duckduckgo.com/dist/util/u.7669f071a13a7daa57cb.js')
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
js_code = extr(resp.text, 'regions:', ',snippetLengths') # type: ignore
regions = json.loads(js_code)
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
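# e.g. 'us-en' --> 'en_US', 'de-de' --> 'de_DE'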
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
js_code = extr(resp.text, 'languages:', ',regions') # type: ignore
languages = js_variable_to_python(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang


@@ -0,0 +1,264 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented, but from
reverse engineering we can see that some services (e.g. instant answers) are
still in use by the DDG search engine.
As far as we can tell, the *instant answers* API does not support languages, or
at least we could not find out how language support is supposed to work. It
seems that most of the features are based on English terms.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx.data import WIKIDATA_UNITS
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.result_types import EngineResults
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken, and the "Related website" text may contain some HTML.
The best solution seems to be to ignore these results.
"""
return text.startswith('http') and ' ' in text
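# Usage sketch (illustrative):
#
#   >>> is_broken_text('http://somewhere Related website')
#   True
#   >>> is_broken_text('Related website')
#   False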
def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
result = None
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
result = extract_text(a[0])
else:
result = text
if not is_broken_text(result):
return result
return None
def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
return params
def response(resp) -> EngineResults:
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = EngineResults()
search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
# * actor / musician / artist
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * company
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer:
answer_type = search_res.get('AnswerType')
logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
results.add(
results.types.Answer(
answer=html_to_text(answer),
url=search_res.get('AbstractURL', ''),
)
)
# add infobox
if 'Definition' in search_res:
content = content + search_res.get('Definition', '')
if 'Abstract' in search_res:
content = content + search_res.get('Abstract', '')
# image
image = search_res.get('Image')
image = None if image == '' else image
if image is not None and urlparse(image).netloc == '':
image = urljoin('https://duckduckgo.com', image)
# urls
# Official website, Wikipedia page
for ddg_result in search_res.get('Results', []):
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL')
text = ddg_result.get('Text')
if not is_broken_text(text):
suggestion = result_to_text(text, ddg_result.get('Result'))
if suggestion != heading and suggestion is not None:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
if suggestion != heading and suggestion is not None:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
results.append({'url': abstractURL, 'title': heading})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
# to merge with wikidata's infobox
if infobox_id:
infobox_id = replace_http_by_https(infobox_id)
# attributes
# some will be converted to urls
if 'Infobox' in search_res:
infobox = search_res.get('Infobox')
if 'content' in infobox:
osm_zoom = 17
coordinates = None
for info in infobox.get('content'):
data_type = info.get('data_type')
data_label = info.get('label')
data_value = info.get('value')
# Workaround: ddg may return a double quote
if data_value == '""':
continue
# Is it an external URL ?
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
# * netflix_id
external_url = get_external_url(data_type, data_value)
if external_url is not None:
urls.append({'title': data_label, 'url': external_url})
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
# ignore wiki_maps_trigger: reference to a javascript
# ignore google_play_artist_id: service shutdown
pass
elif data_type == 'string' and data_label == 'Website':
# There is already an URL for the website
pass
elif data_type == 'area':
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
elif data_type == 'coordinates':
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
# coordinate on Earth
# get the zoom information from the area
coordinates = info
else:
# coordinate NOT on Earth
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
elif data_type == 'string':
attributes.append({'label': data_label, 'value': data_value})
if coordinates:
data_label = coordinates.get('label')
data_value = coordinates.get('value')
latitude = data_value.get('latitude')
longitude = data_value.get('longitude')
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
else:
results.append(
{
'infobox': heading,
'id': infobox_id,
'content': content,
'img_src': image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics,
}
)
return results
def unit_to_str(unit):
for prefix in WIKIDATA_PREFIX:
if unit.startswith(prefix):
wikidata_entity = unit[len(prefix) :]
real_unit = WIKIDATA_UNITS.get(wikidata_entity)
if real_unit is None:
return unit
return real_unit['symbol']
return unit
def area_to_str(area):
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit'))
if unit is not None:
try:
amount = float(area.get('amount'))
return '{} {}'.format(amount, unit)
except ValueError:
pass
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
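# Usage sketch (illustrative; assumes WIKIDATA_UNITS resolves Q712226, the
# square kilometre, to the symbol 'km²'):
#
#   >>> area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   '20.99 km²'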


@@ -0,0 +1,149 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Extra (images, videos, news)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.utils import get_embeded_stream_url, html_to_text
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
}
# engine dependent config
categories = ['images', 'web']
ddg_category = 'images'
"""The category must be any of ``images``, ``videos`` and ``news``
"""
paging = True
safesearch = True
send_accept_language_header = True
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
def request(query, params):
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
# request needs a vqd argument
vqd = get_vqd(query, eng_region, force_request=True)
if not vqd:
# some search terms do not have results and therefore no vqd value
params['url'] = None
return params
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'f': ',,,,,',
'vqd': vqd,
}
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
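# e.g. pageno=2 -> s=100, pageno=3 -> s=200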
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
logger.debug("cookies: %s", params['cookies'])
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
# sending these two headers prevents rate limiting for the query
params['headers'] = {
'Referer': 'https://duckduckgo.com/',
'X-Requested-With': 'XMLHttpRequest',
}
return params
def _image_result(result):
return {
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': '',
'thumbnail_src': result['thumbnail'],
'img_src': result['image'],
'resolution': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
}
def _video_result(result):
return {
'template': 'videos.html',
'url': result['content'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['images'].get('small') or result['images'].get('medium'),
'iframe_src': get_embeded_stream_url(result['content']),
'source': result['provider'],
'length': result['duration'],
'metadata': result.get('uploader'),
}
def _news_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['excerpt']),
'source': result['source'],
'publishedDate': datetime.fromtimestamp(result['date']),
}
def response(resp):
results = []
res_json = resp.json()
for result in res_json['results']:
if ddg_category == 'images':
results.append(_image_result(result))
elif ddg_category == 'videos':
results.append(_video_result(result))
elif ddg_category == 'news':
results.append(_news_result(result))
else:
raise ValueError(f"Invalid duckduckgo category: {ddg_category}")
return results


@@ -0,0 +1,158 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads
from urllib.parse import quote
from dateutil import parser as date_parser
from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
send_accept_language_header = True
# engine dependent config
categories = ["weather"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
res = ""
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['conditionCode']}</b></td></tr>"
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{condition['temperature']}°C / {c_to_f(condition['temperature'])}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{condition['temperatureApparent']}°C / "
f"{c_to_f(condition['temperatureApparent'])}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['windDirection']}° — "
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
)
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} m</td>"
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
return res
def generate_day_table(day):
res = ""
res += (
f"<tr><td>{gettext('Min temp.')}</td><td>{day['temperatureMin']}°C / "
f"{c_to_f(day['temperatureMin'])}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Max temp.')}</td><td>{day['temperatureMax']}°C / "
f"{c_to_f(day['temperatureMax'])}°F</td></tr>"
)
res += f"<tr><td>{gettext('UV index')}</td><td>{day['maxUvIndex']}</td></tr>"
res += f"<tr><td>{gettext('Sunrise')}</td><td>{date_parser.parse(day['sunrise']).strftime('%H:%M')}</td></tr>"
res += f"<tr><td>{gettext('Sunset')}</td><td>{date_parser.parse(day['sunset']).strftime('%H:%M')}</td></tr>"
return res
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params
def c_to_f(temperature):
return "%.2f" % ((temperature * 1.8) + 32)
def response(resp):
results = []
if resp.text.strip() == "ddg_spice_forecast();":
return []
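# The spice endpoint wraps its JSON payload in a ddg_spice_forecast(...)
# JavaScript callback; the slice below cuts off the wrapper lines around the
# payload to recover the raw JSON.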
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
current = result["currentWeather"]
title = result['location']
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current)
infobox += "</tbody></table>"
last_date = None
for time in result['forecastHourly']['hours']:
current_time = date_parser.parse(time['forecastStart'])
if last_date != current_time.date():
if last_date is not None:
infobox += "</tbody></table>"
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
infobox += "<table><tbody>"
for day in result['forecastDaily']['days']:
if date_parser.parse(day['forecastStart']).date() == current_time.date():
infobox += generate_day_table(day)
infobox += "</tbody></table><table><tbody>"
last_date = current_time.date()
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
infobox += generate_condition_table(time)
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results

71
searx/engines/duden.py Normal file

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Duden
"""
import re
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror
# about
about = {
"website": 'https://www.duden.de',
"wikidata_id": 'Q73624591',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'de',
}
categories = ['dictionaries']
paging = True
# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
def request(query, params):
offset = params['pageno'] - 1
if offset == 0:
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
params['url'] = search_url_fmt.format(query=quote(query))
else:
params['url'] = search_url.format(offset=offset, query=quote(query))
# after the last page of results, spelling corrections are returned after a HTTP redirect
# whatever the page number is
params['soft_max_redirects'] = 1
params['raise_for_httperror'] = False
return params
def response(resp):
results = []
if resp.status_code == 404:
return results
raise_for_httperror(resp)
dom = html.fromstring(resp.text)
number_of_results_element = eval_xpath_getindex(
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
)
if number_of_results_element is not None:
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
results.append({'number_of_results': int(number_of_results_string)})
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
url = urljoin(base_url, url)
title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url, 'title': title, 'content': content})
return results


@@ -0,0 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Dummy Offline
"""
# about
about = {
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
def search(query, request_params): # pylint: disable=unused-argument
return [
{
'result': 'this is what you get',
}
]

24
searx/engines/dummy.py Normal file

@@ -0,0 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Dummy
"""
# about
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'empty array',
}
# do search-request
def request(query, params): # pylint: disable=unused-argument
return params
# get response from search-request
def response(resp): # pylint: disable=unused-argument
return []

77
searx/engines/ebay.py Normal file

@@ -0,0 +1,77 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ebay (shopping)
"""
from urllib.parse import quote
from lxml import html
from searx.engines.xpath import extract_text
# about
about = {
"website": 'https://www.ebay.com',
"wikidata_id": 'Q58024',
"official_api_documentation": 'https://developer.ebay.com/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['shopping']
paging = True
# Set base_url in settings.yml in order to
# have the desired local TLD.
base_url = None
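# Illustrative settings.yml snippet (shortcut and TLD are assumptions):
#
#   - name: ebay
#     engine: ebay
#     shortcut: eb
#     base_url: https://www.ebay.com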
search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}'
results_xpath = '//li[contains(@class, "s-item")]'
url_xpath = './/a[@class="s-item__link"]/@href'
title_xpath = './/h3[@class="s-item__title"]'
content_xpath = './/div[@span="SECONDARY_INFO"]'
price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()'
shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()'
source_country_xpath = './/span[contains(@class, "s-item__location")]/text()'
thumbnail_xpath = './/img[@class="s-item__image-img"]/@src'
def request(query, params):
params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno'])
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
results_dom = dom.xpath(results_xpath)
if not results_dom:
return []
for result_dom in results_dom:
url = extract_text(result_dom.xpath(url_xpath))
title = extract_text(result_dom.xpath(title_xpath))
content = extract_text(result_dom.xpath(content_xpath))
price = extract_text(result_dom.xpath(price_xpath))
shipping = extract_text(result_dom.xpath(shipping_xpath))
source_country = extract_text(result_dom.xpath(source_country_xpath))
thumbnail = extract_text(result_dom.xpath(thumbnail_xpath))
if title == "":
continue
results.append(
{
'url': url,
'title': title,
'content': content,
'price': price,
'shipping': shipping,
'source_country': source_country,
'thumbnail': thumbnail,
'template': 'products.html',
}
)
return results


@@ -0,0 +1,194 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""".. sidebar:: info
- :origin:`elasticsearch.py <searx/engines/elasticsearch.py>`
- `Elasticsearch <https://www.elastic.co/elasticsearch/>`_
- `Elasticsearch Guide
<https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html>`_
- `Install Elasticsearch
<https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html>`_
Elasticsearch_ supports numerous ways to query the data it is storing. At the
moment the engine supports the most popular search methods (``query_type``):
- ``match``,
- ``simple_query_string``,
- ``term`` and
- ``terms``.
If none of the methods fit your use case, you can select ``custom`` query type
and provide the JSON payload to submit to Elasticsearch in
``custom_query_json``.
Example
=======
The following is an example configuration for an Elasticsearch_ instance with
authentication configured to read from ``my-index`` index.
.. code:: yaml
- name: elasticsearch
shortcut: els
engine: elasticsearch
base_url: http://localhost:9200
username: elastic
password: changeme
index: my-index
query_type: match
# custom_query_json: '{ ... }'
enable_http: true
"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
categories = ['general']
paging = True
about = {
'website': 'https://www.elastic.co',
'wikidata_id': 'Q3050461',
'official_api_documentation': 'https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html',
'use_official_api': True,
'require_api_key': False,
'format': 'JSON',
}
base_url = 'http://localhost:9200'
username = ''
password = ''
index = ''
query_type = 'match'
custom_query_json = {}
show_metadata = False
page_size = 10
def init(engine_settings):
if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types:
raise ValueError('unsupported query type', engine_settings['query_type'])
if index == '':
raise ValueError('index cannot be empty')
def request(query, params):
if query_type not in _available_query_types:
return params
if username and password:
params['auth'] = (username, password)
args = {
'from': (params['pageno'] - 1) * page_size,
'size': page_size,
}
data = _available_query_types[query_type](query)
data.update(args)
params['url'] = f"{base_url}/{index}/_search"
params['method'] = 'GET'
params['data'] = dumps(data)
params['headers']['Content-Type'] = 'application/json'
return params
def _match_query(query):
"""
The standard for full text queries.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be "key:value"') from e
return {"query": {"match": {key: {'query': value}}}}
def _simple_query_string_query(query):
"""
Accepts query strings, but it is less strict than query_string
The field used can be specified in index.query.default_field in Elasticsearch.
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
"""
return {'query': {'simple_query_string': {'query': query}}}
def _term_query(query):
"""
Accepts one term and the name of the field.
searx format: "key:value" e.g. city:berlin
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
"""
try:
key, value = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value') from e
return {'query': {'term': {key: value}}}
def _terms_query(query):
"""
Accepts multiple terms and the name of the field.
searx format: "key:value1,value2" e.g. city:berlin,paris
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
"""
try:
key, values = query.split(':')
except Exception as e:
raise ValueError('query format must be key:value1,value2') from e
return {'query': {'terms': {key: values.split(',')}}}
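# e.g. _terms_query('city:berlin,paris') ->
#   {'query': {'terms': {'city': ['berlin', 'paris']}}}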
def _custom_query(query):
key, value = query.split(':')
custom_query = custom_query_json
for query_key, query_value in custom_query.items():
if query_key == '{{KEY}}':
custom_query[key] = custom_query.pop(query_key)
if query_value == '{{VALUE}}':
custom_query[query_key] = value
return custom_query
def response(resp: SXNG_Response) -> EngineResults:
res = EngineResults()
resp_json = loads(resp.text)
if 'error' in resp_json:
raise SearxEngineAPIException(resp_json["error"])
for result in resp_json["hits"]["hits"]:
kvmap = {key: str(value) if not key.startswith("_") else value for key, value in result["_source"].items()}
if show_metadata:
kvmap["metadata"] = {"index": result["_index"], "id": result["_id"], "score": result["_score"]}
res.add(res.types.KeyValue(kvmap=kvmap))
return res
_available_query_types = {
# Full text queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html
'match': _match_query,
'simple_query_string': _simple_query_string_query,
# Term-level queries
# https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html
'term': _term_query,
'terms': _terms_query,
# Query JSON defined by the instance administrator.
'custom': _custom_query,
}


@@ -0,0 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Emojipedia
Emojipedia is an emoji reference website which documents the meaning and
common usage of emoji characters in the Unicode Standard. It has been owned by
Zedge since 2021. Emojipedia is a voting member of The Unicode Consortium.[1]
[1] https://en.wikipedia.org/wiki/Emojipedia
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
extract_text,
)
about = {
"website": 'https://emojipedia.org',
"wikidata_id": 'Q22908129',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
base_url = 'https://emojipedia.org'
search_url = base_url + '/search?{query}'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[starts-with(@class, "EmojisList")]/a'):
url = base_url + result.attrib.get('href')
res = {'url': url, 'title': extract_text(result), 'content': ''}
results.append(res)
return results

54
searx/engines/fdroid.py Normal file

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
F-Droid (a repository of FOSS applications for Android)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://f-droid.org/',
"wikidata_id": 'Q1386210',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['files', 'apps']
paging = True
# search-url
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'
# do search-request
def request(query, params):
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
params['url'] = search_url.format(query=query)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
for app in dom.xpath('//a[@class="package-header"]'):
app_url = app.xpath('./@href')[0]
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
app_content = (
extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
+ ' - '
+ extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
)
thumbnail = app.xpath('./img[@class="package-icon"]/@src')[0]
results.append({'url': app_url, 'title': app_title, 'content': app_content, 'thumbnail': thumbnail})
return results


@@ -0,0 +1,54 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""FindThatMeme (Images)"""
from json import dumps
from datetime import datetime
from searx.utils import humanize_bytes
about = {
"website": 'https://findthatmeme.com',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
base_url = "https://findthatmeme.com/api/v1/search"
categories = ['images']
paging = True
def request(query, params):
start_index = (params["pageno"] - 1) * 50
data = {"search": query, "offset": start_index}
params["url"] = base_url
params["method"] = 'POST'
params['headers']['content-type'] = "application/json"
params['data'] = dumps(data)
return params
def response(resp):
search_res = resp.json()
results = []
for item in search_res:
img = 'https://s3.thehackerblog.com/findthatmeme/' + item['image_path']
thumb = 'https://s3.thehackerblog.com/findthatmeme/thumb/' + item.get('thumbnail', '')
date = datetime.strptime(item["updated_at"].split("T")[0], "%Y-%m-%d")
formatted_date = datetime.fromtimestamp(date.timestamp())
results.append(
{
'url': item['source_page_url'],
'title': item['source_site'],
'img_src': img if item['type'] == 'IMAGE' else thumb,
'filesize': humanize_bytes(item['meme_file_size']),
'publishedDate': formatted_date,
'template': 'images.html',
}
)
return results

95
searx/engines/flickr.py Normal file

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Flickr (Images)
More info on api-key : https://www.flickr.com/services/apps/create/
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
categories = ['images']
nb_per_page = 15
paging = True
api_key = None
url = (
'https://api.flickr.com/services/rest/?method=flickr.photos.search'
+ '&api_key={api_key}&{text}&sort=relevance'
+ '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z'
+ '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
)
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
paging = True
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def request(query, params):
params['url'] = url.format(
text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno']
)
return params
def response(resp):
results = []
search_results = loads(resp.text)
# return empty array if there are no results
if 'photos' not in search_results:
return []
if 'photo' not in search_results['photos']:
return []
photos = search_results['photos']['photo']
# parse results
for photo in photos:
if 'url_o' in photo:
img_src = photo['url_o']
elif 'url_z' in photo:
img_src = photo['url_z']
else:
continue
# thumbnail: prefer url_n and fall back to url_z (or the full image)
if 'url_n' in photo:
thumbnail_src = photo['url_n']
elif 'url_z' in photo:
thumbnail_src = photo['url_z']
else:
thumbnail_src = img_src
# append result
results.append(
{
'url': build_flickr_url(photo['owner'], photo['id']),
'title': photo['title'],
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'content': photo['description']['_content'],
'author': photo['ownername'],
'template': 'images.html',
}
)
# return results
return results


@@ -0,0 +1,142 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Flickr (Images)
"""
from typing import TYPE_CHECKING
import json
from time import time
import re
from urllib.parse import urlencode
from searx.utils import ecma_unescape, html_to_text
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://www.flickr.com',
"wikidata_id": 'Q103204',
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['images']
paging = True
time_range_support = True
safesearch = False
time_range_dict = {
'day': 60 * 60 * 24,
'week': 60 * 60 * 24 * 7,
'month': 60 * 60 * 24 * 7 * 4,
'year': 60 * 60 * 24 * 7 * 52,
}
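# Flickr size suffixes, roughly ordered from largest to smallest; the letters
# map (approximately) to longest-edge pixel sizes: o=original, k=2048, h=1600,
# b=1024, c=800, z=640, m=500, n=320, t=100, q=150 (square), s=75 (square).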
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')
search_url = 'https://www.flickr.com/search?{query}&page={page}'
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
def _get_time_range_url(time_range):
if time_range in time_range_dict:
return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
return ''
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url(
params['time_range']
)
return params
def response(resp): # pylint: disable=too-many-branches
results = []
matches = modelexport_re.search(resp.text)
if matches is None:
return results
match = matches.group(1)
model_export = json.loads(match)
if 'legend' not in model_export:
return results
legend = model_export['legend']
# handle empty page
if not legend or not legend[0]:
return results
for x, index in enumerate(legend):
if len(index) != 8:
logger.debug("skip legend enty %s : %s", x, index)
continue
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][
index[7]
]
author = ecma_unescape(photo.get('realname', ''))
source = ecma_unescape(photo.get('username', ''))
if source:
source += ' @ Flickr'
title = ecma_unescape(photo.get('title', ''))
content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the lowest format
size_data = None
for image_size in image_sizes:
if image_size in photo['sizes']['data']:
size_data = photo['sizes']['data'][image_size]['data']
break
if not size_data:
logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data'])))
continue
img_src = size_data['url']
resolution = f"{size_data['width']} x {size_data['height']}"
# thumbnail: prefer the smaller 'n' size and fall back to 'z' (or the full image)
if 'n' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['n']['data']['url']
elif 'z' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['z']['data']['url']
else:
thumbnail_src = img_src
if 'ownerNsid' not in photo:
# should not happen, disowned photo? Show it anyway
url = img_src
else:
url = build_flickr_url(photo['ownerNsid'], photo['id'])
result = {
'url': url,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'source': source,
'resolution': resolution,
'template': 'images.html',
}
result['author'] = author.encode(errors='ignore').decode()
result['source'] = source.encode(errors='ignore').decode()
result['title'] = title.encode(errors='ignore').decode()
result['content'] = content.encode(errors='ignore').decode()
results.append(result)
return results


@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Freesound (Sound)
"""
from json import loads
from urllib.parse import urlencode
from datetime import datetime
disabled = True
api_key = ""
# about
about = {
"website": "https://freesound.org",
"wikidata_id": "Q835703",
"official_api_documentation": "https://freesound.org/docs/api",
"use_official_api": True,
"require_api_key": True,
"results": "JSON",
}
# engine dependent config
paging = True
# search url
url = "https://freesound.org/apiv2/"
search_url = (
url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}"
)
# search request
def request(query, params):
params["url"] = search_url.format(
query=urlencode({"q": query}),
page=params["pageno"],
api_key=api_key,
)
return params
# get response from search request
def response(resp):
results = []
search_res = loads(resp.text)
# parse results
for result in search_res.get("results", []):
title = result["name"]
content = result["description"][:128]
publishedDate = datetime.fromisoformat(result["created"])
uri = result["download"]
# append result
results.append(
{
"url": result["url"],
"title": title,
"publishedDate": publishedDate,
"audio_src": uri,
"content": content,
}
)
return results

51
searx/engines/frinkiac.py Normal file

@@ -0,0 +1,51 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Frinkiac (Images)
"""
from json import loads
from urllib.parse import urlencode
# about
about = {
"website": 'https://frinkiac.com',
"wikidata_id": 'Q24882614',
"official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'},
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
BASE = 'https://frinkiac.com/'
SEARCH_URL = '{base}api/search?{query}'
RESULT_URL = '{base}?{query}'
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
def request(query, params):
params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
return params
def response(resp):
results = []
response_data = loads(resp.text)
for result in response_data:
episode = result['Episode']
timestamp = result['Timestamp']
results.append(
{
'template': 'images.html',
'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
'title': episode,
'content': '',
'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp),
}
)
return results

49
searx/engines/fyyd.py Normal file

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fyyd (podcasts)
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': 'https://fyyd.de',
'official_api_documentation': 'https://github.com/eazyliving/fyyd-api',
'use_official_api': True,
'require_api_key': False,
'results': 'JSON',
}
categories = []
paging = True
base_url = "https://api.fyyd.de"
page_size = 10
def request(query, params):
args = {
'term': query,
'count': page_size,
'page': params['pageno'] - 1,
}
params['url'] = f"{base_url}/0.2/search/podcast?{urlencode(args)}"
return params
def response(resp):
results = []
json_results = resp.json()['data']
for result in json_results:
results.append(
{
'url': result['htmlURL'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['smallImageURL'],
'publishedDate': datetime.strptime(result['status_since'], '%Y-%m-%d %H:%M:%S'),
'metadata': f"Rank: {result['rank']} || {result['episode_count']} episodes",
}
)
return results

97
searx/engines/geizhals.py Normal file

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Geizhals is a German website to compare the price of a product on the
most common German shopping sites and find the lowest price.
The sorting of the search results can be influenced by the following additions
to the search term:
``asc`` or ``price``
To sort by price in ascending order.
``desc``
To sort by price in descending order.
"""
import re
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://geizhals.de',
'wikidata_id': 'Q15977657',
'use_official_api': False,
'official_api_documentation': None,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
paging = True
categories = ['shopping']
base_url = "https://geizhals.de"
sort_order = 'relevance'
SORT_RE = re.compile(r"sort:(\w+)")
sort_order_map = {
'relevance': None,
'price': 'p',
'asc': 'p',
'desc': '-p',
}
def request(query, params):
sort = None
sort_order_path = SORT_RE.search(query)
if sort_order_path:
sort = sort_order_map.get(sort_order_path.group(1))
query = SORT_RE.sub("", query)
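# e.g. 'grafikkarte sort:asc' -> sort='p', remaining query 'grafikkarte '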
logger.debug(query)
args = {
'fs': query,
'pg': params['pageno'],
'toggle_all': 1, # load item specs
'sort': sort,
}
params['url'] = f"{base_url}/?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//article[contains(@class, 'listview__item')]"):
content = []
for spec in eval_xpath_list(result, ".//div[contains(@class, 'specs-grid__item')]"):
content.append(f"{extract_text(eval_xpath(spec, './dt'))}: {extract_text(eval_xpath(spec, './dd'))}")
metadata = [
extract_text(eval_xpath(result, ".//div[contains(@class, 'stars-rating-label')]")),
extract_text(eval_xpath(result, ".//div[contains(@class, 'listview__offercount')]")),
]
item = {
'template': 'products.html',
'url': (
base_url + "/" + extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__name-link')]/@href"))
),
'title': extract_text(eval_xpath(result, ".//h3[contains(@class, 'listview__name')]")),
'content': ' | '.join(content),
'thumbnail': extract_text(eval_xpath(result, ".//img[contains(@class, 'listview__image')]/@src")),
'metadata': ', '.join(item for item in metadata if item),
}
best_price = extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__price-link')]")).split(" ")
if len(best_price) > 1:
item["price"] = f"Bestes Angebot: {best_price[1]}"
results.append(item)
return results

102
searx/engines/genius.py Normal file

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Genius
"""
from urllib.parse import urlencode
from datetime import datetime
# about
about = {
"website": 'https://genius.com/',
"wikidata_id": 'Q3419343',
"official_api_documentation": 'https://docs.genius.com/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['music', 'lyrics']
paging = True
page_size = 5
url = 'https://genius.com/api/'
search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'
music_player = 'https://genius.com{api_path}/apple_music_player'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
index='multi',
page_size=page_size,
pageno=params['pageno'],
)
return params
def parse_lyric(hit):
content = ''
highlights = hit['highlights']
if highlights:
content = hit['highlights'][0]['value']
else:
content = hit['result'].get('title_with_featured', '')
timestamp = hit['result']['lyrics_updated_at']
result = {
'url': hit['result']['url'],
'title': hit['result']['full_title'],
'content': content,
'thumbnail': hit['result']['song_art_image_thumbnail_url'],
}
if timestamp:
result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
api_path = hit['result'].get('api_path')
if api_path:
# The players only play 30 seconds of the title. Some of the players
# will be blocked because of a cross-origin request, and some players will
# link to Apple when you press the play button.
result['iframe_src'] = music_player.format(api_path=api_path)
return result
def parse_artist(hit):
result = {
'url': hit['result']['url'],
'title': hit['result']['name'],
'content': '',
'thumbnail': hit['result']['image_url'],
}
return result
def parse_album(hit):
res = hit['result']
content = res.get('name_with_artist', res.get('name', ''))
x = res.get('release_date_components')
if x:
x = x.get('year')
if x:
content = "%s / %s" % (x, content)
return {
'url': res['url'],
'title': res['full_title'],
'thumbnail': res['cover_art_url'],
'content': content.strip(),
}
parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
def response(resp):
results = []
for section in resp.json()['response']['sections']:
for hit in section['hits']:
func = parse.get(hit['type'])
if func:
results.append(func(hit))
return results

116
searx/engines/gitea.py Normal file

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine to search in collaborative software platforms based on Gitea_ or Forgejo_.
.. _Gitea: https://about.gitea.com/
.. _Forgejo: https://forgejo.org/
Configuration
=============
The engine has the following mandatory setting:
- :py:obj:`base_url`
Optional settings are:
- :py:obj:`sort`
- :py:obj:`order`
- :py:obj:`page_size`
.. code:: yaml
- name: gitea.com
engine: gitea
base_url: https://gitea.com
shortcut: gitea
- name: forgejo.com
engine: gitea
base_url: https://code.forgejo.org
shortcut: forgejo
If you would like to use additional instances, just configure new engines in the
:ref:`settings <settings engines>` and set the ``base_url``.
Implementation
==============
"""
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": 'https://about.gitea.com',
"wikidata_id": None,
"official_api_documentation": 'https://docs.gitea.com/next/development/api-usage',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['it', 'repos']
paging = True
base_url: str = ''
"""URL of the Gitea_ instance."""
sort: str = "updated"
"""Sort criteria, possible values:
- ``updated`` (default)
- ``alpha``
- ``created``
- ``size``
- ``id``
"""
order = "desc"
"""Sort order, possible values:
- ``desc`` (default)
- ``asc``
"""
page_size: int = 10
"""Maximum number of results per page (default 10)."""
def init(_):
if not base_url:
raise ValueError('gitea engine: base_url is unset')
def request(query, params):
args = {'q': query, 'limit': page_size, 'sort': sort, 'order': order, 'page': params['pageno']}
params['url'] = f"{base_url}/api/v1/repos/search?{urlencode(args)}"
return params
def response(resp):
results = []
for item in resp.json().get('data', []):
content = [item.get(i) for i in ['language', 'description'] if item.get(i)]
results.append(
{
'template': 'packages.html',
'url': item.get('html_url'),
'title': item.get('full_name'),
'content': ' / '.join(content),
# Use Repository Avatar and fall back to Owner Avatar if not set.
'thumbnail': item.get('avatar_url') or item.get('owner', {}).get('avatar_url'),
'package_name': item.get('name'),
'maintainer': item.get('owner', {}).get('username'),
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
'tags': item.get('topics', []),
'popularity': item.get('stars_count'),
'homepage': item.get('website'),
'source_code_url': item.get('clone_url'),
}
)
return results

67
searx/engines/github.py Normal file

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Github (IT)
"""
from urllib.parse import urlencode
from dateutil import parser
# about
about = {
"website": 'https://github.com/',
"wikidata_id": 'Q364',
"official_api_documentation": 'https://developer.github.com/v3/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['it', 'repos']
# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'
accept_header = 'application/vnd.github.preview.text-match+json'
def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}))
params['headers']['Accept'] = accept_header
return params
def response(resp):
results = []
for item in resp.json().get('items', []):
content = [item.get(i) for i in ['language', 'description'] if item.get(i)]
# license can be None
lic = item.get('license') or {}
lic_url = None
if lic.get('spdx_id'):
lic_url = f"https://spdx.org/licenses/{lic.get('spdx_id')}.html"
results.append(
{
'template': 'packages.html',
'url': item.get('html_url'),
'title': item.get('full_name'),
'content': ' / '.join(content),
'thumbnail': item.get('owner', {}).get('avatar_url'),
'package_name': item.get('name'),
# 'version': item.get('updated_at'),
'maintainer': item.get('owner', {}).get('login'),
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
'tags': item.get('topics', []),
'popularity': item.get('stargazers_count'),
'license_name': lic.get('name'),
'license_url': lic_url,
'homepage': item.get('homepage'),
'source_code_url': item.get('clone_url'),
}
)
return results

95
searx/engines/gitlab.py Normal file

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine to search in collaborative software platforms based on GitLab_ with
the `GitLab REST API`_.
.. _GitLab: https://about.gitlab.com/install/
.. _GitLab REST API: https://docs.gitlab.com/ee/api/
Configuration
=============
The engine has the following mandatory setting:
- :py:obj:`base_url`
Optional settings are:
- :py:obj:`api_path`
.. code:: yaml
- name: gitlab
engine: gitlab
base_url: https://gitlab.com
shortcut: gl
about:
website: https://gitlab.com/
wikidata_id: Q16639197
- name: gnome
engine: gitlab
base_url: https://gitlab.gnome.org
shortcut: gn
about:
website: https://gitlab.gnome.org
wikidata_id: Q44316
Implementations
===============
"""
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": "https://docs.gitlab.com/ee/api/",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['it', 'repos']
paging = True
base_url: str = ""
"""Base URL of the GitLab host."""
api_path: str = 'api/v4/projects'
"""The path the `project API <https://docs.gitlab.com/ee/api/projects.html>`_.
The default path should work fine usually.
"""
def request(query, params):
args = {'search': query, 'page': params['pageno']}
params['url'] = f"{base_url}/{api_path}?{urlencode(args)}"
return params
def response(resp):
results = []
for item in resp.json():
results.append(
{
'template': 'packages.html',
'url': item.get('web_url'),
'title': item.get('name'),
'content': item.get('description'),
'thumbnail': item.get('avatar_url'),
'package_name': item.get('name'),
'maintainer': item.get('namespace', {}).get('name'),
'publishedDate': parser.parse(item.get('last_activity_at') or item.get("created_at")),
'tags': item.get('tag_list', []),
'popularity': item.get('star_count'),
'homepage': item.get('readme_url'),
'source_code_url': item.get('http_url_to_repo'),
}
)
return results
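
A comparable standalone sketch against the public gitlab.com host (assuming the `requests` package; the field names follow the project API used above, the search term is an arbitrary example):

import requests
from urllib.parse import urlencode

base_url = "https://gitlab.com"            # as in the first yaml example above
api_path = "api/v4/projects"
args = {'search': 'inkscape', 'page': 1}
resp = requests.get(f"{base_url}/{api_path}?{urlencode(args)}", timeout=10)
for item in resp.json()[:3]:
    print(item.get('name'), item.get('star_count'), item.get('web_url'))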

57
searx/engines/goodreads.py Normal file
View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Goodreads (books)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
about = {
'website': 'https://www.goodreads.com',
'wikidata_id': 'Q2359213',
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
}
categories = []
paging = True
base_url = "https://www.goodreads.com"
results_xpath = "//table//tr"
thumbnail_xpath = ".//img[contains(@class, 'bookCover')]/@src"
url_xpath = ".//a[contains(@class, 'bookTitle')]/@href"
title_xpath = ".//a[contains(@class, 'bookTitle')]"
author_xpath = ".//a[contains(@class, 'authorName')]"
info_text_xpath = ".//span[contains(@class, 'uitext')]"
def request(query, params):
args = {
'q': query,
'page': params['pageno'],
}
params['url'] = f"{base_url}/search?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
results.append(
{
'url': base_url + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'thumbnail': extract_text(eval_xpath(result, thumbnail_xpath)),
'content': extract_text(eval_xpath(result, info_text_xpath)),
'metadata': extract_text(eval_xpath(result, author_xpath)),
}
)
return results

534
searx/engines/google.py Normal file
View File

@@ -0,0 +1,534 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google WEB engine. Some of this
implementations (manly the :py:obj:`get_google_info`) are shared by other
engines:
- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import re
import random
import string
import time
from urllib.parse import urlencode
from lxml import html
import babel
import babel.core
import babel.languages
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_official_locales
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
from searx.result_types import EngineResults
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q9366',
"official_api_documentation": 'https://developers.google.com/custom-search/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
"""`Google max 50 pages`_
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
_arcid_range = string.ascii_letters + string.digits + "_-"
_arcid_random: tuple[str, int] | None = None
def ui_async(start: int) -> str:
"""Format of the response from UI's async request.
- ``arc_id:<...>,use_ac:true,_fmt:prog``
The arc_id is randomly generated every hour.
"""
global _arcid_random # pylint: disable=global-statement
use_ac = "use_ac:true"
# _fmt:html returns an HTTP 500 when users search for celebrities like
# '!google natasha allegri' or '!google chris evans'
_fmt = "_fmt:prog"
# create a new random arc_id every hour
if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
_arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"
return ",".join([arc_id, use_ac, _fmt])
def get_google_info(params, eng_traits):
"""Composing various (language) properties for the google engines (:ref:`google
API`).
This function is called by the various google engines (:ref:`google web
engine`, :ref:`google images engine`, :ref:`google news engine` and
:ref:`google videos engine`).
:param dict param: Request parameters of the engine. At least
a ``searxng_locale`` key should be in the dictionary.
:param eng_traits: Engine's traits fetched from google preferences
(:py:obj:`searx.enginelib.traits.EngineTraits`)
:rtype: dict
:returns:
Py-Dictionary with the key/value pairs:
language:
The language code that is used by google (e.g. ``lang_en`` or
``lang_zh-TW``)
country:
The country code that is used by google (e.g. ``US`` or ``TW``)
locale:
An instance of :py:obj:`babel.core.Locale` built from the
``searxng_locale`` value.
subdomain:
Google subdomain :py:obj:`google_domains` that fits to the country
code.
params:
Py-Dictionary with additional request arguments (can be passed to
:py:func:`urllib.parse.urlencode`).
- ``hl`` parameter: specifies the interface language of user interface.
- ``lr`` parameter: restricts search results to documents written in
a particular language.
- ``cr`` parameter: restricts search results to documents
originating in a particular country.
- ``ie`` parameter: sets the character encoding scheme that should
be used to interpret the query string ('utf8').
- ``oe`` parameter: sets the character encoding scheme that should
be used to decode the XML result ('utf8').
headers:
Py-Dictionary with additional HTTP headers (can be passed to
request's headers)
- ``Accept: '*/*'``
"""
ret_val = {
'language': None,
'country': None,
'subdomain': None,
'params': {},
'headers': {},
'cookies': {},
'locale': None,
}
sxng_locale = params.get('searxng_locale', 'all')
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.core.UnknownLocaleError:
locale = None
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
# Test zh_hans & zh_hant --> in the topmost links in the result list of list
# TW and HK you should a find wiktionary.org zh_hant link. In the result
# list of zh-CN should not be no hant link instead you should find
# zh.m.wikipedia.org/zh somewhere in the top.
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
ret_val['language'] = eng_lang
ret_val['country'] = country
ret_val['locale'] = locale
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
# hl parameter:
# The hl parameter specifies the interface language (host language) of
# your user interface. To improve the performance and the quality of your
# search results, you are strongly encouraged to set this parameter
# explicitly.
# https://developers.google.com/custom-search/docs/xml_results#hlsp
# The Interface Language:
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
# https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
ret_val['params']['hl'] = f'{lang_code}-{country}'
# lr parameter:
# The lr (language restrict) parameter restricts search results to
# documents written in a particular language.
# https://developers.google.com/custom-search/docs/xml_results#lrsp
# Language Collection Values:
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
#
# To select 'all' languages an empty 'lr' value is used.
#
# Unlike other Google services, Google Scholar supports selecting more
# than one language. The languages are separated by a pipe '|' (logical OR).
# For example: &lr=lang_zh-TW%7Clang_de selects articles written in
# traditional Chinese OR German.
ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
# cr parameter:
# The cr parameter restricts search results to documents originating in a
# particular country.
# https://developers.google.com/custom-search/docs/xml_results#crsp
# specify a region (country) only if a region is given in the selected
# locale --> https://github.com/searxng/searxng/issues/2672
ret_val['params']['cr'] = ''
if len(sxng_locale.split('-')) > 1:
ret_val['params']['cr'] = 'country' + country
# gl parameter: (mandatory by Google News)
# The gl parameter value is a two-letter country code. For WebSearch
# results, the gl parameter boosts search results whose country of origin
# matches the parameter value. See the Country Codes section for a list of
# valid values.
# Specifying a gl parameter value in WebSearch requests should improve the
# relevance of results. This is particularly true for international
# customers and, even more specifically, for customers in English-speaking
# countries other than the United States.
# https://developers.google.com/custom-search/docs/xml_results#glsp
# https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
# ret_val['params']['gl'] = country
# ie parameter:
# The ie parameter sets the character encoding scheme that should be used
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
# num parameter:
# The num parameter identifies the number of search results to return.
# The default num value is 10, and the maximum value is 20. If you request
# more than 20 results, only 20 results will be returned.
# https://developers.google.com/custom-search/docs/xml_results#numsp
# HINT: seems to have no effect (tested in google WEB & Images)
# ret_val['params']['num'] = 20
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
return ret_val
def detect_google_sorry(resp):
if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google search request"""
# pylint: disable=line-too-long
start = (params['pageno'] - 1) * 10
str_async = ui_async(start)
google_info = get_google_info(params, traits)
logger.debug("ARC_ID: %s", str_async)
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
**google_info['params'],
'filter': '0',
'start': start,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
# 'sa': 'N',
# 'yv': 3,
# 'prmd': 'vin',
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formally known as use_mobile_ui
'asearch': 'arc',
'async': str_async,
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$')
def parse_data_images(text: str):
data_image_map = {}
for img_id, data_image in RE_DATA_IMAGE.findall(text):
end_pos = data_image.rfind('=')
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
last = RE_DATA_IMAGE_end.search(text)
if last:
data_image_map[last.group(1)] = last.group(2)
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map
def response(resp) -> EngineResults:
"""Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp)
data_image_map = parse_data_images(resp.text)
results = EngineResults()
# convert the text to dom
dom = html.fromstring(resp.text)
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
for item in answer_list:
for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
bubble.drop_tree()
results.add(
results.types.Answer(
answer=extract_text(item),
url=(eval_xpath(item, '../..//a/@href') + [None])[0],
)
)
# parse results
for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
# pylint: disable=too-many-nested-blocks
try:
title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
if title_tag is None:
# this is not one of the common google results *sections*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
for item in content_nodes:
for script in item.xpath(".//script"):
script.getparent().remove(script)
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
thumbnail = content_nodes[0].xpath('.//img/@src')
if thumbnail:
thumbnail = thumbnail[0]
if thumbnail.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
thumbnail = data_image_map.get(img_id[0])
else:
thumbnail = None
results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True)
continue
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
# get supported languages from their site
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albanien (sq)
'AZ', # Aserbaidschan (az)
'BD', # Bangladesch (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Äthiopien (am)
'GE', # Georgien (ka, os)
'GL', # Grönland (kl)
'KH', # Kambodscha (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # Nordmazedonien (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolei (mn)
'MV', # Malediven (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tadschikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Usbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
resp = get('https://www.google.com/preferences')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
# supported language codes
lang_map = {'no': 'nb'}
for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
eng_lang = x.get("value")
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
# supported region codes
for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
# supported domains
if add_domains:
resp = get('https://www.google.com/supported_domains')
if not resp.ok: # type: ignore
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
for domain in resp.text.split(): # type: ignore
domain = domain.strip()
if not domain or domain in [
'.google.com',
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
if region == 'HK':
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore
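
To make the thumbnail recovery above easier to follow, here is a self-contained run of the same `RE_DATA_IMAGE` pattern on an invented payload fragment (the fragment only mimics the shape sketched in the comment before the regex):

import re

RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')

sample = '=26;[3,"dimg_ZNMiZ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRg==26;more'
for img_id, data_image in RE_DATA_IMAGE.findall(sample):
    end_pos = data_image.rfind('=')      # strip the trailing length marker, as parse_data_images() does
    if end_pos > 0:
        data_image = data_image[: end_pos + 1]
    print(img_id, '-->', data_image)
# dimg_ZNMiZ_137 --> data:image/jpeg;base64,/9j/4AAQSkZJRg==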

132
searx/engines/google_images.py Normal file
View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Images engine using the internal
Google API used by the Google Go Android app.
This internal API offers results in
- JSON (``_fmt:json``)
- Protobuf_ (``_fmt:pb``)
- Protobuf_ compressed? (``_fmt:pc``)
- HTML (``_fmt:html``)
- Protobuf_ encoded in JSON (``_fmt:jspb``).
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from json import loads
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
detect_google_sorry,
)
if TYPE_CHECKING:
import logging
from searx.enginelib.traits import EngineTraits
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://images.google.com',
"wikidata_id": 'Q521550',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['images', 'web']
paging = True
max_page = 50
"""`Google max 50 pages`_
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
time_range_support = True
safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def request(query, params):
"""Google-Image search request"""
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ '?'
+ urlencode({'q': query, 'tbm': "isch", **google_info['params'], 'asearch': 'isch'})
# don't urlencode this because wildly different AND bad results
# pagination uses Zero-based numbering
+ f'&async=_fmt:json,p:1,ijn:{params["pageno"] - 1}'
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
# this ua will allow getting ~50 results instead of 10. #1641
params['headers']['User-Agent'] = (
'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12;' f' {google_info.get("country", "US")}) gzip'
)
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
json_start = resp.text.find('{"ischj":')
json_data = loads(resp.text[json_start:])
for item in json_data["ischj"].get("metadata", []):
result_item = {
'url': item["result"]["referrer_url"],
'title': item["result"]["page_title"],
'content': item["text_in_grid"]["snippet"],
'source': item["result"]["site_title"],
'resolution': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
'img_src': item["original_image"]["url"],
'thumbnail_src': item["thumbnail"]["url"],
'template': 'images.html',
}
author = item["result"].get('iptc', {}).get('creator')
if author:
result_item['author'] = ', '.join(author)
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice:
result_item['source'] += ' | ' + copyright_notice
freshness_date = item["result"].get("freshness_date")
if freshness_date:
result_item['source'] += ' | ' + freshness_date
file_size = item.get('gsa', {}).get('file_size')
if file_size:
result_item['source'] += ' (%s)' % file_size
results.append(result_item)
return results
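
The async response is mostly HTML with one embedded JSON document; the slicing done in `response()` can be reproduced on a toy payload (the payload is invented, only the '{"ischj":' marker and the field names match the engine):

from json import loads

text = 'garbage prefix ... {"ischj": {"metadata": [{"result": {"page_title": "demo", "referrer_url": "https://example.org"}}]}}'
json_data = loads(text[text.find('{"ischj":'):])
for item in json_data["ischj"]["metadata"]:
    print(item["result"]["page_title"], item["result"]["referrer_url"])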

304
searx/engines/google_news.py Normal file
View File

@@ -0,0 +1,304 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google News engine.
Google News has a different region handling compared to Google WEB.
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and different to Google WEB)
- the gl_ argument is mandatory
If one of these arguments is not set correctly, the request is redirected to the
CONSENT dialog::
https://consent.google.com/m?continue=
The google news API ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored / there is no paging; all
results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
import base64
from lxml import html
import babel
from searx import locales
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://news.google.com',
"wikidata_id": 'Q12020',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['news']
paging = False
time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise checker will report safesearch-errors::
#
# safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True
# send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
sxng_locale = params.get('searxng_locale', 'en-US')
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
google_info = get_google_info(params, traits)
google_info['subdomain'] = 'news.google.com' # google news has only one domain
ceid_region, ceid_lang = ceid.split(':')
ceid_lang, ceid_suffix = (
ceid_lang.split('-')
+ [
None,
]
)[:2]
google_info['params']['hl'] = ceid_lang
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
if ceid_region.lower() == ceid_lang:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
elif ceid_region.lower() != ceid_lang:
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
google_info['params']['hl'] = ceid_lang
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
google_info['params']['gl'] = ceid_region
query_url = (
'https://'
+ google_info['subdomain']
+ "/search?"
+ urlencode(
{
'q': query,
**google_info['params'],
}
)
# ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid)
)
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the article
# The href attribute of the <a> tag is a google internal link, we have
# to decode
href = eval_xpath_getindex(result, './article/a/@href', 0)
href = href.split('?')[0]
href = href.split('/')[-1]
href = base64.urlsafe_b64decode(href + '====')
href = href[href.index(b'http') :].split(b'\xd2')[0]
href = href.decode()
title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article//time'))
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
# The image URL is located in a preceding sibling <img> tag, e.g.:
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
# These URLs are long but not personalized (double checked via tor).
thumbnail = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
results.append(
{
'url': href,
'title': title,
'content': content,
'thumbnail': thumbnail,
}
)
# return results
return results
ceid_list = [
'AE:ar',
'AR:es-419',
'AT:de',
'AU:en',
'BD:bn',
'BE:fr',
'BE:nl',
'BG:bg',
'BR:pt-419',
'BW:en',
'CA:en',
'CA:fr',
'CH:de',
'CH:fr',
'CL:es-419',
'CN:zh-Hans',
'CO:es-419',
'CU:es-419',
'CZ:cs',
'DE:de',
'EG:ar',
'ES:es',
'ET:en',
'FR:fr',
'GB:en',
'GH:en',
'GR:el',
'HK:zh-Hant',
'HU:hu',
'ID:en',
'ID:id',
'IE:en',
'IL:en',
'IL:he',
'IN:bn',
'IN:en',
'IN:hi',
'IN:ml',
'IN:mr',
'IN:ta',
'IN:te',
'IT:it',
'JP:ja',
'KE:en',
'KR:ko',
'LB:ar',
'LT:lt',
'LV:en',
'LV:lv',
'MA:fr',
'MX:es-419',
'MY:en',
'NA:en',
'NG:en',
'NL:nl',
'NO:no',
'NZ:en',
'PE:es-419',
'PH:en',
'PK:en',
'PL:pl',
'PT:pt-150',
'RO:ro',
'RS:sr',
'RU:ru',
'SA:ar',
'SE:sv',
'SG:en',
'SI:sl',
'SK:sk',
'SN:fr',
'TH:th',
'TR:tr',
'TW:zh-Hant',
'TZ:en',
'UA:ru',
'UA:uk',
'UG:en',
'US:en',
'US:es-419',
'VE:es-419',
'VN:vi',
'ZA:en',
'ZW:en',
]
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""
_skip_values = [
'ET:en', # english (ethiopia)
'ID:en', # english (indonesia)
'LV:en', # english (latvia)
]
_ceid_locale_map = {'NO:no': 'nb-NO'}
def fetch_traits(engine_traits: EngineTraits):
_fetch_traits(engine_traits, add_domains=False)
engine_traits.custom['ceid'] = {}
for ceid in ceid_list:
if ceid in _skip_values:
continue
region, lang = ceid.split(':')
x = lang.split('-')
if len(x) > 1:
if x[1] not in ['Hant', 'Hans']:
lang = x[0]
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
continue
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
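
The internal article links can be decoded outside the engine as well. The sketch below builds a synthetic token in roughly the shape Google News uses (the prefix and suffix bytes are invented; only the decoding recipe matches the `response()` code above):

import base64

raw = b'\x08\x13"\x22' + b'https://example.org/some-article' + b'\xd2\x01'
token = base64.urlsafe_b64encode(raw).decode().rstrip('=')
href = f'./articles/{token}?hl=en-US&gl=US&ceid=US:en'

# decoding recipe from response() above
href = href.split('?')[0]
href = href.split('/')[-1]
decoded = base64.urlsafe_b64decode(href + '====')
print(decoded[decoded.index(b'http'):].split(b'\xd2')[0].decode())
# --> https://example.org/some-article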

115
searx/engines/google_play.py Normal file
View File

@@ -0,0 +1,115 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google Play Apps & Google Play Movies
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
extract_url,
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
about = {
"website": "https://play.google.com/",
"wikidata_id": "Q79576",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
send_accept_language_header = True
play_categ = None # apps|movies
base_url = 'https://play.google.com'
search_url = base_url + "/store/search?{query}&c={play_categ}"
def request(query, params):
if play_categ not in ('movies', 'apps'):
raise ValueError(f"unknown google play category: {play_categ}")
params["url"] = search_url.format(
query=urlencode({"q": query}),
play_categ=play_categ,
)
params['cookies']['CONSENT'] = "YES+"
return params
def response(resp):
if play_categ == 'movies':
return response_movies(resp)
if play_categ == 'apps':
return response_apps(resp)
raise ValueError(f"Unsupported play category: {play_categ}")
def response_movies(resp):
results = []
dom = html.fromstring(resp.text)
for section in eval_xpath(dom, '//c-wiz/section/header/..'):
sec_name = extract_text(eval_xpath(section, './header'))
for item in eval_xpath(section, './/a'):
url = base_url + item.get('href')
div_1, div_2 = eval_xpath(item, './div')[:2]
title = extract_text(eval_xpath(div_2, './div[@title]'))
metadata = extract_text(eval_xpath(div_2, './div[@class]'))
img = eval_xpath(div_1, './/img')[0]
thumbnail = img.get('src')
results.append(
{
"url": url,
"title": title,
"content": sec_name,
"thumbnail": thumbnail,
'metadata': metadata,
'template': 'videos.html',
}
)
return results
def response_apps(resp):
results = []
dom = html.fromstring(resp.text)
if eval_xpath(dom, '//div[@class="v6DsQb"]'):
return []
spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
if spot is not None:
url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
results.append({"url": url, "title": title, "content": content, "img_src": img})
more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
for result in more:
url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
img = extract_text(
eval_xpath(
result,
'.//img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
)
)
results.append({"url": url, "title": title, "content": content, "img_src": img})
for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
return results

221
searx/engines/google_scholar.py Normal file
View File

@@ -0,0 +1,221 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Scholar engine.
Compared to other Google services the Scholar engine has a simple GET REST-API
and there does not exist an `async` API. Even though the API is slightly vintage, we
can make use of the :ref:`google API` to assemble the arguments of the GET
request.
"""
from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://scholar.google.com',
"wikidata_id": 'Q494817',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
"""`Google max 50 pages`_
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def time_range_args(params):
"""Returns a dictionary with a time range arguments based on
``params['time_range']``.
Google Scholar supports a detailed search by year. Searching by *last
month* or *last week* (as offered by SearXNG) is uncommon for scientific
publications and is not supported by Google Scholar.
To limit the result list when the users selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
is set, an empty dictionary of arguments is returned. Example: when a
user selects a time range (current year minus one in 2022):
.. code:: python
{ 'as_ylo' : 2021 }
"""
ret_val = {}
if params['time_range'] in time_range_dict:
ret_val['as_ylo'] = datetime.now().year - 1
return ret_val
def detect_google_captcha(dom):
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
not redirected to ``sorry.google.com``.
"""
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google-Scholar search request"""
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
args = {
'q': query,
**google_info['params'],
'start': (params['pageno'] - 1) * 10,
'as_sdt': '2007', # include patents / to disable set '0,5'
'as_vis': '0', # include citations / to disable set '1'
}
args.update(time_range_args(params))
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def parse_gs_a(text: Optional[str]):
"""Parse the text written in green.
Possible formats:
* "{authors} - {journal}, {year} - {publisher}"
* "{authors} - {year} - {publisher}"
* "{authors} - {publisher}"
"""
if text is None or text == "":
return None, None, None, None
s_text = text.split(' - ')
authors = s_text[0].split(', ')
publisher = s_text[-1]
if len(s_text) != 3:
return authors, None, publisher, None
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
# get journal and year
journal_year = s_text[1].split(', ')
# journal is optional and may contain commas
if len(journal_year) > 1:
journal = ', '.join(journal_year[0:-1])
if journal == '':
journal = None
else:
journal = None
# year
year = journal_year[-1]
try:
publishedDate = datetime.strptime(year.strip(), '%Y')
except ValueError:
publishedDate = None
return authors, journal, publisher, publishedDate
def response(resp): # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type:
pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append(
{
'template': 'paper.html',
'type': pub_type,
'url': url,
'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
}
)
# parse suggestion
for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
results.append({'correction': extract_text(correction)})
return results
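
A standalone re-run of the splitting logic used by `parse_gs_a` on the three documented formats (the function is re-declared here, slightly compressed, so the snippet runs without SearXNG; the sample strings are invented):

from datetime import datetime

def parse_green_line(text):
    parts = text.split(' - ')
    authors, publisher = parts[0].split(', '), parts[-1]
    if len(parts) != 3:
        return authors, None, publisher, None
    journal_year = parts[1].split(', ')
    journal = ', '.join(journal_year[:-1]) or None
    try:
        published = datetime.strptime(journal_year[-1].strip(), '%Y')
    except ValueError:
        published = None
    return authors, journal, publisher, published

print(parse_green_line("A Author, B Author - Nature, 2021 - nature.com"))
print(parse_green_line("C Author - 2019 - example.org"))
print(parse_green_line("D Author - arxiv.org"))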

153
searx/engines/google_videos.py Normal file
View File

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google Videos engine.
.. admonition:: Content-Security-Policy (CSP)
This engine needs to allow images from the `data URLs`_ (prefixed with the
``data:`` scheme)::
Header set Content-Security-Policy "img-src 'self' data: ;"
.. _data URLs:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_google_info,
time_range_dict,
filter_mapping,
suggestion_xpath,
detect_google_sorry,
ui_async,
parse_data_images,
)
from searx.enginelib.traits import EngineTraits
from searx.utils import get_embeded_stream_url
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q219885',
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
max_page = 50
"""`Google: max 50 pages`
.. _Google: max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
language_support = True
time_range_support = True
safesearch = True
def request(query, params):
"""Google-Video search request"""
google_info = get_google_info(params, traits)
start = (params['pageno'] - 1) * 10
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "vid",
'start': 10 * params['pageno'],
**google_info['params'],
'asearch': 'arc',
'async': ui_async(start),
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if 'safesearch' in params:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
data_image_map = parse_data_images(resp.text)
# convert the text to dom
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
thumbnail = eval_xpath_getindex(result, './/img/@src', 0, None)
if thumbnail:
if thumbnail.startswith('data:image'):
img_id = eval_xpath_getindex(result, './/img/@id', 0, None)
if img_id:
thumbnail = data_image_map.get(img_id)
else:
thumbnail = None
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
c_node = eval_xpath_getindex(result, './/div[contains(@class, "ITZIwc")]', 0)
content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[contains(@class, "gqF9jc")]'))
results.append(
{
'url': url,
'title': title,
'content': content,
'author': pub_info,
'thumbnail': thumbnail,
'iframe_src': get_embeded_stream_url(url),
'template': 'videos.html',
}
)
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
return results

94
searx/engines/hackernews.py Normal file
View File

@@ -0,0 +1,94 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Hackernews
"""
from datetime import datetime
from urllib.parse import urlencode
from dateutil.relativedelta import relativedelta
from flask_babel import gettext
# Engine metadata
about = {
"website": "https://news.ycombinator.com/",
"wikidata_id": "Q686797",
"official_api_documentation": "https://hn.algolia.com/api",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
# Engine configuration
paging = True
time_range_support = True
categories = ["it"]
results_per_page = 30
# Search URL
base_url = "https://hn.algolia.com/api/v1"
def request(query, params):
search_type = 'search'
if not query:
# if search query is empty show results from HN's front page
search_type = 'search_by_date'
query_params = {
"tags": "front_page",
"page": (params["pageno"] - 1),
}
else:
query_params = {
"query": query,
"page": (params["pageno"] - 1),
"hitsPerPage": results_per_page,
"minWordSizefor1Typo": 4,
"minWordSizefor2Typos": 8,
"advancedSyntax": "true",
"ignorePlurals": "false",
"minProximity": 7,
"numericFilters": '[]',
"tagFilters": '["story",[]]',
"typoTolerance": "true",
"queryType": "prefixLast",
"restrictSearchableAttributes": '["title","comment_text","url","story_text","author"]',
"getRankingInfo": "true",
}
if params['time_range']:
search_type = 'search_by_date'
timestamp = (
# pylint: disable=unexpected-keyword-arg
datetime.now()
- relativedelta(**{f"{params['time_range']}s": 1}) # type: ignore
).timestamp()
query_params["numericFilters"] = f"created_at_i>{timestamp}"
params["url"] = f"{base_url}/{search_type}?{urlencode(query_params)}"
return params
def response(resp):
results = []
data = resp.json()
for hit in data["hits"]:
object_id = hit["objectID"]
points = hit.get("points") or 0
num_comments = hit.get("num_comments") or 0
metadata = ""
if points != 0 or num_comments != 0:
metadata = f"{gettext('points')}: {points}" f" | {gettext('comments')}: {num_comments}"
results.append(
{
"title": hit.get("title") or f"{gettext('author')}: {hit['author']}",
"url": f"https://news.ycombinator.com/item?id={object_id}",
"content": hit.get("url") or hit.get("comment_text") or hit.get("story_text") or "",
"metadata": metadata,
"author": hit["author"],
"publishedDate": datetime.fromtimestamp(hit["created_at_i"]),
}
)
return results
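
A sketch of how a time range turns into an Algolia `numericFilters` argument (the query and range are arbitrary examples; the URL pattern is the one assembled in `request()` above):

from datetime import datetime
from urllib.parse import urlencode
from dateutil.relativedelta import relativedelta

time_range = 'week'
timestamp = (datetime.now() - relativedelta(**{f"{time_range}s": 1})).timestamp()
query_params = {"query": "rust", "page": 0, "numericFilters": f"created_at_i>{timestamp}"}
print(f"https://hn.algolia.com/api/v1/search_by_date?{urlencode(query_params)}")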

81
searx/engines/hex.py Normal file
View File

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""hex.pm"""
from urllib.parse import urlencode
from dateutil import parser
about = {
# pylint: disable=line-too-long
"website": "https://hex.pm/",
"wikidata_id": None,
"official_api_documentation": "https://github.com/hexpm/hexpm/blob/main/lib/hexpm_web/controllers/api/package_controller.ex",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ["it", "packages"]
# engine dependent config
paging = True
search_url = "https://hex.pm/api/packages/"
# Valid values: name inserted_at updated_at total_downloads recent_downloads
sort_criteria = "recent_downloads"
page_size = 10
linked_terms = {
# lower-case : replacement
"author": "Author",
"bitbucket": "Bitbucket",
"bug tracker": "Issue tracker",
"changelog": "Changelog",
"doc": "Documentation",
"docs": "Documentation",
"documentation": "Documentation",
"github repository": "GitHub",
"github": "GitHub",
"gitlab": "GitLab",
"issues": "Issue tracker",
"project source code": "Source code",
"repository": "Source code",
"scm": "Source code",
"sourcehut": "SourceHut",
"sources": "Source code",
"sponsor": "Sponsors",
"sponsors": "Sponsors",
"website": "Homepage",
}
def request(query: str, params):
args = urlencode({"page": params["pageno"], "per_page": page_size, "sort": sort_criteria, "search": query})
params["url"] = f"{search_url}?{args}"
return params
def response(resp):
results = []
for package in resp.json():
meta = package["meta"]
published_date = package.get("updated_at")
published_date = parser.parse(published_date)
links = {linked_terms.get(k.lower(), k): v for k, v in meta.get("links").items()}
results.append(
{
"template": "packages.html",
"url": package["html_url"],
"title": package["name"],
"package_name": package["name"],
"content": meta.get("description", ""),
"version": meta.get("latest_version"),
"maintainer": ", ".join(meta.get("maintainers", [])),
"publishedDate": published_date,
"license_name": ", ".join(meta.get("licenses", [])),
"homepage": package["docs_html_url"],
"links": links,
}
)
return results
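
A tiny illustration of the link-label normalization above (the sample `links` metadata is invented):

linked_terms = {"github": "GitHub", "docs": "Documentation"}  # subset of the mapping above
meta_links = {"GitHub": "https://github.com/elixir-ecto/ecto", "Docs": "https://hexdocs.pm/ecto"}
print({linked_terms.get(k.lower(), k): v for k, v in meta_links.items()})
# {'GitHub': 'https://github.com/elixir-ecto/ecto', 'Documentation': 'https://hexdocs.pm/ecto'}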

116
searx/engines/huggingface.py Normal file
View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Hugging Face`_ search engine for SearXNG.
.. _Hugging Face: https://huggingface.co
Configuration
=============
The engine has the following additional settings:
- :py:obj:`huggingface_endpoint`
Configurations for endpoints:
.. code:: yaml
- name: huggingface
engine: huggingface
shortcut: hf
- name: huggingface datasets
huggingface_endpoint: datasets
engine: huggingface
shortcut: hfd
- name: huggingface spaces
huggingface_endpoint: spaces
engine: huggingface
shortcut: hfs
Implementations
===============
"""
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
from searx.result_types import EngineResults, MainResult
about = {
"website": "https://huggingface.co/",
"wikidata_id": "Q108943604",
"official_api_documentation": "https://huggingface.co/docs/hub/en/api",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['it', 'repos']
base_url = "https://huggingface.co"
huggingface_endpoint = 'models'
"""Hugging Face supports datasets, models, spaces as search endpoint.
- ``datasets``: search for datasets
- ``models``: search for models
- ``spaces``: search for spaces
"""
def init(_):
if huggingface_endpoint not in ('datasets', 'models', 'spaces'):
raise SearxEngineAPIException(f"Unsupported Hugging Face endpoint: {huggingface_endpoint}")
def request(query, params):
query_params = {
"direction": -1,
"search": query,
}
params["url"] = f"{base_url}/api/{huggingface_endpoint}?{urlencode(query_params)}"
return params
def response(resp) -> EngineResults:
results = EngineResults()
data = resp.json()
for entry in data:
if huggingface_endpoint != 'models':
url = f"{base_url}/{huggingface_endpoint}/{entry['id']}"
else:
url = f"{base_url}/{entry['id']}"
published_date = None
try:
published_date = datetime.strptime(entry["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
except (ValueError, TypeError):
pass
contents = []
if entry.get("likes"):
contents.append(f"Likes: {entry['likes']}")
if entry.get("downloads"):
contents.append(f"Downloads: {entry['downloads']:,}")
if entry.get("tags"):
contents.append(f"Tags: {', '.join(entry['tags'])}")
if entry.get("description"):
contents.append(f"Description: {entry['description']}")
item = MainResult(
title=entry["id"],
content=html_to_text(" | ".join(contents)),
url=url,
publishedDate=published_date,
)
results.add(item)
return results
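
For the non-default endpoints both the API URL and the result URLs carry the endpoint name; a small sketch (the endpoint value and the query are arbitrary examples):

from urllib.parse import urlencode

base_url = "https://huggingface.co"
huggingface_endpoint = "datasets"       # one of: datasets, models, spaces
print(f"{base_url}/api/{huggingface_endpoint}?{urlencode({'direction': -1, 'search': 'common voice'})}")
# result links are then built as f"{base_url}/{huggingface_endpoint}/{entry['id']}"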

71
searx/engines/il_post.py Normal file
View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine for Il Post, a largely independent online Italian newspaper.
To use this engine add the following entry to your engines
list in ``settings.yml``:
.. code:: yaml
- name: il post
engine: il_post
shortcut: pst
disabled: false
"""
from urllib.parse import urlencode
from searx.result_types import EngineResults
engine_type = "online"
language_support = False
categories = ["news"]
paging = True
page_size = 10
time_range_support = True
time_range_args = {"month": "pub_date:ultimi_30_giorni", "year": "pub_date:ultimo_anno"}
search_api = "https://api.ilpost.org/search/api/site_search/?"
about = {
"website": "https://www.ilpost.it",
"wikidata_id": "Q3792882",
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
"language": "it",
}
def request(query, params):
query_params = {
"qs": query,
"pg": params["pageno"],
"sort": "date_d",
"filters": "ctype:articoli",
}
if params["time_range"]:
if params["time_range"] not in time_range_args:
return None
query_params["filters"] += f";{time_range_args.get(params['time_range'], 'pub_date:da_sempre')}"
params["url"] = search_api + urlencode(query_params)
return params
def response(resp) -> EngineResults:
res = EngineResults()
json_data = resp.json()
for result in json_data["docs"]:
res.add(
res.types.MainResult(
url=result["link"],
title=result["title"],
content=result.get("summary", ""),
thumbnail=result.get("image"),
)
)
return res

97
searx/engines/imdb.py Normal file
View File

@@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""IMDB - Internet Movie Database
Retrieves results from a basic search. Advanced search options are not
supported. IMDB's API is undocumented; here are some posts about it:
- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api
- https://rapidapi.com/blog/how-to-use-imdb-api/
An alternative that needs IMDPro_ is `IMDb and Box Office Mojo
<https://developer.imdb.com/documentation>`_
.. _IMDPro: https://pro.imdb.com/login
"""
import json
about = {
"website": 'https://imdb.com/',
"wikidata_id": 'Q37312',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ["movies"]
paging = False
# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json"
suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json"
href_base = 'https://imdb.com/{category}/{entry_id}'
search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"}
def request(query, params):
query = query.replace(" ", "_").lower()
params['url'] = suggestion_url.format(letter=query[0], query=query)
return params
def response(resp):
suggestions = json.loads(resp.text)
results = []
for entry in suggestions.get('d', []):
# https://developer.imdb.com/documentation/key-concepts#imdb-ids
entry_id = entry['id']
categ = search_categories.get(entry_id[:2])
if categ is None:
logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id)
continue
title = entry['l']
if 'q' in entry:
title += " (%s)" % entry['q']
content = ''
if 'rank' in entry:
content += "(%s) " % entry['rank']
if 'y' in entry:
content += str(entry['y']) + " - "
if 's' in entry:
content += entry['s']
# imageUrl is the image itself, it is not a thumb!
image_url = entry.get('i', {}).get('imageUrl')
if image_url:
# get thumbnail
image_url_name, image_url_prefix = image_url.rsplit('.', 1)
# recipe to get the magic value:
# * search on imdb.com, look at the URL of the thumbnail on the right side of the screen
# * search using the imdb engine, compare the imageUrl and thumbnail URL
# QL75 : JPEG quality (?)
# UX280 : resize to width 280
# 280,414 : size of the image (add white border)
magic = 'QL75_UX280_CR0,0,280,414_'
if not image_url_name.endswith('_V1_'):
magic = '_V1_' + magic
image_url = image_url_name + magic + '.' + image_url_prefix
results.append(
{
"title": title,
"url": href_base.format(category=categ, entry_id=entry_id),
"content": content,
"thumbnail": image_url,
}
)
return results
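
A standalone illustration of the thumbnail rewrite described in the comments above (the `imageUrl` is a made-up example that follows the usual Amazon media URL pattern):

image_url = "https://m.media-amazon.com/images/M/MV5BexampleXkEyXkFqcGc@._V1_.jpg"
image_url_name, image_url_prefix = image_url.rsplit('.', 1)
magic = 'QL75_UX280_CR0,0,280,414_'
if not image_url_name.endswith('_V1_'):
    magic = '_V1_' + magic
print(image_url_name + magic + '.' + image_url_prefix)
# --> https://m.media-amazon.com/images/M/MV5BexampleXkEyXkFqcGc@._V1_QL75_UX280_CR0,0,280,414_.jpg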

65
searx/engines/imgur.py Normal file
View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Imgur (images)
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
about = {
"website": 'https://imgur.com/',
"wikidata_id": 'Q355022',
"official_api_documentation": 'https://api.imgur.com/',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['images']
paging = True
time_range_support = True
base_url = "https://imgur.com"
results_xpath = "//div[contains(@class, 'cards')]/div[contains(@class, 'post')]"
url_xpath = "./a/@href"
title_xpath = "./a/img/@alt"
thumbnail_xpath = "./a/img/@src"
def request(query, params):
time_range = params['time_range'] or 'all'
args = {
'q': query,
'qs': 'thumbs',
'p': params['pageno'] - 1,
}
params['url'] = f"{base_url}/search/score/{time_range}?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
thumbnail_src = extract_text(eval_xpath(result, thumbnail_xpath))
img_src = thumbnail_src.replace("b.", ".")
# that's a bug at imgur's side:
# sometimes there's just no preview image, hence we skip the image
if len(thumbnail_src) < 25:
continue
results.append(
{
'template': 'images.html',
'url': base_url + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'img_src': img_src,
'thumbnail_src': thumbnail_src,
}
)
return results

75
searx/engines/ina.py Normal file
View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
INA (Videos)
"""
from html import unescape
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
"website": 'https://www.ina.fr/',
"wikidata_id": 'Q1665109',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'fr',
}
# engine dependent config
categories = ['videos']
paging = True
page_size = 12
# search-url
base_url = 'https://www.ina.fr'
search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables
results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href'
title_xpath = './/div[contains(@class,"title-bloc-small")]'
content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
thumbnail_xpath = './/img/@data-src'
publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
# do search-request
def request(query, params):
params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
return params
# get response from search-request
def response(resp):
results = []
# we get html in a JSON container...
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, results_xpath):
url_relative = eval_xpath_getindex(result, url_xpath, 0)
url = base_url + url_relative
title = unescape(extract_text(eval_xpath(result, title_xpath)))
thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
eval_xpath(result, content_xpath)
)
# append result
results.append(
{
'url': url,
'title': title,
'content': content,
'template': 'videos.html',
'thumbnail': thumbnail,
}
)
# return results
return results

118
searx/engines/invidious.py Normal file
View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Invidious (Videos)
If you want to use invidious with SearXNG you should setup one locally.
No public instance offer a public API now
- https://github.com/searxng/searxng/issues/2722#issuecomment-2884993248
"""
from __future__ import annotations
import time
import random
from urllib.parse import quote_plus, urlparse
from dateutil import parser
from searx.utils import humanize_number
# about
about = {
"website": 'https://api.invidious.io/',
"wikidata_id": 'Q79343316',
"official_api_documentation": 'https://docs.invidious.io/api/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ["videos", "music"]
paging = True
time_range_support = True
# base_url can be overwritten by a list of URLs in the settings.yml
base_url: list | str = []
def init(_):
if not base_url:
raise ValueError("missing invidious base_url")
def request(query, params):
time_range_dict = {
"day": "today",
"week": "week",
"month": "month",
"year": "year",
}
if isinstance(base_url, list):
params["base_url"] = random.choice(base_url)
else:
params["base_url"] = base_url
search_url = params["base_url"] + "/api/v1/search?q={query}"
params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"])
if params["time_range"] in time_range_dict:
params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]])
if params["language"] != "all":
lang = params["language"].split("-")
if len(lang) == 2:
params["url"] += "&range={lrange}".format(lrange=lang[1])
return params
def response(resp):
results = []
search_results = resp.json()
base_invidious_url = resp.search_params['base_url'] + "/watch?v="
for result in search_results:
rtype = result.get("type", None)
if rtype == "video":
videoid = result.get("videoId", None)
if not videoid:
continue
url = base_invidious_url + videoid
thumbs = result.get("videoThumbnails", [])
thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None)
if thumb:
thumbnail = thumb.get("url", "")
else:
thumbnail = ""
# some instances return a partial thumbnail url
# we check if the url is partial, and prepend the base_url if it is
if thumbnail and not urlparse(thumbnail).netloc:
thumbnail = resp.search_params['base_url'] + thumbnail
publishedDate = parser.parse(time.ctime(result.get("published", 0)))
length = time.gmtime(result.get("lengthSeconds"))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
results.append(
{
"url": url,
"title": result.get("title", ""),
"content": result.get("description", ""),
"length": length,
"views": humanize_number(result['viewCount']),
"template": "videos.html",
"author": result.get("author"),
"publishedDate": publishedDate,
"iframe_src": resp.search_params['base_url'] + '/embed/' + videoid,
"thumbnail": thumbnail,
}
)
return results
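
The duration formatting in response() may be easier to read in isolation. This is a minimal sketch of the same rule; format_length is a hypothetical helper name and the sample durations are invented.

import time

def format_length(length_seconds: int) -> str:
    # include the hour field only for videos longer than an hour
    t = time.gmtime(length_seconds)
    return time.strftime("%H:%M:%S" if t.tm_hour else "%M:%S", t)

print(format_length(754))   # 12:34
print(format_length(4510))  # 01:15:10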

76
searx/engines/ipernity.py Normal file
View File

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Ipernity (images)"""
from datetime import datetime
from json import loads, JSONDecodeError
from urllib.parse import quote_plus
from lxml import html
from searx.utils import extr, extract_text, eval_xpath, eval_xpath_list
about = {
'website': 'https://www.ipernity.com',
'official_api_documentation': 'https://www.ipernity.com/help/api',
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
}
paging = True
categories = ['images']
base_url = 'https://www.ipernity.com'
page_size = 10
def request(query, params):
params['url'] = f"{base_url}/search/photo/@/page:{params['pageno']}:{page_size}?q={quote_plus(query)}"
return params
def response(resp):
results = []
doc = html.fromstring(resp.text)
images = eval_xpath_list(doc, '//a[starts-with(@href, "/doc")]//img')
result_index = 0
for result in eval_xpath_list(doc, '//script[@type="text/javascript"]'):
info_js = extr(extract_text(result), '] = ', '};')
if not info_js:
continue
info_js += '}'
try:
info_item = loads(info_js)
if not info_item.get('mediakey'):
continue
thumbnail_src = extract_text(eval_xpath(images[result_index], './@src'))
img_src = thumbnail_src.replace('240.jpg', '640.jpg')
resolution = None
if info_item.get("width") and info_item.get("height"):
resolution = f'{info_item["width"]}x{info_item["height"]}'
item = {
'template': 'images.html',
'url': f"{base_url}/doc/{info_item['user_id']}/{info_item['doc_id']}",
'title': info_item.get('title'),
'content': info_item.get('content', ''),
'resolution': resolution,
'publishedDate': datetime.fromtimestamp(int(info_item['posted_at'])),
'thumbnail_src': thumbnail_src,
'img_src': img_src,
}
results.append(item)
result_index += 1
except JSONDecodeError:
continue
return results
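
The script-scraping step above is easier to follow with a toy input. The snippet below emulates what the extr(..., '] = ', '};') call extracts from an Ipernity script tag before the trailing brace is restored and the object is JSON-decoded; the sample script text is invented.

import json

script_text = (
    'item["x1"] = {"doc_id": "456", "user_id": "789",'
    ' "mediakey": "abc", "posted_at": 1700000000, "width": 640, "height": 480};'
)

# take everything between '] = ' and the closing '};', then restore the brace
start = script_text.index('] = ') + len('] = ')
end = script_text.index('};', start)
info_item = json.loads(script_text[start:end] + '}')

print(info_item['doc_id'], info_item['mediakey'])  # 456 abc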

72
searx/engines/iqiyi.py Normal file
View File

@@ -0,0 +1,72 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""iQiyi: A search engine for retrieving videos from iQiyi."""
from urllib.parse import urlencode
from datetime import datetime
from searx.exceptions import SearxEngineAPIException
from searx.utils import parse_duration_string
about = {
"website": "https://www.iqiyi.com/",
"wikidata_id": "Q15913890",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
"language": "zh",
}
paging = True
time_range_support = True
categories = ["videos"]
time_range_dict = {'day': '1', 'week': '2', 'month': '3'}
base_url = "https://mesh.if.iqiyi.com"
def request(query, params):
query_params = {"key": query, "pageNum": params["pageno"], "pageSize": 25}
if time_range_dict.get(params['time_range']):
query_params["sitePublishDate"] = time_range_dict[params['time_range']]
params["url"] = f"{base_url}/portal/lw/search/homePageV3?{urlencode(query_params)}"
return params
def response(resp):
try:
data = resp.json()
except Exception as e:
raise SearxEngineAPIException(f"Invalid response: {e}") from e
results = []
if "data" not in data or "templates" not in data["data"]:
raise SearxEngineAPIException("Invalid response")
for entry in data["data"]["templates"]:
album_info = entry.get("albumInfo", {})
published_date = None
release_time = album_info.get("releaseTime", {}).get("value")
if release_time:
try:
published_date = datetime.strptime(release_time, "%Y-%m-%d")
except (ValueError, TypeError):
pass
length = parse_duration_string(album_info.get("subscriptionContent"))
results.append(
{
'url': album_info.get("pageUrl", "").replace("http://", "https://"),
'title': album_info.get("title", ""),
'content': album_info.get("brief", {}).get("value", ""),
'template': 'videos.html',
'length': length,
'publishedDate': published_date,
'thumbnail': album_info.get("img", ""),
}
)
return results

137
searx/engines/jisho.py Normal file
View File

@@ -0,0 +1,137 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Jisho (the Japanese-English dictionary)
"""
from urllib.parse import urlencode, urljoin
# about
about = {
"website": 'https://jisho.org',
"wikidata_id": 'Q24568389',
"official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"language": 'ja',
}
categories = ['dictionaries']
paging = False
URL = 'https://jisho.org'
BASE_URL = 'https://jisho.org/word/'
SEARCH_URL = URL + '/api/v1/search/words?{query}'
def request(query, params):
query = urlencode({'keyword': query})
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
return params
def response(resp):
results = []
first_result = True
search_results = resp.json()
for page in search_results.get('data', []):
# Entries that are purely from Wikipedia are excluded.
parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
continue
# Process alternative forms
alt_forms = []
for title_raw in page['japanese']:
if 'word' not in title_raw:
alt_forms.append(title_raw['reading'])
else:
title = title_raw['word']
if 'reading' in title_raw:
title += ' (' + title_raw['reading'] + ')'
alt_forms.append(title)
result_url = urljoin(BASE_URL, page['slug'])
definitions = get_definitions(page)
# For results, we'll return the URL, all alternative forms (as title),
# and all definitions (as description) truncated to 300 characters.
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
results.append(
{'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')}
)
# Like Wordnik, we'll return the first result in an infobox too.
if first_result:
first_result = False
results.append(get_infobox(alt_forms, result_url, definitions))
return results
def get_definitions(page):
# Process definitions
definitions = []
for defn_raw in page['senses']:
extra = []
# Extra data. Since they're not documented, this implementation is based solely on the author's assumptions.
if defn_raw.get('tags'):
if defn_raw.get('info'):
# "usually written as kana: <kana>"
extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ')
else:
# abbreviation, archaism, etc.
extra.append(', '.join(defn_raw['tags']) + '. ')
elif defn_raw.get('info'):
# inconsistent
extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
if defn_raw.get('restrictions'):
extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
definitions.append(
(
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
''.join(extra)[:-1],
)
)
return definitions
def get_infobox(alt_forms, result_url, definitions):
infobox_content = []
# title & alt_forms
infobox_title = alt_forms[0]
if len(alt_forms) > 1:
infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
# definitions
infobox_content.append(
'''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
<ul>
'''
)
for pos, engdef, extra in definitions:
if pos == 'Wikipedia definition':
infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
pos = f'<i>{pos}</i>: ' if pos else ''
extra = f' ({extra})' if extra else ''
infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
infobox_content.append('</ul>')
#
return {
'infobox': infobox_title,
'content': ''.join(infobox_content),
'urls': [
{
'title': 'Jisho.org',
'url': result_url,
}
],
}
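
The truncation idiom used for the result content above, content[:300] + (content[300:] and '...'), appends an ellipsis only when something was actually cut off. A small standalone sketch (truncate is a hypothetical helper name):

def truncate(text: str, limit: int = 300) -> str:
    # the slice beyond `limit` is an empty (falsy) string when nothing was cut,
    # so the ellipsis is only added for over-long content
    return text[:limit] + (text[limit:] and '...')

print(truncate('a short definition.'))  # unchanged
print(len(truncate('x' * 350)))         # 303 (300 characters plus '...')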

423
searx/engines/json_engine.py Normal file
View File

@@ -0,0 +1,423 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The JSON engine is a *generic* engine with which it is possible to configure
engines in the settings.
Configuration
=============
Request:
- :py:obj:`search_url`
- :py:obj:`lang_all`
- :py:obj:`soft_max_redirects`
- :py:obj:`method`
- :py:obj:`request_body`
- :py:obj:`cookies`
- :py:obj:`headers`
Paging:
- :py:obj:`paging`
- :py:obj:`page_size`
- :py:obj:`first_page_num`
Time Range:
- :py:obj:`time_range_support`
- :py:obj:`time_range_url`
- :py:obj:`time_range_map`
Safe-Search:
- :py:obj:`safe_search_support`
- :py:obj:`safe_search_map`
Response:
- :py:obj:`title_html_to_text`
- :py:obj:`content_html_to_text`
- :py:obj:`no_result_for_http_status`
JSON query:
- :py:obj:`results_query`
- :py:obj:`url_query`
- :py:obj:`url_prefix`
- :py:obj:`title_query`
- :py:obj:`content_query`
- :py:obj:`thumbnail_query`
- :py:obj:`thumbnail_prefix`
- :py:obj:`suggestion_query`
Example
=======
Here is a simple example of a JSON engine configured in the :ref:`settings
engines` section, further read :ref:`engines-dev`.
.. code:: yaml
- name : mdn
engine : json_engine
paging : True
search_url : https://developer.mozilla.org/api/v1/search?q={query}&page={pageno}
results_query : documents
url_query : mdn_url
url_prefix : https://developer.mozilla.org
title_query : title
content_query : summary
Implementations
===============
"""
from collections.abc import Iterable
from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text
from searx.network import raise_for_httperror
search_url = None
"""
Search URL of the engine. Example::
https://example.org/?search={query}&page={pageno}{time_range}{safe_search}
Replacements are:
``{query}``:
Search terms from user.
``{pageno}``:
Page number if engine supports paging :py:obj:`paging`
``{lang}``:
ISO 639-1 language code (en, de, fr ..)
``{time_range}``:
:py:obj:`URL parameter <time_range_url>` if engine :py:obj:`supports time
range <time_range_support>`. The value for the parameter is taken from
:py:obj:`time_range_map`.
``{safe_search}``:
Safe-search :py:obj:`URL parameter <safe_search_map>` if engine
:py:obj:`supports safe-search <safe_search_support>`. The ``{safe_search}``
replacement is taken from the :py:obj:`safe_search_map`. Filter results::
0: none, 1: moderate, 2:strict
If not supported, the URL parameter is an empty string.
"""
lang_all = 'en'
'''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is
selected.
'''
no_result_for_http_status = []
'''Return empty result for these HTTP status codes instead of throwing an error.
.. code:: yaml
no_result_for_http_status: []
'''
soft_max_redirects = 0
'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
method = 'GET'
'''Some engines might require POST requests for search.'''
request_body = ''
'''The body of the request. This can only be used if a different :py:obj:`method`
is set, e.g. ``POST``. For formatting see the documentation of :py:obj:`search_url`.
Note: Curly brackets which aren't encapsulating a replacement placeholder
must be escaped by doubling each ``{`` and ``}``.
.. code:: yaml
request_body: >-
{{
"search": "{query}",
"page": {pageno},
"extra": {{
"time_range": {time_range},
"rating": "{safe_search}"
}}
}}
'''
cookies = {}
'''Some engines might offer different results based on cookies.
Possible use-case: To set safesearch cookie.'''
headers = {}
'''Some engines might offer different results based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
paging = False
'''Engine supports paging [True or False].'''
page_size = 1
'''Number of results on each page. Only needed if the site expects an offset
rather than a page number.'''
first_page_num = 1
'''Number of the first page (usually 0 or 1).'''
results_query = ''
'''JSON query for the list of result items.
The query string is a slash `/` separated path of JSON key names.
Array entries can be addressed by their index, or the index can be omitted
entirely, in which case every array entry is matched; the extraction code
then uses the first match.
'''
url_query = None
'''JSON query of result's ``url``. For the query string documentation see :py:obj:`results_query`'''
url_prefix = ""
'''String to prepend to the result's ``url``.'''
title_query = None
'''JSON query of result's ``title``. For the query string documentation see :py:obj:`results_query`'''
content_query = None
'''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`'''
thumbnail_query = False
'''JSON query of result's ``thumbnail``. For the query string documentation see :py:obj:`results_query`'''
thumbnail_prefix = ''
'''String to prepend to the result's ``thumbnail``.'''
suggestion_query = ''
'''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`'''
title_html_to_text = False
'''Extract text from a HTML title string'''
content_html_to_text = False
'''Extract text from a HTML content string'''
time_range_support = False
'''Engine supports search time range.'''
time_range_url = '&hours={time_range_val}'
'''Time range URL parameter in the :py:obj:`search_url`. If no time range is
requested by the user, the URL parameter is an empty string. The
``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`.
.. code:: yaml
time_range_url : '&days={time_range_val}'
'''
time_range_map = {
'day': 24,
'week': 24 * 7,
'month': 24 * 30,
'year': 24 * 365,
}
'''Maps time range value from user to ``{time_range_val}`` in
:py:obj:`time_range_url`.
.. code:: yaml
time_range_map:
day: 1
week: 7
month: 30
year: 365
'''
safe_search_support = False
'''Engine supports safe-search.'''
safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'}
'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`.
.. code:: yaml
safesearch: true
safe_search_map:
0: '&filter=none'
1: '&filter=moderate'
2: '&filter=strict'
'''
def iterate(iterable):
if isinstance(iterable, dict):
items = iterable.items()
else:
items = enumerate(iterable)
for index, value in items:
yield str(index), value
def is_iterable(obj):
if isinstance(obj, str):
return False
return isinstance(obj, Iterable)
def parse(query): # pylint: disable=redefined-outer-name
q = [] # pylint: disable=invalid-name
for part in query.split('/'):
if part == '':
continue
q.append(part)
return q
def do_query(data, q): # pylint: disable=invalid-name
ret = []
if not q:
return ret
qkey = q[0]
for key, value in iterate(data):
if len(q) == 1:
if key == qkey:
ret.append(value)
elif is_iterable(value):
ret.extend(do_query(value, q))
else:
if not is_iterable(value):
continue
if key == qkey:
ret.extend(do_query(value, q[1:]))
else:
ret.extend(do_query(value, q))
return ret
def query(data, query_string):
q = parse(query_string)
return do_query(data, q)
def request(query, params): # pylint: disable=redefined-outer-name
'''Build request parameters (see :ref:`engine request`).'''
lang = lang_all
if params['language'] != 'all':
lang = params['language'][:2]
time_range = ''
if params.get('time_range'):
time_range_val = time_range_map.get(params.get('time_range'))
time_range = time_range_url.format(time_range_val=time_range_val)
safe_search = ''
if params['safesearch']:
safe_search = safe_search_map[params['safesearch']]
fp = { # pylint: disable=invalid-name
'query': urlencode({'q': query})[2:],
'lang': lang,
'pageno': (params['pageno'] - 1) * page_size + first_page_num,
'time_range': time_range,
'safe_search': safe_search,
}
params['cookies'].update(cookies)
params['headers'].update(headers)
params['url'] = search_url.format(**fp)
params['method'] = method
if request_body:
# don't url-encode the query if it's in the request body
fp['query'] = query
params['data'] = request_body.format(**fp)
params['soft_max_redirects'] = soft_max_redirects
params['raise_for_httperror'] = False
return params
def identity(arg):
return arg
def extract_response_info(result):
title_filter = html_to_text if title_html_to_text else identity
content_filter = html_to_text if content_html_to_text else identity
tmp_result = {}
try:
url = query(result, url_query)[0]
tmp_result['url'] = url_prefix + to_string(url)
title = query(result, title_query)[0]
tmp_result['title'] = title_filter(to_string(title))
except: # pylint: disable=bare-except
return None
try:
content = query(result, content_query)[0]
tmp_result['content'] = content_filter(to_string(content))
except: # pylint: disable=bare-except
tmp_result['content'] = ""
try:
if thumbnail_query:
thumbnail_query_result = query(result, thumbnail_query)[0]
tmp_result['thumbnail'] = thumbnail_prefix + to_string(thumbnail_query_result)
except: # pylint: disable=bare-except
pass
return tmp_result
def response(resp):
'''Scrape *results* from the response (see :ref:`result types`).'''
results = []
if no_result_for_http_status and resp.status_code in no_result_for_http_status:
return results
raise_for_httperror(resp)
if not resp.text:
return results
json = loads(resp.text)
is_onion = 'onions' in categories
if results_query:
rs = query(json, results_query) # pylint: disable=invalid-name
if not rs:
return results
rs = rs[0] # pylint: disable=invalid-name
else:
rs = json # pylint: disable=invalid-name
for result in rs:
tmp_result = extract_response_info(result)
if not tmp_result:
continue
if is_onion:
tmp_result['is_onion'] = True
results.append(tmp_result)
if not suggestion_query:
return results
for suggestion in query(json, suggestion_query):
results.append({'suggestion': suggestion})
return results
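
A toy walk-through of the slash-separated query syntax documented for results_query, assuming the module is importable as searx.engines.json_engine; the sample data is invented and loosely mirrors the MDN example above.

from searx.engines.json_engine import query

data = {
    "documents": [
        {"title": "Array.prototype.map()", "mdn_url": "/en-US/docs/map"},
        {"title": "Array.prototype.filter()", "mdn_url": "/en-US/docs/filter"},
    ]
}

print(query(data, 'documents'))          # [[{...}, {...}]] - the list itself, hence rs = rs[0] in response()
print(query(data, 'documents/title'))    # ['Array.prototype.map()', 'Array.prototype.filter()'] - index omitted
print(query(data, 'documents/0/title'))  # ['Array.prototype.map()'] - explicit array index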

61
searx/engines/kickass.py Normal file
View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Kickass Torrent (Videos, Music, Files)"""
import random
from operator import itemgetter
from urllib.parse import quote
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
int_or_zero,
)
about = {
"website": 'https://kickasstorrents.to',
"wikidata_id": 'Q17062285',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['files']
paging = True
# base_url can be overwritten by a list of URLs in the settings.yml
base_url = 'https://kickasstorrents.to'
def request(query, params):
params['base_url'] = random.choice(base_url) if isinstance(base_url, list) else base_url
params['url'] = params['base_url'] + f'/usearch/{quote(query)}/{params["pageno"]}/'
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
search_res = eval_xpath_list(dom, '//table[contains(@class, "data")]//tr[descendant::a]', None)
if search_res is None:
return []
for tag in search_res[1:]:
result = {'template': 'torrent.html'}
url = eval_xpath_getindex(tag, './/a[contains(@class, "cellMainLink")]/@href', 0, None)
result['url'] = resp.search_params['base_url'] + url
result['title'] = extract_text(eval_xpath(tag, './/a[contains(@class, "cellMainLink")]'))
result['content'] = extract_text(eval_xpath(tag, './/span[@class="font11px lightgrey block"]'))
result['seed'] = int_or_zero(extract_text(eval_xpath(tag, './/td[contains(@class, "green")]')))
result['leech'] = int_or_zero(extract_text(eval_xpath(tag, './/td[contains(@class, "red")]')))
result['filesize'] = extract_text(eval_xpath(tag, './/td[contains(@class, "nobr")]'))
results.append(result)
# results sorted by seeder count
return sorted(results, key=itemgetter('seed'), reverse=True)
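
The final sort orders torrents by seeder count; the same pattern in isolation, with invented sample results:

from operator import itemgetter

results = [
    {'title': 'a', 'seed': 3},
    {'title': 'b', 'seed': 12},
    {'title': 'c', 'seed': 0},
]
print([r['title'] for r in sorted(results, key=itemgetter('seed'), reverse=True)])  # ['b', 'a', 'c']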

196
searx/engines/lemmy.py Normal file
View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is
documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is
federated, results are from many different, independent lemmy instances, and not
only the official one.
.. _lemmy-js-client: https://join-lemmy.org/api/modules.html
.. _Interface Search: https://join-lemmy.org/api/interfaces/Search.html
Configuration
=============
The engine has the following additional settings:
- :py:obj:`base_url`
- :py:obj:`lemmy_type`
This implementation is used by different lemmy engines in the :ref:`settings.yml
<settings engines>`:
.. code:: yaml
- name: lemmy communities
lemmy_type: Communities
...
- name: lemmy users
lemmy_type: Users
...
- name: lemmy posts
lemmy_type: Posts
...
- name: lemmy comments
lemmy_type: Comments
...
Implementations
===============
"""
from datetime import datetime
from urllib.parse import urlencode
from flask_babel import gettext
from searx.utils import markdown_to_text
about = {
"website": 'https://lemmy.ml/',
"wikidata_id": 'Q84777032',
"official_api_documentation": "https://join-lemmy.org/api/",
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
paging = True
categories = ['social media']
base_url = "https://lemmy.ml/"
"""By default, https://lemmy.ml is used for providing the results. If you want
to use a different lemmy instance, you can specify ``base_url``.
"""
lemmy_type = "Communities"
"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``"""
def request(query, params):
args = {
'q': query,
'page': params['pageno'],
'type_': lemmy_type,
}
params['url'] = f"{base_url}api/v3/search?{urlencode(args)}"
return params
def _get_communities(json):
results = []
for result in json["communities"]:
counts = result['counts']
metadata = (
f"{gettext('subscribers')}: {counts.get('subscribers', 0)}"
f" | {gettext('posts')}: {counts.get('posts', 0)}"
f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}"
)
results.append(
{
'url': result['community']['actor_id'],
'title': result['community']['title'],
'content': markdown_to_text(result['community'].get('description', '')),
'thumbnail': result['community'].get('icon', result['community'].get('banner')),
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_users(json):
results = []
for result in json["users"]:
results.append(
{
'url': result['person']['actor_id'],
'title': result['person']['name'],
'content': markdown_to_text(result['person'].get('bio', '')),
}
)
return results
def _get_posts(json):
results = []
for result in json["posts"]:
user = result['creator'].get('display_name', result['creator']['name'])
thumbnail = None
if result['post'].get('thumbnail_url'):
thumbnail = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208'
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('comments')}: {result['counts']['comments']}"
f" | {gettext('community')}: {result['community']['title']}"
)
content = result['post'].get('body', '').strip()
if content:
content = markdown_to_text(content)
results.append(
{
'url': result['post']['ap_id'],
'title': result['post']['name'],
'content': content,
'thumbnail': thumbnail,
'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def _get_comments(json):
results = []
for result in json["comments"]:
user = result['creator'].get('display_name', result['creator']['name'])
content = result['comment'].get('content', '').strip()
if content:
content = markdown_to_text(content)
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
f" | {gettext('user')}: {user}"
f" | {gettext('community')}: {result['community']['title']}"
)
results.append(
{
'url': result['comment']['ap_id'],
'title': result['post']['name'],
'content': content,
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
)
return results
def response(resp):
json = resp.json()
if lemmy_type == "Communities":
return _get_communities(json)
if lemmy_type == "Users":
return _get_users(json)
if lemmy_type == "Posts":
return _get_posts(json)
if lemmy_type == "Comments":
return _get_comments(json)
raise ValueError(f"Unsupported lemmy type: {lemmy_type}")
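
For reference, this is roughly the search URL that request() builds for a "lemmy communities" style configuration; the query term is invented.

from urllib.parse import urlencode

base_url = "https://lemmy.ml/"
args = {'q': 'fediverse', 'page': 1, 'type_': 'Communities'}
print(f"{base_url}api/v3/search?{urlencode(args)}")
# -> https://lemmy.ml/api/v3/search?q=fediverse&page=1&type_=Communities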

55
searx/engines/lib_rs.py Normal file
View File

@@ -0,0 +1,55 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""lib.rs (packages)"""
from urllib.parse import quote_plus
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://lib.rs',
'wikidata_id': 'Q113486010',
'use_official_api': False,
'require_api_key': False,
'results': "HTML",
}
categories = ["it", "packages"]
base_url = 'https://lib.rs'
results_xpath = '/html/body/main/div/ol/li/a'
url_xpath = './@href'
title_xpath = './div[@class="h"]/h4'
content_xpath = './div[@class="h"]/p'
version_xpath = './div[@class="meta"]/span[contains(@class, "version")]'
download_count_xpath = './div[@class="meta"]/span[@class="downloads"]'
tags_xpath = './div[@class="meta"]/span[contains(@class, "k")]/text()'
def request(query, params):
params['url'] = f"{base_url}/search?q={quote_plus(query)}"
return params
def response(resp):
results = []
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, results_xpath):
package_name = extract_text(eval_xpath(result, title_xpath))
results.append(
{
'template': 'packages.html',
'title': package_name,
'url': base_url + extract_text(eval_xpath(result, url_xpath)), # type: ignore
'content': extract_text(eval_xpath(result, content_xpath)),
'package_name': package_name,
'version': extract_text(eval_xpath(result, version_xpath)),
'popularity': extract_text(eval_xpath(result, download_count_xpath)),
'tags': eval_xpath_list(result, tags_xpath),
}
)
return results

59
searx/engines/libretranslate.py Normal file
View File

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""LibreTranslate (Free and Open Source Machine Translation API)"""
import random
import json
from searx.result_types import EngineResults
about = {
"website": 'https://libretranslate.com',
"wikidata_id": None,
"official_api_documentation": 'https://libretranslate.com/docs/',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
base_url = "https://libretranslate.com"
api_key = ""
def request(_query, params):
request_url = random.choice(base_url) if isinstance(base_url, list) else base_url
if request_url.startswith("https://libretranslate.com") and not api_key:
return None
params['url'] = f"{request_url}/translate"
args = {
'q': params['query'],
'source': params['from_lang'][1],
'target': params['to_lang'][1],
'alternatives': 3,
}
if api_key:
args['api_key'] = api_key
params['data'] = json.dumps(args)
params['method'] = 'POST'
params['headers'] = {'Content-Type': 'application/json'}
params['req_url'] = request_url
return params
def response(resp) -> EngineResults:
results = EngineResults()
json_resp = resp.json()
text = json_resp.get('translatedText')
if not text:
return results
item = results.types.Translations.Item(text=text, examples=json_resp.get('alternatives', []))
results.add(results.types.Translations(translations=[item]))
return results
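
And a sketch of the JSON body that request() sends to a LibreTranslate instance; the values are invented, and api_key is only added when one is configured.

import json

args = {'q': 'hello world', 'source': 'en', 'target': 'fr', 'alternatives': 3}
print(json.dumps(args))
# -> {"q": "hello world", "source": "en", "target": "fr", "alternatives": 3}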

74
searx/engines/lingva.py Normal file
View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Lingva (alternative Google Translate frontend)"""
from searx.result_types import EngineResults
about = {
"website": 'https://lingva.ml',
"wikidata_id": None,
"official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general', 'translate']
url = "https://lingva.thedaviddelta.com"
def request(_query, params):
params['url'] = f"{url}/api/v1/{params['from_lang'][1]}/{params['to_lang'][1]}/{params['query']}"
return params
def response(resp) -> EngineResults:
results = EngineResults()
result = resp.json()
info = result["info"]
from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
if "typo" in info:
results.append({"suggestion": from_to_prefix + info["typo"]})
if 'definitions' in info: # pylint: disable=too-many-nested-blocks
for definition in info['definitions']:
for item in definition.get('list', []):
for synonym in item.get('synonyms', []):
results.append({"suggestion": from_to_prefix + synonym})
data = []
for definition in info['definitions']:
for translation in definition['list']:
data.append(
results.types.Translations.Item(
text=result['translation'],
definitions=[translation['definition']] if translation['definition'] else [],
examples=[translation['example']] if translation['example'] else [],
synonyms=translation['synonyms'],
)
)
for translation in info["extraTranslations"]:
for word in translation["list"]:
data.append(
results.types.Translations.Item(
text=word['word'],
definitions=word['meanings'],
)
)
if not data and result['translation']:
data.append(results.types.Translations.Item(text=result['translation']))
params = resp.search_params
results.add(
results.types.Translations(
translations=data,
url=f"{url}/{params['from_lang'][1]}/{params['to_lang'][1]}/{params['query']}",
)
)
return results

Some files were not shown because too many files have changed in this diff.