first commit
56  searx/engines/1337x.py  Normal file
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""1337x

"""

from urllib.parse import quote, urljoin

from lxml import html

from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex

# about
about = {
    "website": 'https://1337x.to/',
    "wikidata_id": 'Q28134166',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

url = 'https://1337x.to/'
search_url = url + 'search/{search_term}/{pageno}/'
categories = ['files']
paging = True


def request(query, params):
    params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'])

    return params


def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
        title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))

        results.append(
            {
                'url': href,
                'title': title,
                'seed': seed,
                'leech': leech,
                'filesize': filesize,
                'template': 'torrent.html',
            }
        )

    return results
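For illustration, a minimal sketch (not part of the commit) of how the request()/response() contract behaves for this engine: request() only fills params['url'] from the search_url template. The params dict below is hand-built for the example; in SearXNG the search core prepares it.

    # hand-built params dict (assumption); normally provided by the search core
    params = {'pageno': 2}
    params = request('ubuntu iso', params)
    print(params['url'])
    # -> https://1337x.to/search/ubuntu%20iso/2/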
68  searx/engines/360search.py  Normal file
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search search engine for searxng"""

from urllib.parse import urlencode
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.so.com/",
    "wikidata_id": "Q10846064",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = ["general"]
paging = True
time_range_support = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Base URL
base_url = "https://www.so.com"


def request(query, params):
    query_params = {
        "pn": params["pageno"],
        "q": query,
    }

    if time_range_dict.get(params['time_range']):
        query_params["adv_t"] = time_range_dict.get(params['time_range'])

    params["url"] = f"{base_url}/s?{urlencode(query_params)}"
    return params


def response(resp):
    dom = html.fromstring(resp.text)
    results = []

    for item in dom.xpath('//li[contains(@class, "res-list")]'):
        title = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a'))

        url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@data-mdurl'))
        if not url:
            url = extract_text(item.xpath('.//h3[contains(@class, "res-title")]/a/@href'))

        content = extract_text(item.xpath('.//p[@class="res-desc"]'))
        if not content:
            content = extract_text(item.xpath('.//span[@class="res-list-summary"]'))

        if title and url:
            results.append(
                {
                    "title": title,
                    "url": url,
                    "content": content,
                }
            )

    return results
65  searx/engines/360search_videos.py  Normal file
@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""360Search-Videos: A search engine for retrieving videos from 360Search."""

from urllib.parse import urlencode
from datetime import datetime

from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text, get_embeded_stream_url

about = {
    "website": "https://tv.360kan.com/",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

paging = True
results_per_page = 10
categories = ["videos"]

base_url = "https://tv.360kan.com"


def request(query, params):
    query_params = {"count": 10, "q": query, "start": params["pageno"] * 10}

    params["url"] = f"{base_url}/v1/video/list?{urlencode(query_params)}"
    return params


def response(resp):
    try:
        data = resp.json()
    except Exception as e:
        raise SearxEngineAPIException(f"Invalid response: {e}") from e
    results = []

    if "data" not in data or "result" not in data["data"]:
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["result"]:
        if not entry.get("title") or not entry.get("play_url"):
            continue

        published_date = None
        if entry.get("publish_time"):
            try:
                published_date = datetime.fromtimestamp(int(entry["publish_time"]))
            except (ValueError, TypeError):
                published_date = None

        results.append(
            {
                'url': entry["play_url"],
                'title': html_to_text(entry["title"]),
                'content': html_to_text(entry["description"]),
                'template': 'videos.html',
                'publishedDate': published_date,
                'thumbnail': entry["cover_img"],
                "iframe_src": get_embeded_stream_url(entry["play_url"]),
            }
        )

    return results
76  searx/engines/9gag.py  Normal file
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""9GAG (social media)"""

from json import loads
from datetime import datetime
from urllib.parse import urlencode

about = {
    "website": 'https://9gag.com/',
    "wikidata_id": 'Q277421',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

categories = ['social media']
paging = True

search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10


def request(query, params):
    query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})

    params['url'] = search_url.format(query=query)

    return params


def response(resp):
    results = []

    json_results = loads(resp.text)['data']

    for result in json_results['posts']:
        result_type = result['type']

        # use the uncropped image as thumbnail unless it is too tall (height > 400)
        if result['images']['image700']['height'] > 400:
            thumbnail = result['images']['imageFbThumbnail']['url']
        else:
            thumbnail = result['images']['image700']['url']

        if result_type == 'Photo':
            results.append(
                {
                    'template': 'images.html',
                    'url': result['url'],
                    'title': result['title'],
                    'content': result['description'],
                    'publishedDate': datetime.fromtimestamp(result['creationTs']),
                    'img_src': result['images']['image700']['url'],
                    'thumbnail_src': thumbnail,
                }
            )
        elif result_type == 'Animated':
            results.append(
                {
                    'template': 'videos.html',
                    'url': result['url'],
                    'title': result['title'],
                    'content': result['description'],
                    'publishedDate': datetime.fromtimestamp(result['creationTs']),
                    'thumbnail': thumbnail,
                    'iframe_src': result['images'].get('image460sv', {}).get('url'),
                }
            )

    if 'tags' in json_results:
        for suggestion in json_results['tags']:
            results.append({'suggestion': suggestion['key']})

    return results
253  searx/engines/__init__.py  Normal file
@@ -0,0 +1,253 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Load and initialize the ``engines``, see :py:func:`load_engines` and register
:py:obj:`engine_shortcuts`.

usage::

   load_engines( settings['engines'] )

"""

from __future__ import annotations

import sys
import copy
from os.path import realpath, dirname

from typing import TYPE_CHECKING, Dict
import types
import inspect

from searx import logger, settings
from searx.utils import load_module

if TYPE_CHECKING:
    from searx.enginelib import Engine

logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DEFAULT_ARGS = {
    # Common options in the engine module
    "engine_type": "online",
    "paging": False,
    "time_range_support": False,
    "safesearch": False,
    # settings.yml
    "categories": ["general"],
    "enable_http": False,
    "shortcut": "-",
    "timeout": settings["outgoing"]["request_timeout"],
    "display_error_messages": True,
    "disabled": False,
    "inactive": False,
    "about": {},
    "using_tor_proxy": False,
    "send_accept_language_header": False,
    "tokens": [],
    "max_page": 0,
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'


# Defaults for the namespace of an engine module, see :py:func:`load_engine`

categories = {'general': []}
engines: Dict[str, Engine | types.ModuleType] = {}
engine_shortcuts = {}
"""Simple map of registered *shortcuts* to name of the engine (or ``None``).

::

   engine_shortcuts[engine.shortcut] = engine.name

:meta hide-value:
"""


def check_engine_module(module: types.ModuleType):
    # probe unintentional name collisions / for example name collisions caused
    # by import statements in the engine module ..

    # network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861
    obj = getattr(module, 'network', None)
    if obj and inspect.ismodule(obj):
        msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string'
        # logger.error(msg)
        raise TypeError(msg)


def load_engine(engine_data: dict) -> Engine | types.ModuleType | None:
    """Load engine from ``engine_data``.

    :param dict engine_data: Attributes from YAML ``settings:engines/<engine>``
    :return: initialized namespace of the ``<engine>``.

    1. create a namespace and load module of the ``<engine>``
    2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
    3. update namespace with values from ``engine_data``

    If engine *is active*, return namespace of the engine, otherwise return
    ``None``.

    This function also returns ``None`` if initialization of the namespace fails
    for one of the following reasons:

    - engine name contains underscore
    - engine name is not lowercase
    - required attribute is not set :py:func:`is_missing_required_attributes`

    """
    # pylint: disable=too-many-return-statements

    engine_name = engine_data.get('name')
    if engine_name is None:
        logger.error('An engine does not have a "name" field')
        return None
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        return None

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    # load_module
    module_name = engine_data.get('engine')
    if module_name is None:
        logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
        return None
    try:
        engine = load_module(module_name + '.py', ENGINE_DIR)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(module_name))
        sys.exit(1)
    except BaseException:
        logger.exception('Cannot load engine "{}"'.format(module_name))
        return None

    check_engine_module(engine)
    update_engine_attributes(engine, engine_data)
    update_attributes_for_tor(engine)

    # avoid cyclic imports
    # pylint: disable=import-outside-toplevel
    from searx.enginelib.traits import EngineTraitsMap

    trait_map = EngineTraitsMap.from_data()
    trait_map.set_traits(engine)

    if not is_engine_active(engine):
        return None

    if is_missing_required_attributes(engine):
        return None

    set_loggers(engine, engine_name)

    if not any(cat in settings['categories_as_tabs'] for cat in engine.categories):
        engine.categories.append(DEFAULT_CATEGORY)

    return engine


def set_loggers(engine, engine_name):
    # set the logger for engine
    engine.logger = logger.getChild(engine_name)
    # the engine may have loaded some other engines;
    # make sure their logger is initialized as well
    # use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
    # see https://github.com/python/cpython/issues/89516
    # and https://docs.python.org/3.10/library/sys.html#sys.modules
    modules = sys.modules.copy()
    for module_name, module in modules.items():
        if (
            module_name.startswith("searx.engines")
            and module_name != "searx.engines.__init__"
            and not hasattr(module, "logger")
        ):
            module_engine_name = module_name.split(".")[-1]
            module.logger = logger.getChild(module_engine_name)  # type: ignore


def update_engine_attributes(engine: Engine | types.ModuleType, engine_data):
    # set engine attributes from engine_data
    for param_name, param_value in engine_data.items():
        if param_name == 'categories':
            if isinstance(param_value, str):
                param_value = list(map(str.strip, param_value.split(',')))
            engine.categories = param_value  # type: ignore
        elif hasattr(engine, 'about') and param_name == 'about':
            engine.about = {**engine.about, **engine_data['about']}  # type: ignore
        else:
            setattr(engine, param_name, param_value)

    # set default attributes
    for arg_name, arg_value in ENGINE_DEFAULT_ARGS.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, copy.deepcopy(arg_value))


def update_attributes_for_tor(engine: Engine | types.ModuleType):
    if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
        engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')  # type: ignore
        engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)  # type: ignore


def is_missing_required_attributes(engine):
    """An attribute is required when its name doesn't start with ``_`` (underline).
    Required attributes must not be ``None``.

    """
    missing = False
    for engine_attr in dir(engine):
        if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            missing = True
    return missing


def using_tor_proxy(engine: Engine | types.ModuleType):
    """Return True if the engine configuration declares to use Tor."""
    return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False)


def is_engine_active(engine: Engine | types.ModuleType):
    # check if engine is inactive
    if engine.inactive is True:
        return False

    # exclude onion engines if not using tor
    if 'onions' in engine.categories and not using_tor_proxy(engine):
        return False

    return True


def register_engine(engine: Engine | types.ModuleType):
    if engine.name in engines:
        logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
        sys.exit(1)
    engines[engine.name] = engine

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)


def load_engines(engine_list):
    """usage: ``engine_list = settings['engines']``"""
    engines.clear()
    engine_shortcuts.clear()
    categories.clear()
    categories['general'] = []
    for engine_data in engine_list:
        engine = load_engine(engine_data)
        if engine:
            register_engine(engine)
    return engines
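A minimal sketch of how load_engines() is driven by the YAML engine list and what it registers. Assumptions: this runs inside a SearXNG checkout with its default settings importable, and the single engine entry below is hypothetical (it reuses the ansa module added in this commit).

    from searx.engines import load_engines, engines, engine_shortcuts

    # hypothetical engine list, shaped like the entries of settings['engines']
    engine_list = [
        {'name': 'ansa', 'engine': 'ansa', 'shortcut': 'ans'},
    ]

    load_engines(engine_list)
    print(engines['ansa'].shortcut)   # 'ans'  (module namespace with defaults applied)
    print(engine_shortcuts['ans'])    # 'ansa' (shortcut registered back to the name)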
109  searx/engines/acfun.py  Normal file
@@ -0,0 +1,109 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"


def request(query, params):
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
    return params


def response(resp):
    results = []

    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            continue

    return results


def extract_video_data(video_block):
    try:
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')[0])
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
                timediff = datetime.strptime(video_duration.strip(), "%M:%S")
                length = timedelta(minutes=timediff.minute, seconds=timediff.second)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None
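A small sketch of what the bigPipe regular expression in response() above extracts from the page source. The HTML fragment is a made-up stand-in, not real Acfun markup.

    import json
    import re

    sample = 'bigPipe.onPageletArrive({"html": "<div>video card markup</div>"});'
    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', sample, re.DOTALL)
    print(json.loads(matches[0])["html"])
    # -> <div>video card markup</div>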
229  searx/engines/adobe_stock.py  Normal file
@@ -0,0 +1,229 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
assets. Asset types include photos, vectors, illustrations, templates, 3D
assets, videos, motion graphics templates and audio tracks.

.. _Adobe Stock: https://stock.adobe.com/

Configuration
=============

The engine has the following mandatory settings:

- SearXNG's :ref:`engine categories`
- Adobe-Stock's :py:obj:`adobe_order`
- Adobe-Stock's :py:obj:`adobe_content_types`

.. code:: yaml

  - name: adobe stock
    engine: adobe_stock
    shortcut: asi
    categories: [images]
    adobe_order: relevance
    adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]

  - name: adobe stock video
    engine: adobe_stock
    network: adobe stock
    shortcut: asi
    categories: [videos]
    adobe_order: relevance
    adobe_content_types: ["video"]

Implementation
==============

"""
from __future__ import annotations

from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode

import isodate

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

about = {
    "website": "https://stock.adobe.com/",
    "wikidata_id": "Q5977430",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

categories = []
paging = True
send_accept_language_header = True
results_per_page = 10

base_url = "https://stock.adobe.com"

adobe_order: str = ""
"""Sort order, can be one of:

- ``relevance`` or
- ``featured`` or
- ``creation`` (most recent) or
- ``nb_downloads`` (number of downloads)
"""

ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types: list = []
"""A list of content types.  The following content types are offered:

- Images: ``image``
- Videos: ``video``
- Templates: ``template``
- 3D: ``3d``
- Audio: ``audio``

Additional subcategories:

- Photos: ``photo``
- Illustrations: ``illustration``
- Vectors: ``zip_vector``
"""

# Do we need support for "free_collection" and "include_stock_enterprise"?


def init(_):
    if not categories:
        raise ValueError("adobe_stock engine: categories is unset")

    # adobe_order
    if not adobe_order:
        raise ValueError("adobe_stock engine: adobe_order is unset")
    if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
        raise ValueError(f"unsupported adobe_order: {adobe_order}")

    # adobe_content_types
    if not adobe_content_types:
        raise ValueError("adobe_stock engine: adobe_content_types is unset")

    if isinstance(adobe_content_types, list):
        for t in adobe_content_types:
            if t not in ADOBE_VALID_TYPES:
                raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
    else:
        raise ValueError(
            "adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
        )


def request(query, params):

    args = {
        "k": query,
        "limit": results_per_page,
        "order": adobe_order,
        "search_page": params["pageno"],
        "search_type": "pagination",
    }

    for content_type in ADOBE_VALID_TYPES:
        args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0

    params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"

    # headers required to bypass bot-detection
    if params["searxng_locale"] == "all":
        params["headers"]["Accept-Language"] = "en-US,en;q=0.5"

    return params


def parse_image_item(item):
    return {
        "template": "images.html",
        "url": item["content_url"],
        "title": item["title"],
        "content": item["asset_type"],
        "img_src": item["content_thumb_extra_large_url"],
        "thumbnail_src": item["thumbnail_url"],
        "resolution": f"{item['content_original_width']}x{item['content_original_height']}",
        "img_format": item["format"],
        "author": item["author"],
    }


def parse_video_item(item):

    # in video items, the title is more or less a "content description"; we try
    # to reduce the length of the title ..

    title = item["title"]
    content = ""
    if "." in title.strip()[:-1]:
        content = title
        title = title.split(".", 1)[0]
    elif "," in title:
        content = title
        title = title.split(",", 1)[0]
    elif len(title) > 50:
        content = title
        title = ""
        for w in content.split(" "):
            title += f" {w}"
            if len(title) > 50:
                title = title.strip() + "\u2026"
                break

    return {
        "template": "videos.html",
        "url": item["content_url"],
        "title": title,
        "content": content,
        # https://en.wikipedia.org/wiki/ISO_8601#Durations
        "length": isodate.parse_duration(item["time_duration"]),
        "publishedDate": datetime.fromisoformat(item["creation_date"]),
        "thumbnail": item["thumbnail_url"],
        "iframe_src": item["video_small_preview_url"],
        "metadata": item["asset_type"],
    }


def parse_audio_item(item):
    audio_data = item["audio_data"]
    content = audio_data.get("description") or ""
    if audio_data.get("album"):
        content = audio_data["album"] + " - " + content

    return {
        "url": item["content_url"],
        "title": item["title"],
        "content": content,
        # "thumbnail": base_url + item["thumbnail_url"],
        "iframe_src": audio_data["preview"]["url"],
        "publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
        "length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
        "author": item.get("artist_name"),
    }


def response(resp):
    results = []

    json_resp = resp.json()

    if isinstance(json_resp["items"], list):
        return None
    for item in json_resp["items"].values():
        if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
            result = parse_image_item(item)
        elif item["asset_type"].lower() == "video":
            result = parse_video_item(item)
        elif item["asset_type"].lower() == "audio":
            result = parse_audio_item(item)
        else:
            logger.error("no handle for %s --> %s", item["asset_type"], item)
            continue
        results.append(result)

    return results
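To make the content-type filter loop in request() above concrete, a small sketch of the filters[content_type:*] arguments it produces; the configured values are illustrative assumptions, not a recommended setup.

    ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
    adobe_content_types = ["photo", "illustration"]  # assumed engine setting

    args = {}
    for content_type in ADOBE_VALID_TYPES:
        # enabled types get 1, everything else is explicitly disabled with 0
        args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0

    print(args["filters[content_type:photo]"])   # 1
    print(args["filters[content_type:video]"])   # 0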
80  searx/engines/ahmia.py  Normal file
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
"""

from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath

# about
about = {
    "website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
    "wikidata_id": 'Q18693938',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine config
categories = ['onions']
paging = True
page_size = 10

# search url
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}

# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'


def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}))

    if params['time_range'] in time_range_dict:
        params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})

    return params


def response(resp):
    results = []
    dom = fromstring(resp.text)

    # trim results so there aren't way too many at once
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = eval_xpath_list(dom, results_xpath)
    trimmed_results = all_results[first_result_index : first_result_index + page_size]

    # get results
    for result in trimmed_results:
        # remove the Ahmia redirect and extract the actual URL of the result
        raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

        title = extract_text(eval_xpath(result, title_xpath))
        content = extract_text(eval_xpath(result, content_xpath))

        results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})

    # get spelling corrections
    for correction in eval_xpath_list(dom, correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = eval_xpath(dom, number_of_results_xpath)
    if number_of_results:
        try:
            results.append({'number_of_results': int(extract_text(number_of_results))})
        except:  # pylint: disable=bare-except
            pass

    return results
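A small sketch of the redirect-URL extraction used in response() above; the sample link is a made-up Ahmia-style redirect, not a real search result.

    from urllib.parse import parse_qs, urlparse

    raw_url = 'http://ahmia.example.onion/search/redirect?search_term=x&redirect_url=http%3A%2F%2Fexample.onion%2F'
    cleaned = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
    print(cleaned)  # http://example.onion/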
83  searx/engines/alpinelinux.py  Normal file
@@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Alpine Linux binary packages`_. `Alpine Linux`_ is a Linux-based operating
system designed to be small, simple and secure. Contrary to many other Linux
distributions, it uses musl, BusyBox and OpenRC. Alpine is mostly used on
servers and for Docker images.

.. _Alpine Linux binary packages: https://pkgs.alpinelinux.org
.. _Alpine Linux: https://www.alpinelinux.org

"""

import re

from urllib.parse import urlencode
from lxml import html
from dateutil import parser

from searx.utils import eval_xpath, eval_xpath_list, extract_text

about = {
    'website': 'https://www.alpinelinux.org',
    'wikidata_id': 'Q4033826',
    'use_official_api': False,
    'official_api_documentation': None,
    'require_api_key': False,
    'results': 'HTML',
}
paging = True
categories = ['packages', 'it']

base_url = "https://pkgs.alpinelinux.org"
alpine_arch = 'x86_64'
"""Kernel architecture: ``x86_64``, ``x86``, ``aarch64``, ``armhf``,
``ppc64le``, ``s390x``, ``armv7`` or ``riscv64``"""

ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")
"""Regular expression to match supported architectures in the query string."""


def request(query, params):
    query_arch = ARCH_RE.search(query)
    if query_arch:
        query_arch = query_arch.group(0)
        query = query.replace(query_arch, '').strip()

    args = {
        # use wildcards to match more than just packages with the exact same
        # name as the query
        'name': f"*{query}*",
        'page': params['pageno'],
        'arch': query_arch or alpine_arch,
    }
    params['url'] = f"{base_url}/packages?{urlencode(args)}"
    return params


def response(resp):
    results = []

    doc = html.fromstring(resp.text)
    for result in eval_xpath_list(doc, "//table/tbody/tr"):

        if len(result.xpath("./td")) < 9:
            # skip invalid entries in the result table,
            # e.g. the "No item found..." message
            continue

        results.append(
            {
                'template': 'packages.html',
                'url': base_url + extract_text(eval_xpath(result, './td[contains(@class, "package")]/a/@href')),
                'title': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
                'package_name': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
                'publishedDate': parser.parse(extract_text(eval_xpath(result, './td[contains(@class, "bdate")]'))),
                'version': extract_text(eval_xpath(result, './td[contains(@class, "version")]')),
                'homepage': extract_text(eval_xpath(result, './td[contains(@class, "url")]/a/@href')),
                'maintainer': extract_text(eval_xpath(result, './td[contains(@class, "maintainer")]')),
                'license_name': extract_text(eval_xpath(result, './td[contains(@class, "license")]')),
                'tags': [extract_text(eval_xpath(result, './td[contains(@class, "repo")]'))],
            }
        )

    return results
202  searx/engines/annas_archive.py  Normal file
@@ -0,0 +1,202 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).

.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive

Configuration
=============

The engine has the following additional settings:

- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`

With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive.  For example an engine to search
for *newest* articles and journals (PDF) / by shortcut ``!aaa <search-term>``.

.. code:: yaml

  - name: annas articles
    engine: annas_archive
    shortcut: aaa
    aa_content: 'magazine'
    aa_ext: 'pdf'
    aa_sort: 'newest'

Implementations
===============

"""

from typing import List, Dict, Any, Optional
from urllib.parse import urlencode
from lxml import html

from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS

# about
about: Dict[str, Any] = {
    "website": "https://annas-archive.org/",
    "wikidata_id": "Q115288326",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# engine dependent config
categories: List[str] = ["files"]
paging: bool = True

# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anna's search form field **Content** / possible values::

    book_fiction, book_unknown, book_nonfiction,
    book_comic, magazine, standards_document

To not filter use an empty string (default).
"""
aa_sort: str = ''
"""Sort Anna's results, possible values::

    newest, oldest, largest, smallest

To sort by *most relevant* use an empty string (default)."""

aa_ext: str = ''
"""Filter Anna's results by a file ending.  Common filters for example are
``pdf`` and ``epub``.

.. note::

   Anna's Archive is a beta release: filtering results by file extension does not
   really work on Anna's Archive.

"""


def init(engine_settings=None):  # pylint: disable=unused-argument
    """Check the engine's settings."""
    traits = EngineTraits(**ENGINE_TRAITS['annas archive'])

    if aa_content and aa_content not in traits.custom['content']:
        raise ValueError(f'invalid setting content: {aa_content}')

    if aa_sort and aa_sort not in traits.custom['sort']:
        raise ValueError(f'invalid setting sort: {aa_sort}')

    if aa_ext and aa_ext not in traits.custom['ext']:
        raise ValueError(f'invalid setting ext: {aa_ext}')


def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
    lang = traits.get_language(params["language"], traits.all_locale)  # type: ignore
    args = {
        'lang': lang,
        'content': aa_content,
        'ext': aa_ext,
        'sort': aa_sort,
        'q': query,
        'page': params['pageno'],
    }
    # filter out None and empty values
    filtered_args = dict((k, v) for k, v in args.items() if v)
    params["url"] = f"{base_url}/search?{urlencode(filtered_args)}"
    return params


def response(resp) -> List[Dict[str, Optional[str]]]:
    results: List[Dict[str, Optional[str]]] = []
    dom = html.fromstring(resp.text)

    for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
        results.append(_get_result(item))

    # The rendering of the WEB page is very strange; except the first position
    # all other positions of Anna's result page are enclosed in SGML comments.
    # These comments are *uncommented* by some JS code, see query of class
    # '.js-scroll-hidden' in Anna's HTML template:
    # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html

    for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
        item = html.fromstring(item.xpath('./comment()')[0].text)
        results.append(_get_result(item))

    return results


def _get_result(item):
    return {
        'template': 'paper.html',
        'url': base_url + extract_text(eval_xpath_getindex(item, './@href', 0)),
        'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
        'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
        'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
        'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
        'thumbnail': extract_text(eval_xpath_getindex(item, './/img/@src', 0, default=None), allow_none=True),
    }


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and other search arguments from Anna's search form."""
    # pylint: disable=import-outside-toplevel

    import babel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag

    engine_traits.all_locale = ''
    engine_traits.custom['content'] = []
    engine_traits.custom['ext'] = []
    engine_traits.custom['sort'] = []

    resp = get(base_url + '/search')
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Anna's search page is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # supported language codes

    lang_map = {}
    for x in eval_xpath_list(dom, "//form//input[@name='lang']"):
        eng_lang = x.get("value")
        if eng_lang in ('', '_empty', 'nl-BE', 'und') or eng_lang.startswith('anti__'):
            continue
        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
        except babel.UnknownLocaleError:
            # silently ignore unknown languages
            # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
            continue
        sxng_lang = language_tag(locale)
        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = eng_lang

    for x in eval_xpath_list(dom, "//form//input[@name='content']"):
        if not x.get("value").startswith("anti__"):
            engine_traits.custom['content'].append(x.get("value"))

    for x in eval_xpath_list(dom, "//form//input[@name='ext']"):
        if not x.get("value").startswith("anti__"):
            engine_traits.custom['ext'].append(x.get("value"))

    for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
        engine_traits.custom['sort'].append(x.get("value"))

    # for better diff; sort the persistence of these traits
    engine_traits.custom['content'].sort()
    engine_traits.custom['ext'].sort()
    engine_traits.custom['sort'].sort()
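A tiny sketch of the argument filtering in request() above, which drops unset engine settings before the query string is built; the values are illustrative assumptions.

    from urllib.parse import urlencode

    args = {'lang': '', 'content': '', 'ext': 'pdf', 'sort': 'newest', 'q': 'operating systems', 'page': 1}
    filtered_args = dict((k, v) for k, v in args.items() if v)
    print(filtered_args)
    # {'ext': 'pdf', 'sort': 'newest', 'q': 'operating systems', 'page': 1}
    print(urlencode(filtered_args))
    # ext=pdf&sort=newest&q=operating+systems&page=1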
81  searx/engines/ansa.py  Normal file
@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine for Ansa, Italy's oldest news agency.

To use this engine add the following entry to your engines
list in ``settings.yml``:

.. code:: yaml

  - name: ansa
    engine: ansa
    shortcut: ans
    disabled: false

"""

from urllib.parse import urlencode
from lxml import html
from searx.result_types import EngineResults, MainResult
from searx.utils import eval_xpath, eval_xpath_list, extract_text

engine_type = 'online'
language_support = False
categories = ['news']
paging = True
page_size = 12
base_url = 'https://www.ansa.it'

time_range_support = True
time_range_args = {
    'day': 1,
    'week': 7,
    'month': 31,
    'year': 365,
}
# https://www.ansa.it/ricerca/ansait/search.shtml?start=0&any=houthi&periodo=&sort=data%3Adesc
search_api = 'https://www.ansa.it/ricerca/ansait/search.shtml?'

about = {
    'website': 'https://www.ansa.it',
    'wikidata_id': 'Q392934',
    'official_api_documentation': None,
    'use_official_api': False,
    'require_api_key': False,
    'results': 'HTML',
    'language': 'it',
}


def request(query, params):
    query_params = {
        'any': query,
        'start': (params['pageno'] - 1) * page_size,
        'sort': "data:desc",
    }

    if params['time_range']:
        query_params['periodo'] = time_range_args.get(params['time_range'])

    params['url'] = search_api + urlencode(query_params)
    return params


def response(resp) -> EngineResults:
    res = EngineResults()
    doc = html.fromstring(resp.text)

    for result in eval_xpath_list(doc, "//div[@class='article']"):

        res_obj = MainResult(
            title=extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a")),
            content=extract_text(eval_xpath(result, "./div[@class='content']/div[@class='text']")),
            url=base_url + extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a/@href")),
        )

        thumbnail = extract_text(eval_xpath(result, "./div[@class='image']/a/img/@src"))
        if thumbnail:
            res_obj.thumbnail = base_url + thumbnail

        res.append(res_obj)

    return res
61  searx/engines/apkmirror.py  Normal file
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""APKMirror
"""

# pylint: disable=invalid-name

from urllib.parse import urlencode
from lxml import html

from searx.utils import (
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

about = {
    "website": 'https://www.apkmirror.com',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['files', 'apps']
paging = True
time_range_support = False

# search-url
base_url = 'https://www.apkmirror.com'
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'


def request(query, params):
    params['url'] = search_url.format(
        pageno=params['pageno'],
        query=urlencode({'s': query}),
    )
    logger.debug("query_url --> %s", params['url'])
    return params


def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):

        link = eval_xpath_getindex(result, './/h5/a', 0)

        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        thumbnail = base_url + eval_xpath_getindex(result, './/img/@src', 0)
        res = {'url': url, 'title': title, 'thumbnail': thumbnail}

        results.append(res)

    return results
56  searx/engines/apple_app_store.py  Normal file
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple App Store

"""

from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse

about = {
    "website": 'https://www.apple.com/app-store/',
    "wikidata_id": 'Q368215',
    "official_api_documentation": (
        'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
        'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
    ),
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

categories = ['files', 'apps']
safesearch = True

search_url = 'https://itunes.apple.com/search?{query}'


def request(query, params):
    explicit = "Yes"

    if params['safesearch'] > 0:
        explicit = "No"

    params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))

    return params


def response(resp):
    results = []

    json_result = loads(resp.text)

    for result in json_result['results']:
        results.append(
            {
                'url': result['trackViewUrl'],
                'title': result['trackName'],
                'content': result['description'],
                'thumbnail': result['artworkUrl100'],
                'publishedDate': parse(result['currentVersionReleaseDate']),
                'author': result['sellerName'],
            }
        )

    return results
112  searx/engines/apple_maps.py  Normal file
@@ -0,0 +1,112 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Apple Maps"""

from json import loads
from time import time
from urllib.parse import urlencode

from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label

about = {
    "website": 'https://www.apple.com/maps/',
    "wikidata_id": 'Q276101',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

token = {'value': '', 'last_updated': None}

categories = ['map']
paging = False

search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"


def obtain_token():
    update_time = time() - (time() % 1800)
    try:
        # use duckduckgo's mapkit token
        token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
        actual_token = http_get(
            'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
            timeout=2.0,
            headers={'Authorization': 'Bearer ' + token_response.text},
        )
        token['value'] = loads(actual_token.text)['authInfo']['access_token']
        token['last_updated'] = update_time
    # pylint: disable=bare-except
    except:
        pass
    return token


def request(query, params):
    if time() - (token['last_updated'] or 0) > 1800:
        obtain_token()

    params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))

    params['headers'] = {'Authorization': 'Bearer ' + token['value']}

    return params


def response(resp):
    results = []

    resp_json = loads(resp.text)

    user_language = resp.search_params['language']

    for result in resp_json['results']:
        boundingbox = None
        if 'displayMapRegion' in result:
            box = result['displayMapRegion']
            boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]

        links = []
        if 'telephone' in result:
            telephone = result['telephone']
            links.append(
                {
                    'label': get_key_label('phone', user_language),
                    'url': 'tel:' + telephone,
                    'url_label': telephone,
                }
            )
        if result.get('urls'):
            url = result['urls'][0]
            links.append(
                {
                    'label': get_key_label('website', user_language),
                    'url': url,
                    'url_label': url,
                }
            )

        results.append(
            {
                'template': 'map.html',
                'type': result.get('poiCategory'),
                'title': result['name'],
                'links': links,
                'latitude': result['center']['lat'],
                'longitude': result['center']['lng'],
                'url': result['placecardUrl'],
                'boundingbox': boundingbox,
                'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
                'address': {
                    'name': result['name'],
                    'house_number': result.get('subThoroughfare'),
                    'road': result.get('thoroughfare'),
                    'locality': result.get('locality'),
                    'postcode': result.get('postCode'),
                    'country': result.get('country'),
                },
            }
        )

    return results
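For clarity, a short sketch of the half-hour token window computed in obtain_token() and checked in request(); the timestamp is an arbitrary assumption.

    now = 1_700_000_000.0             # arbitrary epoch timestamp
    update_time = now - (now % 1800)  # rounded down to the start of the current 30-minute window
    print(update_time)                # 1699999200.0
    print(now - update_time > 1800)   # False -> a token stamped with this window is still considered fresh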
154  searx/engines/archlinux.py  Normal file
@@ -0,0 +1,154 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~

This implementation does not use an official API: MediaWiki provides an API, but
the Arch Wiki blocks access to it.

"""

from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel

from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits


about = {
    "website": 'https://wiki.archlinux.org/',
    "wikidata_id": 'Q101445877',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['it', 'software wikis']
paging = True
main_wiki = 'wiki.archlinux.org'


def request(query, params):

    sxng_lang = params['searxng_locale'].split('-')[0]
    netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)  # type: ignore
    title: str = traits.custom['title'].get(sxng_lang, 'Special:Search')  # type: ignore
    base_url = 'https://' + netloc + '/index.php?'
    offset = (params['pageno'] - 1) * 20

    if netloc == main_wiki:
        eng_lang: str = traits.get_language(sxng_lang, 'English')  # type: ignore
        query += ' (' + eng_lang + ')'
        # wiki.archlinux.org is protected by anubis
        # - https://github.com/searxng/searxng/issues/4646#issuecomment-2817848019
        params['headers']['User-Agent'] = "SearXNG"
    elif netloc == 'wiki.archlinuxcn.org':
        base_url = 'https://' + netloc + '/wzh/index.php?'

    args = {
        'search': query,
        'title': title,
        'limit': 20,
        'offset': offset,
        'profile': 'default',
    }

    params['url'] = base_url + urlencode(args)
    return params


def response(resp):

    results = []
    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # get the base URL for the language in which the request was made
    sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
    netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)  # type: ignore
    base_url = 'https://' + netloc + '/index.php?'

    for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
        link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
        content = extract_text(result.xpath('.//div[@class="searchresult"]'))
        results.append(
            {
                'url': urljoin(base_url, link.get('href')),  # type: ignore
                'title': extract_text(link),
                'content': content,
            }
        )

    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Archlinux-Wiki.  The location of the Wiki address of a
    language is mapped in a :py:obj:`custom field
    <searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``).  Depending
    on the location, the ``title`` argument in the request is translated.

    .. code:: python

       "custom": {
          "wiki_netloc": {
             "de": "wiki.archlinux.de",
             # ...
             "zh": "wiki.archlinuxcn.org"
          }
          "title": {
             "de": "Spezial:Suche",
             # ...
             "zh": "Special:\u641c\u7d22"
          },
       },

    """
    # pylint: disable=import-outside-toplevel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['title'] = {}

    title_map = {
        'de': 'Spezial:Suche',
        'fa': 'ویژه:جستجو',
        'ja': '特別:検索',
        'zh': 'Special:搜索',
    }

    resp = get('https://wiki.archlinux.org/')
    if not resp.ok:  # type: ignore
        print("ERROR: response from wiki.archlinux.org is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore
    for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):

        sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
        # zh_Hans --> zh
        sxng_tag = sxng_tag.split('_')[0]

        netloc = urlparse(a.get('href')).netloc
        if netloc != 'wiki.archlinux.org':
            title = title_map.get(sxng_tag)
            if not title:
                print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
                continue
            engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
            engine_traits.custom['title'][sxng_tag] = title  # type: ignore

        eng_tag = extract_text(eval_xpath_list(a, ".//span"))
        engine_traits.languages[sxng_tag] = eng_tag  # type: ignore

    engine_traits.languages['en'] = 'English'
67
searx/engines/artic.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""The Art Institute of Chicago
|
||||
|
||||
Explore thousands of artworks from The Art Institute of Chicago.
|
||||
|
||||
* https://artic.edu
|
||||
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
|
||||
about = {
|
||||
"website": 'https://www.artic.edu',
|
||||
"wikidata_id": 'Q239303',
|
||||
"official_api_documentation": 'http://api.artic.edu/docs/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
categories = ['images']
|
||||
paging = True
|
||||
nb_per_page = 20
|
||||
|
||||
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
|
||||
image_api = 'https://www.artic.edu/iiif/2/'
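# The path appended to image_api below follows the IIIF Image API pattern
# {identifier}/{region}/{size}/{rotation}/{quality}.{format}; each result asks
# for the full region at a width of 843 pixels in default JPEG quality.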
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
args = urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'page': params['pageno'],
|
||||
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
|
||||
'limit': nb_per_page,
|
||||
}
|
||||
)
|
||||
params['url'] = search_api + args
|
||||
|
||||
logger.debug("query_url --> %s", params['url'])
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
json_data = loads(resp.text)
|
||||
|
||||
for result in json_data['data']:
|
||||
|
||||
if not result['image_id']:
|
||||
continue
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': 'https://artic.edu/artworks/%(id)s' % result,
|
||||
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
|
||||
'content': "%(medium_display)s // %(dimensions)s" % result,
|
||||
'author': ', '.join(result['artist_titles']),
|
||||
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
|
||||
'template': 'images.html',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
110
searx/engines/arxiv.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""ArXiV (Scientific preprints)
|
||||
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://arxiv.org',
|
||||
"wikidata_id": 'Q118398',
|
||||
"official_api_documentation": 'https://arxiv.org/help/api',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'XML-RSS',
|
||||
}
|
||||
|
||||
categories = ['science', 'scientific publications']
|
||||
paging = True
|
||||
|
||||
base_url = (
|
||||
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
|
||||
)
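# For example, page 1 of a query for "electron" is fetched from
#   https://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10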
|
||||
|
||||
# engine dependent config
|
||||
number_of_results = 10
|
||||
|
||||
# xpaths
|
||||
arxiv_namespaces = {
|
||||
"atom": "http://www.w3.org/2005/Atom",
|
||||
"arxiv": "http://arxiv.org/schemas/atom",
|
||||
}
|
||||
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
|
||||
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
|
||||
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
|
||||
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
|
||||
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
|
||||
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
|
||||
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
|
||||
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
|
||||
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
|
||||
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
|
||||
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
|
||||
|
||||
|
||||
def request(query, params):
|
||||
# basic search
|
||||
offset = (params['pageno'] - 1) * number_of_results
|
||||
|
||||
string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
|
||||
|
||||
params['url'] = base_url.format(**string_args)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
dom = etree.fromstring(resp.content)
|
||||
for entry in eval_xpath_list(dom, xpath_entry):
|
||||
title = eval_xpath_getindex(entry, xpath_title, 0).text
|
||||
|
||||
url = eval_xpath_getindex(entry, xpath_id, 0).text
|
||||
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
|
||||
|
||||
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
|
||||
|
||||
# doi
|
||||
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
|
||||
doi = None if doi_element is None else doi_element.text
|
||||
|
||||
# pdf
|
||||
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
|
||||
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
|
||||
|
||||
# journal
|
||||
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
|
||||
journal = None if journal_element is None else journal_element.text
|
||||
|
||||
# tags
|
||||
tag_elements = eval_xpath(entry, xpath_category)
|
||||
tags = [str(tag) for tag in tag_elements]
|
||||
|
||||
# comments
|
||||
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
|
||||
comments = None if comments_elements is None else comments_elements.text
|
||||
|
||||
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
res_dict = {
|
||||
'template': 'paper.html',
|
||||
'url': url,
|
||||
'title': title,
|
||||
'publishedDate': publishedDate,
|
||||
'content': abstract,
|
||||
'doi': doi,
|
||||
'authors': authors,
|
||||
'journal': journal,
|
||||
'tags': tags,
|
||||
'comments': comments,
|
||||
'pdf_url': pdf_url,
|
||||
}
|
||||
|
||||
results.append(res_dict)
|
||||
|
||||
return results
|
||||
75
searx/engines/ask.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Ask.com"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
import dateutil
|
||||
from lxml import html
|
||||
from searx import utils
|
||||
|
||||
# Metadata
|
||||
about = {
|
||||
"website": "https://www.ask.com/",
|
||||
"wikidata_id": 'Q847564',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "HTML",
|
||||
}
|
||||
|
||||
# Engine Configuration
|
||||
categories = ['general']
|
||||
paging = True
|
||||
max_page = 5
|
||||
"""Ask.com has at max 5 pages."""
|
||||
|
||||
# Base URL
|
||||
base_url = "https://www.ask.com/web"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
query_params = {
|
||||
"q": query,
|
||||
"page": params["pageno"],
|
||||
}
|
||||
|
||||
params["url"] = f"{base_url}?{urlencode(query_params)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
start_tag = 'window.MESON.initialState = {'
|
||||
end_tag = '}};'
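# The payload is embedded as a JS assignment; the slicing below keeps the
# leading '{' of start_tag and everything up to the closing '}}' so that only
# the bare object literal is handed to js_variable_to_python().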
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
script = utils.eval_xpath_getindex(dom, '//script', 0, default=None).text
|
||||
|
||||
pos = script.index(start_tag) + len(start_tag) - 1
|
||||
script = script[pos:]
|
||||
pos = script.index(end_tag) + len(end_tag) - 1
|
||||
script = script[:pos]
|
||||
|
||||
json_resp = utils.js_variable_to_python(script)
|
||||
|
||||
results = []
|
||||
|
||||
for item in json_resp['search']['webResults']['results']:
|
||||
|
||||
pubdate_original = item.get('pubdate_original')
|
||||
if pubdate_original:
|
||||
pubdate_original = dateutil.parser.parse(pubdate_original)
|
||||
metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]
|
||||
|
||||
results.append(
|
||||
{
|
||||
"url": item['url'].split('&ueid')[0],
|
||||
"title": item['title'],
|
||||
"content": item['abstract'],
|
||||
"publishedDate": pubdate_original,
|
||||
# "thumbnail": item.get('image_url') or None, # these are not thumbs / to large
|
||||
"metadata": ' | '.join(metadata),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
93
searx/engines/astrophysics_data_system.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
""".. sidebar:: info
|
||||
|
||||
The Astrophysics Data System (ADS) is a digital library portal for researchers in astronomy and physics,
|
||||
operated by the Smithsonian Astrophysical Observatory (SAO) under a NASA grant.
|
||||
The engine is adapted from the solr engine.
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=global-statement
|
||||
|
||||
from datetime import datetime
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
|
||||
about = {
|
||||
"website": 'https://ui.adsabs.harvard.edu/',
|
||||
"wikidata_id": 'Q752099',
|
||||
"official_api_documentation": 'https://ui.adsabs.harvard.edu/help/api/api-docs.html',
|
||||
"use_official_api": True,
|
||||
"require_api_key": True,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
base_url = 'https://api.adsabs.harvard.edu/v1/search'
|
||||
result_base_url = 'https://ui.adsabs.harvard.edu/abs/'
|
||||
rows = 10
|
||||
sort = '' # sorting: asc or desc
|
||||
field_list = ['bibcode', 'author', 'title', 'abstract', 'doi', 'date'] # list of field names to display on the UI
|
||||
default_fields = '' # default field to query
|
||||
query_fields = '' # query fields
|
||||
paging = True
|
||||
api_key = 'unset'
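# A minimal engine entry in settings.yml could look roughly like this (a sketch,
# the exact engine name in your configuration may differ):
#
#   - name: astrophysics data system
#     engine: astrophysics_data_system
#     api_key: "your-ADS-API-key"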
|
||||
|
||||
|
||||
def init(_):
|
||||
if api_key == 'unset':
|
||||
raise SearxEngineAPIException('missing ADS API key')
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query_params = {'q': query, 'rows': rows}
|
||||
if field_list:
|
||||
query_params['fl'] = ','.join(field_list)
|
||||
if query_fields:
|
||||
query_params['qf'] = ','.join(query_fields)
|
||||
if default_fields:
|
||||
query_params['df'] = default_fields
|
||||
if sort:
|
||||
query_params['sort'] = sort
|
||||
|
||||
query_params['start'] = rows * (params['pageno'] - 1)
|
||||
|
||||
params['headers']['Authorization'] = f'Bearer {api_key}'
|
||||
params['url'] = f"{base_url}/query?{urlencode(query_params)}"
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
try:
|
||||
resp_json = loads(resp.text)
|
||||
except Exception as e:
|
||||
raise SearxEngineAPIException("failed to parse response") from e
|
||||
|
||||
if 'error' in resp_json:
|
||||
raise SearxEngineAPIException(resp_json['error']['msg'])
|
||||
|
||||
resp_json = resp_json["response"]
|
||||
result_len = resp_json["numFound"]
|
||||
results = []
|
||||
|
||||
for res in resp_json["docs"]:
|
||||
author = res.get("author")
|
||||
|
||||
if author:
|
||||
author = author[0] + ' et al.'
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': result_base_url + res.get("bibcode") + "/",
|
||||
'title': res.get("title")[0],
|
||||
'author': author,
|
||||
'content': res.get("abstract"),
|
||||
'doi': res.get("doi"),
|
||||
'publishedDate': datetime.fromisoformat(res.get("date")),
|
||||
}
|
||||
)
|
||||
|
||||
results.append({'number_of_results': result_len})
|
||||
|
||||
return results
|
||||
182
searx/engines/baidu.py
Normal file
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Baidu_
|
||||
|
||||
.. _Baidu: https://www.baidu.com
|
||||
"""
|
||||
|
||||
# There exists a https://github.com/ohblue/baidu-serp-api/
# but we don't use it here (though we may be able to learn from it).
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
from html import unescape
|
||||
import time
|
||||
import json
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.utils import html_to_text
|
||||
|
||||
about = {
|
||||
"website": "https://www.baidu.com",
|
||||
"wikidata_id": "Q14772",
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
"language": "zh",
|
||||
}
|
||||
|
||||
paging = True
|
||||
categories = []
|
||||
results_per_page = 10
|
||||
|
||||
baidu_category = 'general'
|
||||
|
||||
time_range_support = True
|
||||
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
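# The values above are range lengths in seconds; request() converts the selected
# range into an absolute window (now - length, now), used for the 'gpc' filter
# (general) or the timestamp_range in 'paramList' (it category).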
|
||||
|
||||
|
||||
def init(_):
|
||||
if baidu_category not in ('general', 'images', 'it'):
|
||||
raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
|
||||
|
||||
|
||||
def request(query, params):
|
||||
page_num = params["pageno"]
|
||||
|
||||
category_config = {
|
||||
'general': {
|
||||
'endpoint': 'https://www.baidu.com/s',
|
||||
'params': {
|
||||
"wd": query,
|
||||
"rn": results_per_page,
|
||||
"pn": (page_num - 1) * results_per_page,
|
||||
"tn": "json",
|
||||
},
|
||||
},
|
||||
'images': {
|
||||
'endpoint': 'https://image.baidu.com/search/acjson',
|
||||
'params': {
|
||||
"word": query,
|
||||
"rn": results_per_page,
|
||||
"pn": (page_num - 1) * results_per_page,
|
||||
"tn": "resultjson_com",
|
||||
},
|
||||
},
|
||||
'it': {
|
||||
'endpoint': 'https://kaifa.baidu.com/rest/v1/search',
|
||||
'params': {
|
||||
"wd": query,
|
||||
"pageSize": results_per_page,
|
||||
"pageNum": page_num,
|
||||
"paramList": f"page_num={page_num},page_size={results_per_page}",
|
||||
"position": 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
query_params = category_config[baidu_category]['params']
|
||||
query_url = category_config[baidu_category]['endpoint']
|
||||
|
||||
if params.get("time_range") in time_range_dict:
|
||||
now = int(time.time())
|
||||
past = now - time_range_dict[params["time_range"]]
|
||||
|
||||
if baidu_category == 'general':
|
||||
query_params["gpc"] = f"stf={past},{now}|stftype=1"
|
||||
|
||||
if baidu_category == 'it':
|
||||
query_params["paramList"] += f",timestamp_range={past}-{now}"
|
||||
|
||||
params["url"] = f"{query_url}?{urlencode(query_params)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
text = resp.text
|
||||
if baidu_category == 'images':
|
||||
# baidu's JSON encoder wrongly escapes the / and ' characters as \/ and \'
|
||||
text = text.replace(r"\/", "/").replace(r"\'", "'")
|
||||
data = json.loads(text, strict=False)
|
||||
parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it}
|
||||
|
||||
return parsers[baidu_category](data)
|
||||
|
||||
|
||||
def parse_general(data):
|
||||
results = []
|
||||
if not data.get("feed", {}).get("entry"):
|
||||
raise SearxEngineAPIException("Invalid response")
|
||||
|
||||
for entry in data["feed"]["entry"]:
|
||||
if not entry.get("title") or not entry.get("url"):
|
||||
continue
|
||||
|
||||
published_date = None
|
||||
if entry.get("time"):
|
||||
try:
|
||||
published_date = datetime.fromtimestamp(entry["time"])
|
||||
except (ValueError, TypeError):
|
||||
published_date = None
|
||||
|
||||
# title and content sometimes contain HTML-escaped characters such as & ' " etc.
|
||||
title = unescape(entry["title"])
|
||||
content = unescape(entry.get("abs", ""))
|
||||
|
||||
results.append(
|
||||
{
|
||||
"title": title,
|
||||
"url": entry["url"],
|
||||
"content": content,
|
||||
"publishedDate": published_date,
|
||||
}
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def parse_images(data):
|
||||
results = []
|
||||
if "data" in data:
|
||||
for item in data["data"]:
|
||||
if not item:
|
||||
# the last item in the JSON list is empty, the JSON string ends with "}, {}]"
|
||||
continue
|
||||
replace_url = item.get("replaceUrl", [{}])[0]
|
||||
width = item.get("width")
|
||||
height = item.get("height")
|
||||
img_date = item.get("bdImgnewsDate")
|
||||
publishedDate = None
|
||||
if img_date:
|
||||
publishedDate = datetime.strptime(img_date, "%Y-%m-%d %H:%M")
|
||||
results.append(
|
||||
{
|
||||
"template": "images.html",
|
||||
"url": replace_url.get("FromURL"),
|
||||
"thumbnail_src": item.get("thumbURL"),
|
||||
"img_src": replace_url.get("ObjURL"),
|
||||
"title": html_to_text(item.get("fromPageTitle")),
|
||||
"source": item.get("fromURLHost"),
|
||||
"resolution": f"{width} x {height}",
|
||||
"img_format": item.get("type"),
|
||||
"filesize": item.get("filesize"),
|
||||
"publishedDate": publishedDate,
|
||||
}
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def parse_it(data):
|
||||
results = []
|
||||
if not data.get("data", {}).get("documents", {}).get("data"):
|
||||
raise SearxEngineAPIException("Invalid response")
|
||||
|
||||
for entry in data["data"]["documents"]["data"]:
|
||||
results.append(
|
||||
{
|
||||
'title': entry["techDocDigest"]["title"],
|
||||
'url': entry["techDocDigest"]["url"],
|
||||
'content': entry["techDocDigest"]["summary"],
|
||||
}
|
||||
)
|
||||
return results
|
||||
81
searx/engines/bandcamp.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Bandcamp (Music)
|
||||
|
||||
@website https://bandcamp.com/
|
||||
@provide-api no
|
||||
@results HTML
|
||||
@parse url, title, content, publishedDate, iframe_src, thumbnail
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode, urlparse, parse_qs
|
||||
from dateutil.parser import parse as dateparse
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import (
|
||||
eval_xpath_getindex,
|
||||
eval_xpath_list,
|
||||
extract_text,
|
||||
)
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://bandcamp.com/',
|
||||
"wikidata_id": 'Q545966',
|
||||
"official_api_documentation": 'https://bandcamp.com/developer',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['music']
|
||||
paging = True
|
||||
|
||||
base_url = "https://bandcamp.com/"
|
||||
search_string = 'search?{query}&page={page}'
|
||||
iframe_src = "https://bandcamp.com/EmbeddedPlayer/{type}={result_id}/size=large/bgcol=000/linkcol=fff/artwork=small"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
|
||||
params['url'] = base_url + search_path
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, '//li[contains(@class, "searchresult")]'):
|
||||
|
||||
link = eval_xpath_getindex(result, './/div[@class="itemurl"]/a', 0, default=None)
|
||||
if link is None:
|
||||
continue
|
||||
|
||||
title = result.xpath('.//div[@class="heading"]/a/text()')
|
||||
content = result.xpath('.//div[@class="subhead"]/text()')
|
||||
new_result = {
|
||||
"url": extract_text(link),
|
||||
"title": extract_text(title),
|
||||
"content": extract_text(content),
|
||||
}
|
||||
|
||||
date = eval_xpath_getindex(result, '//div[@class="released"]/text()', 0, default=None)
|
||||
if date:
|
||||
new_result["publishedDate"] = dateparse(date.replace("released ", ""))
|
||||
|
||||
thumbnail = result.xpath('.//div[@class="art"]/img/@src')
|
||||
if thumbnail:
|
||||
new_result['thumbnail'] = thumbnail[0]
|
||||
|
||||
result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
|
||||
itemtype = extract_text(result.xpath('.//div[@class="itemtype"]')).lower()
|
||||
if "album" == itemtype:
|
||||
new_result["iframe_src"] = iframe_src.format(type='album', result_id=result_id)
|
||||
elif "track" == itemtype:
|
||||
new_result["iframe_src"] = iframe_src.format(type='track', result_id=result_id)
|
||||
|
||||
results.append(new_result)
|
||||
return results
|
||||
118
searx/engines/base.py
Executable file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""BASE (Scholar publications)
|
||||
|
||||
"""
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import etree
|
||||
from searx.utils import searx_useragent
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://base-search.net',
|
||||
"wikidata_id": 'Q448335',
|
||||
"official_api_documentation": 'https://api.base-search.net/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'XML',
|
||||
}
|
||||
|
||||
categories = ['science']
|
||||
|
||||
base_url = (
|
||||
'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'
|
||||
+ '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
|
||||
)
|
||||
|
||||
# engine dependent config
|
||||
paging = True
|
||||
number_of_results = 10
|
||||
|
||||
# shortcuts for advanced search
|
||||
shortcut_dict = {
|
||||
# user-friendly keywords
|
||||
'format:': 'dcformat:',
|
||||
'author:': 'dccreator:',
|
||||
'collection:': 'dccollection:',
|
||||
'hdate:': 'dchdate:',
|
||||
'contributor:': 'dccontributor:',
|
||||
'coverage:': 'dccoverage:',
|
||||
'date:': 'dcdate:',
|
||||
'abstract:': 'dcdescription:',
|
||||
'urls:': 'dcidentifier:',
|
||||
'language:': 'dclanguage:',
|
||||
'publisher:': 'dcpublisher:',
|
||||
'relation:': 'dcrelation:',
|
||||
'rights:': 'dcrights:',
|
||||
'source:': 'dcsource:',
|
||||
'subject:': 'dcsubject:',
|
||||
'title:': 'dctitle:',
|
||||
'type:': 'dcdctype:',
|
||||
}
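# For example, the query 'author:doe title:neutrino' is rewritten by request()
# to 'dccreator:doe dctitle:neutrino' before it is sent to the API.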
|
||||
|
||||
|
||||
def request(query, params):
|
||||
# replace shortcuts with API advanced search keywords
|
||||
for key, val in shortcut_dict.items():
|
||||
query = re.sub(key, val, query)
|
||||
|
||||
# basic search
|
||||
offset = (params['pageno'] - 1) * number_of_results
|
||||
|
||||
string_args = {
|
||||
'query': urlencode({'query': query}),
|
||||
'offset': offset,
|
||||
'hits': number_of_results,
|
||||
}
|
||||
|
||||
params['url'] = base_url.format(**string_args)
|
||||
|
||||
params['headers']['User-Agent'] = searx_useragent()
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_results = etree.XML(resp.content)
|
||||
|
||||
for entry in search_results.xpath('./result/doc'):
|
||||
content = "No description available"
|
||||
url = ""
|
||||
title = ""
|
||||
date = datetime.now() # needed in case no dcdate is available for an item
|
||||
|
||||
for item in entry:
|
||||
if item.attrib["name"] == "dcdate":
|
||||
date = item.text
|
||||
|
||||
elif item.attrib["name"] == "dctitle":
|
||||
title = item.text
|
||||
|
||||
elif item.attrib["name"] == "dclink":
|
||||
url = item.text
|
||||
|
||||
elif item.attrib["name"] == "dcdescription":
|
||||
content = item.text[:300]
|
||||
if len(item.text) > 300:
|
||||
content += "..."
|
||||
|
||||
# dates returned by the BASE API come in several formats
|
||||
publishedDate = None
|
||||
for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
|
||||
try:
|
||||
publishedDate = datetime.strptime(date, date_format)
|
||||
break
|
||||
except: # pylint: disable=bare-except
|
||||
pass
|
||||
|
||||
if publishedDate is not None:
|
||||
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
|
||||
else:
|
||||
res_dict = {'url': url, 'title': title, 'content': content}
|
||||
|
||||
results.append(res_dict)
|
||||
|
||||
return results
|
||||
96
searx/engines/bilibili.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Bilibili is a Chinese video sharing website.
|
||||
|
||||
.. _Bilibili: https://www.bilibili.com
|
||||
"""
|
||||
|
||||
import random
|
||||
import string
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from searx import utils
|
||||
|
||||
# Engine metadata
|
||||
about = {
|
||||
"website": "https://www.bilibili.com",
|
||||
"wikidata_id": "Q3077586",
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
# Engine configuration
|
||||
paging = True
|
||||
results_per_page = 20
|
||||
categories = ["videos"]
|
||||
|
||||
# Search URL
|
||||
base_url = "https://api.bilibili.com/x/web-interface/search/type"
|
||||
|
||||
cookie = {
|
||||
"innersign": "0",
|
||||
"buvid3": "".join(random.choice(string.hexdigits) for _ in range(16)) + "infoc",
|
||||
"i-wanna-go-back": "-1",
|
||||
"b_ut": "7",
|
||||
"FEED_LIVE_VERSION": "V8",
|
||||
"header_theme_version": "undefined",
|
||||
"home_feed_column": "4",
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query_params = {
|
||||
"__refresh__": "true",
|
||||
"page": params["pageno"],
|
||||
"page_size": results_per_page,
|
||||
"single_column": "0",
|
||||
"keyword": query,
|
||||
"search_type": "video",
|
||||
}
|
||||
|
||||
params["url"] = f"{base_url}?{urlencode(query_params)}"
|
||||
params["cookies"] = cookie
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
search_res = resp.json()
|
||||
|
||||
results = []
|
||||
|
||||
for item in search_res.get("data", {}).get("result", []):
|
||||
title = utils.html_to_text(item["title"])
|
||||
url = item["arcurl"]
|
||||
thumbnail = item["pic"]
|
||||
description = item["description"]
|
||||
author = item["author"]
|
||||
video_id = item["aid"]
|
||||
unix_date = item["pubdate"]
|
||||
|
||||
formatted_date = datetime.fromtimestamp(unix_date)
|
||||
|
||||
# the duration only seems to be valid if the video is less than 60 mins
|
||||
duration = utils.parse_duration_string(item["duration"])
|
||||
if duration and duration > timedelta(minutes=60):
|
||||
duration = None
|
||||
|
||||
iframe_url = f"https://player.bilibili.com/player.html?aid={video_id}&high_quality=1&autoplay=false&danmaku=0"
|
||||
|
||||
results.append(
|
||||
{
|
||||
"title": title,
|
||||
"url": url,
|
||||
"content": description,
|
||||
"author": author,
|
||||
"publishedDate": formatted_date,
|
||||
"length": duration,
|
||||
"thumbnail": thumbnail,
|
||||
"iframe_src": iframe_url,
|
||||
"template": "videos.html",
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
284
searx/engines/bing.py
Normal file
@@ -0,0 +1,284 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Bing-WEB engine. Some of this
|
||||
implementations are shared by other engines:
|
||||
|
||||
- :ref:`bing images engine`
|
||||
- :ref:`bing news engine`
|
||||
- :ref:`bing videos engine`
|
||||
|
||||
On the `preference page`_ Bing offers a lot of languages and regions (see section
LANGUAGE and COUNTRY/REGION). The language is the language of the UI; we need it
in SearXNG to get the translations of data such as *"published last week"*.
|
||||
|
||||
There is a description of the official search-APIs_, unfortunately this is not
|
||||
the API we can use or that bing itself would use. You can look up some things
|
||||
in the API to get a better picture of bing, but the value specifications like
|
||||
the market codes are usually outdated or at least no longer used by bing itself.
|
||||
|
||||
The market codes have been harmonized and are identical for web, video and
|
||||
images. The news area has also been harmonized with the other categories. Only
|
||||
political adjustments still seem to be made -- for example, there is no news
|
||||
category for the Chinese market.
|
||||
|
||||
.. _preference page: https://www.bing.com/account/general
|
||||
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
|
||||
|
||||
"""
|
||||
# pylint: disable=too-many-branches, invalid-name
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import base64
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import parse_qs, urlencode, urlparse
|
||||
from lxml import html
|
||||
import babel
|
||||
import babel.languages
|
||||
|
||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.locales import language_tag, region_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
about = {
|
||||
"website": 'https://www.bing.com',
|
||||
"wikidata_id": 'Q182496',
|
||||
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
max_page = 200
|
||||
"""200 pages maximum (``&first=1991``)"""
|
||||
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
"""Bing results are always SFW. To get NSFW links from bing some age
|
||||
verification by a cookie is needed / thats not possible in SearXNG.
|
||||
"""
|
||||
|
||||
base_url = 'https://www.bing.com/search'
|
||||
"""Bing (Web) search URL"""
|
||||
|
||||
|
||||
def _page_offset(pageno):
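# pageno 1 --> first=1, pageno 2 --> first=11, pageno 3 --> first=21, ...
# (Bing serves 10 organic results per page)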
|
||||
return (int(pageno) - 1) * 10 + 1
|
||||
|
||||
|
||||
def set_bing_cookies(params, engine_language, engine_region):
|
||||
params['cookies']['_EDGE_CD'] = f'm={engine_region}&u={engine_language}'
|
||||
params['cookies']['_EDGE_S'] = f'mkt={engine_region}&ui={engine_language}'
|
||||
logger.debug("bing cookies: %s", params['cookies'])
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-Web request."""
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
|
||||
set_bing_cookies(params, engine_language, engine_region)
|
||||
|
||||
page = params.get('pageno', 1)
|
||||
query_params = {
|
||||
'q': query,
|
||||
# if the 'pq' arg is missing, we sometimes get results from page 1 on page 4;
# don't ask why it is only sometimes / it's M$ and they have never been
# deterministic ;)
|
||||
'pq': query,
|
||||
}
|
||||
|
||||
# To get the correct page, the arg 'first' and the arg FORM are both needed; the
# value PERE is used on page 2, PERE1 on page 3, PERE2 on page 4 .. and so forth.
# The 'first' arg should never be sent on page 1.
|
||||
|
||||
if page > 1:
|
||||
query_params['first'] = _page_offset(page) # see also arg FORM
|
||||
if page == 2:
|
||||
query_params['FORM'] = 'PERE'
|
||||
elif page > 2:
|
||||
query_params['FORM'] = 'PERE%s' % (page - 2)
|
||||
|
||||
params['url'] = f'{base_url}?{urlencode(query_params)}'
|
||||
|
||||
if params.get('time_range'):
|
||||
unix_day = int(time.time() / 86400)
|
||||
time_ranges = {'day': '1', 'week': '2', 'month': '3', 'year': f'5_{unix_day-365}_{unix_day}'}
|
||||
params['url'] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"'
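# For example, time_range 'day' appends &filters=ex1:"ez1" to the URL, while
# 'year' uses an explicit interval in days since the Unix epoch, e.g.
# ex1:"ez5_19700_20065" (illustrative numbers).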
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
# pylint: disable=too-many-locals
|
||||
|
||||
results = []
|
||||
result_len = 0
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# parse results again if nothing is found yet
|
||||
|
||||
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
|
||||
|
||||
link = eval_xpath_getindex(result, './/h2/a', 0, None)
|
||||
if link is None:
|
||||
continue
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
|
||||
content = eval_xpath(result, './/p')
|
||||
for p in content:
|
||||
# Make sure that the element is free of:
|
||||
# <span class="algoSlug_icon" # data-priority="2">Web</span>
|
||||
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
|
||||
e.getparent().remove(e)
|
||||
content = extract_text(content)
|
||||
|
||||
# get the real URL
|
||||
if url.startswith('https://www.bing.com/ck/a?'):
|
||||
# get the first value of u parameter
|
||||
url_query = urlparse(url).query
|
||||
parsed_url_query = parse_qs(url_query)
|
||||
param_u = parsed_url_query["u"][0]
|
||||
# remove "a1" in front
|
||||
encoded_url = param_u[2:]
|
||||
# add padding
|
||||
encoded_url = encoded_url + '=' * (-len(encoded_url) % 4)
|
||||
# decode base64 encoded URL
|
||||
url = base64.urlsafe_b64decode(encoded_url).decode()
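# Illustrative value: u='a1aHR0cHM6Ly9leGFtcGxlLmNvbS8' becomes
# 'aHR0cHM6Ly9leGFtcGxlLmNvbS8=' after stripping 'a1' and re-padding, which
# decodes to 'https://example.com/'.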
|
||||
|
||||
# append result
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
|
||||
# get number_of_results
|
||||
if results:
|
||||
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
|
||||
if "-" in result_len_container:
|
||||
start_str, result_len_container = re.split(r'-\d+', result_len_container)
|
||||
start = int(start_str)
|
||||
else:
|
||||
start = 1
|
||||
|
||||
result_len_container = re.sub('[^0-9]', '', result_len_container)
|
||||
if len(result_len_container) > 0:
|
||||
result_len = int(result_len_container)
|
||||
|
||||
expected_start = _page_offset(resp.search_params.get("pageno", 1))
|
||||
|
||||
if expected_start != start:
|
||||
if expected_start > result_len:
|
||||
# Avoid reading more results than available.
|
||||
# For example, if there is 100 results from some search and we try to get results from 120 to 130,
|
||||
# Bing will send back the results from 0 to 10 and no error.
|
||||
# If we compare results count with the first parameter of the request we can avoid this "invalid"
|
||||
# results.
|
||||
return []
|
||||
|
||||
# Sometimes Bing will send back the first result page instead of the requested page as a rate limiting
|
||||
# measure.
|
||||
msg = f"Expected results to start at {expected_start}, but got results starting at {start}"
|
||||
raise SearxEngineAPIException(msg)
|
||||
|
||||
results.append({'number_of_results': result_len})
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages and regions from Bing-Web."""
|
||||
# pylint: disable=import-outside-toplevel
|
||||
|
||||
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
||||
from searx.utils import gen_useragent
|
||||
|
||||
headers = {
|
||||
"User-Agent": gen_useragent(),
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US;q=0.5,en;q=0.3",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"DNT": "1",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"Sec-GPC": "1",
|
||||
"Cache-Control": "max-age=0",
|
||||
}
|
||||
|
||||
resp = get("https://www.bing.com/account/general", headers=headers)
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from bing is not OK.")
|
||||
|
||||
dom = html.fromstring(resp.text) # type: ignore
|
||||
|
||||
# languages
|
||||
|
||||
engine_traits.languages['zh'] = 'zh-hans'
|
||||
|
||||
map_lang = {'prs': 'fa-AF', 'en': 'en-us'}
|
||||
bing_ui_lang_map = {
|
||||
# HINT: this list probably needs to be supplemented
|
||||
'en': 'us', # en --> en-us
|
||||
'da': 'dk', # da --> da-dk
|
||||
}
|
||||
|
||||
for href in eval_xpath(dom, '//div[@id="language-section-content"]//div[@class="languageItem"]/a/@href'):
|
||||
eng_lang = parse_qs(urlparse(href).query)['setlang'][0]
|
||||
babel_lang = map_lang.get(eng_lang, eng_lang)
|
||||
try:
|
||||
sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace('-', '_')))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: language (%s) is unknown by babel" % (babel_lang))
|
||||
continue
|
||||
# Language (e.g. 'en' or 'de') from https://www.bing.com/account/general
|
||||
# is converted by bing to 'en-us' or 'de-de'. But only if there is not
|
||||
# already a '-' delimiter in the language. For instance 'pt-PT' -->
|
||||
# 'pt-pt' and 'pt-br' --> 'pt-br'
|
||||
bing_ui_lang = eng_lang.lower()
|
||||
if '-' not in bing_ui_lang:
|
||||
bing_ui_lang = bing_ui_lang + '-' + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang)
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != bing_ui_lang:
|
||||
print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}")
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = bing_ui_lang
|
||||
|
||||
# regions (aka "market codes")
|
||||
|
||||
engine_traits.regions['zh-CN'] = 'zh-cn'
|
||||
|
||||
map_market_codes = {
|
||||
'zh-hk': 'en-hk', # not sure why, but at M$ this is the market code for Hong Kong
|
||||
}
|
||||
for href in eval_xpath(dom, '//div[@id="region-section-content"]//div[@class="regionItem"]/a/@href'):
|
||||
cc_tag = parse_qs(urlparse(href).query)['cc'][0]
|
||||
if cc_tag == 'clear':
|
||||
engine_traits.all_locale = cc_tag
|
||||
continue
|
||||
|
||||
# add market codes from official languages of the country ..
|
||||
for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True):
|
||||
if lang_tag not in engine_traits.languages.keys():
|
||||
# print("ignore lang: %s <-- %s" % (cc_tag, lang_tag))
|
||||
continue
|
||||
lang_tag = lang_tag.split('_')[0] # zh_Hant --> zh
|
||||
market_code = f"{lang_tag}-{cc_tag}" # zh-tw
|
||||
|
||||
market_code = map_market_codes.get(market_code, market_code)
|
||||
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, cc_tag.upper())))
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != market_code:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = market_code
|
||||
109
searx/engines/bing_images.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
|
||||
"""
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import json
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import set_bing_cookies
|
||||
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.bing.com/images',
|
||||
"wikidata_id": 'Q182496',
|
||||
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images', 'web']
|
||||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
|
||||
base_url = 'https://www.bing.com/images/async'
|
||||
"""Bing (Images) search URL"""
|
||||
|
||||
time_map = {
|
||||
'day': 60 * 24,
|
||||
'week': 60 * 24 * 7,
|
||||
'month': 60 * 24 * 31,
|
||||
'year': 60 * 24 * 365,
|
||||
}
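# The values above are minutes; they end up in the 'filterui:age-lt<minutes>'
# query filter built in request() below.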
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-Image request."""
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
|
||||
set_bing_cookies(params, engine_language, engine_region)
|
||||
|
||||
# build URL query
|
||||
# - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35
|
||||
query_params = {
|
||||
'q': query,
|
||||
'async': '1',
|
||||
# to simplify the page count let's use the default of 35 images per page
|
||||
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
|
||||
'count': 35,
|
||||
}
|
||||
|
||||
# time range
|
||||
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
|
||||
|
||||
if params['time_range']:
|
||||
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from Bing-Images"""
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
|
||||
|
||||
metadata = result.xpath('.//a[@class="iusc"]/@m')
|
||||
if not metadata:
|
||||
continue
|
||||
|
||||
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
|
||||
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
|
||||
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip().split(" · ")
|
||||
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': metadata['purl'],
|
||||
'thumbnail_src': metadata['turl'],
|
||||
'img_src': metadata['murl'],
|
||||
'content': metadata.get('desc'),
|
||||
'title': title,
|
||||
'source': source,
|
||||
'resolution': img_format[0],
|
||||
'img_format': img_format[1] if len(img_format) >= 2 else None,
|
||||
}
|
||||
)
|
||||
return results
|
||||
160
searx/engines/bing_news.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Bing-News: description see :py:obj:`searx.engines.bing`.
|
||||
|
||||
.. hint::
|
||||
|
||||
Bing News is *different* in some ways!
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import set_bing_cookies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.bing.com/news',
|
||||
"wikidata_id": 'Q2878637',
|
||||
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'RSS',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['news']
|
||||
paging = True
|
||||
"""If go through the pages and there are actually no new results for another
|
||||
page, then bing returns the results from the last page again."""
|
||||
|
||||
time_range_support = True
|
||||
time_map = {
|
||||
'day': 'interval="4"',
|
||||
'week': 'interval="7"',
|
||||
'month': 'interval="9"',
|
||||
}
|
||||
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
|
||||
difference of *last day* and *last week* in the result list is just marginally.
|
||||
Bing does not have news range ``year`` / we use ``month`` instead."""
|
||||
|
||||
base_url = 'https://www.bing.com/news/infinitescrollajax'
|
||||
"""Bing (News) search URL"""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-News request."""
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
|
||||
set_bing_cookies(params, engine_language, engine_region)
|
||||
|
||||
# build URL query
|
||||
#
|
||||
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
|
||||
|
||||
page = int(params.get('pageno', 1)) - 1
|
||||
query_params = {
|
||||
'q': query,
|
||||
'InfiniteScroll': 1,
|
||||
# to simplify the page count let's use the default of 10 results per page
|
||||
'first': page * 10 + 1,
|
||||
'SFX': page,
|
||||
'form': 'PTFTNR',
|
||||
'setlang': engine_region.split('-')[0],
|
||||
'cc': engine_region.split('-')[-1],
|
||||
}
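# For example, page 1 of a query for "london" with region 'de-de' is fetched from
#   https://www.bing.com/news/infinitescrollajax?q=london&InfiniteScroll=1&first=1&SFX=0&form=PTFTNR&setlang=de&cc=de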
|
||||
|
||||
if params['time_range']:
|
||||
query_params['qft'] = time_map.get(params['time_range'], 'interval="9"')
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from Bing-Video"""
|
||||
results = []
|
||||
|
||||
if not resp.ok or not resp.text:
|
||||
return results
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):
|
||||
|
||||
link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
|
||||
if link is None:
|
||||
continue
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))
|
||||
|
||||
metadata = []
|
||||
source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
|
||||
if source is not None:
|
||||
for item in (
|
||||
eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
|
||||
# eval_xpath_getindex(source, './/a', 0, None),
|
||||
# eval_xpath_getindex(source, './div/span', 3, None),
|
||||
link.attrib.get('data-author'),
|
||||
):
|
||||
if item is not None:
|
||||
t = extract_text(item)
|
||||
if t and t.strip():
|
||||
metadata.append(t.strip())
|
||||
metadata = ' | '.join(metadata)
|
||||
|
||||
thumbnail = None
|
||||
imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
|
||||
if imagelink is not None:
|
||||
thumbnail = imagelink.attrib.get('src')
|
||||
if not thumbnail.startswith("https://www.bing.com"):
|
||||
thumbnail = 'https://www.bing.com/' + thumbnail
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'thumbnail': thumbnail,
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages and regions from Bing-News."""
|
||||
# pylint: disable=import-outside-toplevel
|
||||
|
||||
from searx.engines.bing import fetch_traits as _f
|
||||
|
||||
_f(engine_traits)
|
||||
|
||||
# fix market codes not known by bing news:
|
||||
|
||||
# In Bing the market code 'zh-cn' exists, but there is no 'news' category in
# Bing for this market. As an alternative we use the market code from Hong
# Kong. Even if this is not correct, it is better than having no hits at
|
||||
# all, or sending false queries to bing that could raise the suspicion of a
|
||||
# bot.
|
||||
|
||||
# HINT: 'en-hk' is the region code; it does not indicate the language en!!
|
||||
engine_traits.regions['zh-CN'] = 'en-hk'
|
||||
98
searx/engines/bing_videos.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=invalid-name
|
||||
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import json
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import set_bing_cookies
|
||||
from searx.engines.bing import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.bing_images import time_map
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
about = {
|
||||
"website": 'https://www.bing.com/videos',
|
||||
"wikidata_id": 'Q4914152',
|
||||
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos', 'web']
|
||||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
|
||||
base_url = 'https://www.bing.com/videos/asyncv2'
|
||||
"""Bing (Videos) async search URL."""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-Video request."""
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore
|
||||
set_bing_cookies(params, engine_language, engine_region)
|
||||
|
||||
# build URL query
|
||||
#
|
||||
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
|
||||
|
||||
query_params = {
|
||||
'q': query,
|
||||
'async': 'content',
|
||||
# to simplify the page count let's use the default of 35 videos per page
|
||||
'first': (int(params.get('pageno', 1)) - 1) * 35 + 1,
|
||||
'count': 35,
|
||||
}
|
||||
|
||||
# time range
|
||||
#
|
||||
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
|
||||
|
||||
if params['time_range']:
|
||||
query_params['form'] = 'VRFLTR'
|
||||
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from Bing-Video"""
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
|
||||
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
|
||||
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
|
||||
content = '{0} - {1}'.format(metadata['du'], info)
|
||||
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': metadata['murl'],
|
||||
'thumbnail': thumbnail,
|
||||
'title': metadata.get('vt', ''),
|
||||
'content': content,
|
||||
'template': 'videos.html',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
56
searx/engines/bitchute.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""bitchute (Videos)"""
|
||||
|
||||
from json import dumps
|
||||
from datetime import datetime
|
||||
from searx.utils import html_to_text
|
||||
|
||||
about = {
|
||||
"website": 'https://bitchute.com',
|
||||
"wikidata_id": "Q45287179",
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
base_url = "https://api.bitchute.com/api/beta/search/videos"
|
||||
categories = ['videos']
|
||||
paging = True
|
||||
results_per_page = 20
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
start_index = (params["pageno"] - 1) * results_per_page
|
||||
data = {"offset": start_index, "limit": results_per_page, "query": query, "sensitivity_id": "normal", "sort": "new"}
|
||||
params["url"] = base_url
|
||||
params["method"] = 'POST'
|
||||
params['headers']['content-type'] = "application/json"
|
||||
params['data'] = dumps(data)
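# For page 1 of a query for "cats" the POSTed JSON body looks like:
#   {"offset": 0, "limit": 20, "query": "cats", "sensitivity_id": "normal", "sort": "new"}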
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
search_res = resp.json()
|
||||
results = []
|
||||
|
||||
for item in search_res.get('videos', []):
|
||||
|
||||
results.append(
|
||||
{
|
||||
"title": item['video_name'],
|
||||
"url": 'https://www.bitchute.com/video/' + item['video_id'],
|
||||
"content": html_to_text(item['description']),
|
||||
"author": item['channel']['channel_name'],
|
||||
"publishedDate": datetime.strptime(item["date_published"], "%Y-%m-%dT%H:%M:%S.%fZ"),
|
||||
"length": item['duration'],
|
||||
"views": item['view_count'],
|
||||
"thumbnail": item['thumbnail_url'],
|
||||
"iframe_src": 'https://www.bitchute.com/embed/' + item['video_id'],
|
||||
"template": "videos.html",
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
67
searx/engines/bpb.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""BPB refers to ``Bundeszentrale für poltische Bildung``, which is a German
|
||||
governmental institution aiming to reduce misinformation by providing resources
|
||||
about politics and history.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode
|
||||
|
||||
about = {
|
||||
'website': "https://www.bpb.de",
|
||||
'official_api_documentation': None,
|
||||
'use_official_api': False,
|
||||
'require_api_key': False,
|
||||
'results': 'JSON',
|
||||
'language': 'de',
|
||||
}
|
||||
|
||||
paging = True
|
||||
categories = ['general']
|
||||
|
||||
|
||||
base_url = "https://www.bpb.de"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
'query[term]': query,
|
||||
'page': params['pageno'] - 1,
|
||||
'sort[direction]': 'descending',
|
||||
'payload[nid]': 65350,
|
||||
}
|
||||
params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
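# For example, page 1 of a query for "wahlen" is fetched from (brackets are URL-encoded):
#   https://www.bpb.de/bpbapi/filter/search?query%5Bterm%5D=wahlen&page=0&sort%5Bdirection%5D=descending&payload%5Bnid%5D=65350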
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
json_resp = resp.json()
|
||||
|
||||
for result in json_resp['teaser']:
|
||||
thumbnail = None
|
||||
if result['teaser']['image']:
|
||||
thumbnail = base_url + result['teaser']['image']['sources'][-1]['url']
|
||||
|
||||
metadata = result['extension']['overline']
|
||||
authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
|
||||
if authors:
|
||||
metadata += f" | {authors}"
|
||||
|
||||
publishedDate = None
|
||||
if result['extension'].get('publishingDate'):
|
||||
publishedDate = datetime.fromtimestamp(result['extension']['publishingDate'])
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': base_url + result['teaser']['link']['url'],
|
||||
'title': result['teaser']['title'],
|
||||
'content': result['teaser']['text'],
|
||||
'thumbnail': thumbnail,
|
||||
'publishedDate': publishedDate,
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
505
searx/engines/brave.py
Normal file
@@ -0,0 +1,505 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
|
||||
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
|
||||
<time_range_support>` is limited (see remarks).
|
||||
|
||||
Configured ``brave`` engines:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: brave
|
||||
engine: brave
|
||||
...
|
||||
brave_category: search
|
||||
time_range_support: true
|
||||
paging: true
|
||||
|
||||
- name: brave.images
|
||||
engine: brave
|
||||
...
|
||||
brave_category: images
|
||||
|
||||
- name: brave.videos
|
||||
engine: brave
|
||||
...
|
||||
brave_category: videos
|
||||
|
||||
- name: brave.news
|
||||
engine: brave
|
||||
...
|
||||
brave_category: news
|
||||
|
||||
- name: brave.goggles
|
||||
time_range_support: true
|
||||
paging: true
|
||||
...
|
||||
brave_category: goggles
|
||||
|
||||
|
||||
.. _brave regions:
|
||||
|
||||
Brave regions
|
||||
=============
|
||||
|
||||
Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
|
||||
locales. To get a mapping, all *official de-facto* languages of the Brave
|
||||
region are mapped to regions in SearXNG (see :py:obj:`babel
|
||||
<babel.languages.get_official_languages>`):
|
||||
|
||||
.. code:: python
|
||||
|
||||
"regions": {
|
||||
..
|
||||
"en-CA": "ca",
|
||||
"fr-CA": "ca",
|
||||
..
|
||||
}
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
The language (aka region) support of Brave's index is limited to very basic
|
||||
languages. The search results for languages like Chinese or Arabic are of
|
||||
low quality.
|
||||
|
||||
|
||||
.. _brave googles:
|
||||
|
||||
Brave Goggles
|
||||
=============
|
||||
|
||||
.. _list of Goggles: https://search.brave.com/goggles/discover
|
||||
.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
|
||||
.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
|
||||
|
||||
Goggles allow you to choose, alter, or extend the ranking of Brave Search
|
||||
results (`Goggles Whitepaper`_). Goggles are openly developed by the community
|
||||
of Brave Search users.
|
||||
|
||||
Select from the `list of Goggles`_ people have published, or create your own
|
||||
(`Goggles Quickstart`_).
|
||||
|
||||
|
||||
.. _brave languages:
|
||||
|
||||
Brave languages
|
||||
===============
|
||||
|
||||
Brave's language support is limited to the UI (menus, area local notations,
|
||||
etc). Brave's index only seems to support a locale, but it does not seem to
|
||||
support any languages in its index. The choice of available languages is very
|
||||
small (and its not clear to me where the difference in UI is when switching
|
||||
from en-us to en-ca or en-gb).
|
||||
|
||||
In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
|
||||
UI languages are stored in a custom field named ``ui_lang``:
|
||||
|
||||
.. code:: python
|
||||
|
||||
"custom": {
|
||||
"ui_lang": {
|
||||
"ca": "ca",
|
||||
"de-DE": "de-de",
|
||||
"en-CA": "en-ca",
|
||||
"en-GB": "en-gb",
|
||||
"en-US": "en-us",
|
||||
"es": "es",
|
||||
"fr-CA": "fr-ca",
|
||||
"fr-FR": "fr-fr",
|
||||
"ja-JP": "ja-jp",
|
||||
"pt-BR": "pt-br",
|
||||
"sq-AL": "sq-al"
|
||||
}
|
||||
},
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from typing import Any, TYPE_CHECKING
|
||||
|
||||
from urllib.parse import (
|
||||
urlencode,
|
||||
urlparse,
|
||||
)
|
||||
|
||||
from dateutil import parser
|
||||
from lxml import html
|
||||
|
||||
from searx import locales
|
||||
from searx.utils import (
|
||||
extr,
|
||||
extract_text,
|
||||
eval_xpath,
|
||||
eval_xpath_list,
|
||||
eval_xpath_getindex,
|
||||
js_variable_to_python,
|
||||
get_embeded_stream_url,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
about = {
|
||||
"website": 'https://search.brave.com/',
|
||||
"wikidata_id": 'Q22906900',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
base_url = "https://search.brave.com/"
|
||||
categories = []
|
||||
brave_category = 'search'
|
||||
Goggles = Any
|
||||
"""Brave supports common web-search, videos, images, news, and goggles search.
|
||||
|
||||
- ``search``: Common WEB search
|
||||
- ``videos``: search for videos
|
||||
- ``images``: search for images
|
||||
- ``news``: search for news
|
||||
- ``goggles``: Common WEB search with custom rules
|
||||
"""
|
||||
|
||||
brave_spellcheck = False
|
||||
"""Brave supports some kind of spell checking. When activated, Brave tries to
|
||||
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
|
||||
the UI of Brave the user gets warned about this, since we can not warn the user
|
||||
in SearXNG, the spellchecking is disabled by default.
|
||||
"""
|
||||
|
||||
send_accept_language_header = True
|
||||
paging = False
|
||||
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
|
||||
category All) and in the goggles category."""
|
||||
max_page = 10
|
||||
"""Tested 9 pages maximum (``&offset=8``), to be save max is set to 10. Trying
|
||||
to do more won't return any result and you will most likely be flagged as a bot.
|
||||
"""
|
||||
|
||||
safesearch = True
|
||||
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
|
||||
|
||||
time_range_support = False
|
||||
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
|
||||
category All) and in the goggles category."""
|
||||
|
||||
time_range_map = {
|
||||
'day': 'pd',
|
||||
'week': 'pw',
|
||||
'month': 'pm',
|
||||
'year': 'py',
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
|
||||
params['headers']['Accept-Encoding'] = 'gzip, deflate'
|
||||
|
||||
args = {
|
||||
'q': query,
|
||||
'source': 'web',
|
||||
}
|
||||
if brave_spellcheck:
|
||||
args['spellcheck'] = '1'
|
||||
|
||||
if brave_category in ('search', 'goggles'):
|
||||
if params.get('pageno', 1) - 1:
|
||||
args['offset'] = params.get('pageno', 1) - 1
|
||||
if time_range_map.get(params['time_range']):
|
||||
args['tf'] = time_range_map.get(params['time_range'])
|
||||
|
||||
if brave_category == 'goggles':
|
||||
args['goggles_id'] = Goggles
|
||||
|
||||
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
|
||||
|
||||
# set properties in the cookies
|
||||
|
||||
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
|
||||
# the useLocation is IP based, we use cookie 'country' for the region
|
||||
params['cookies']['useLocation'] = '0'
|
||||
params['cookies']['summarizer'] = '0'
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], 'all')
|
||||
params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
|
||||
|
||||
ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
|
||||
params['cookies']['ui_lang'] = ui_lang
|
||||
|
||||
logger.debug("cookies %s", params['cookies'])
|
||||
|
||||
params['headers']['Sec-Fetch-Dest'] = "document"
|
||||
params['headers']['Sec-Fetch-Mode'] = "navigate"
|
||||
params['headers']['Sec-Fetch-Site'] = "same-origin"
|
||||
params['headers']['Sec-Fetch-User'] = "?1"
|
||||
|
||||
|
||||
def _extract_published_date(published_date_raw):
|
||||
if published_date_raw is None:
|
||||
return None
|
||||
|
||||
try:
|
||||
return parser.parse(published_date_raw)
|
||||
except parser.ParserError:
|
||||
return None
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
|
||||
if brave_category in ('search', 'goggles'):
|
||||
return _parse_search(resp)
|
||||
|
||||
if brave_category in ('news'):
|
||||
return _parse_news(resp)
|
||||
|
||||
# Example script source containing the data:
|
||||
#
|
||||
# kit.start(app, element, {
|
||||
# node_ids: [0, 19],
|
||||
# data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
|
||||
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
|
||||
json_data = js_variable_to_python(js_object)
|
||||
|
||||
# json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
|
||||
json_resp = json_data[1]['data']['body']['response']
|
||||
|
||||
if brave_category == 'images':
|
||||
return _parse_images(json_resp)
|
||||
if brave_category == 'videos':
|
||||
return _parse_videos(json_resp)
|
||||
|
||||
raise ValueError(f"Unsupported brave category: {brave_category}")
|
||||
|
||||
|
||||
def _parse_search(resp) -> EngineResults:
|
||||
result_list = EngineResults()
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# I doubt that Brave is still providing the "answer" class / I haven't seen
|
||||
# answers in brave for a long time.
|
||||
answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
|
||||
if answer_tag:
|
||||
url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
|
||||
answer = extract_text(answer_tag)
|
||||
if answer is not None:
|
||||
result_list.add(result_list.types.Answer(answer=answer, url=url))
|
||||
|
||||
# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
|
||||
xpath_results = '//div[contains(@class, "snippet ")]'
|
||||
|
||||
for result in eval_xpath_list(dom, xpath_results):
|
||||
|
||||
url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
|
||||
title_tag = eval_xpath_getindex(
|
||||
result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
|
||||
)
|
||||
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
|
||||
continue
|
||||
|
||||
content: str = extract_text(
|
||||
eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
|
||||
) # type: ignore
|
||||
pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
|
||||
pub_date = _extract_published_date(pub_date_raw)
|
||||
if pub_date and content.startswith(pub_date_raw):
|
||||
content = content.lstrip(pub_date_raw).strip("- \n\t")
|
||||
|
||||
thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
|
||||
|
||||
item = {
|
||||
'url': url,
|
||||
'title': extract_text(title_tag),
|
||||
'content': content,
|
||||
'publishedDate': pub_date,
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
|
||||
video_tag = eval_xpath_getindex(
|
||||
result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
|
||||
)
|
||||
if video_tag is not None:
|
||||
|
||||
# In my tests a video tag in the WEB search was most often not a
|
||||
# video, except the ones from youtube ..
|
||||
|
||||
iframe_src = get_embeded_stream_url(url)
|
||||
if iframe_src:
|
||||
item['iframe_src'] = iframe_src
|
||||
item['template'] = 'videos.html'
|
||||
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
||||
pub_date_raw = extract_text(
|
||||
eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
|
||||
)
|
||||
item['publishedDate'] = _extract_published_date(pub_date_raw)
|
||||
else:
|
||||
item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
|
||||
|
||||
result_list.append(item)
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
def _parse_news(resp) -> EngineResults:
|
||||
|
||||
result_list = EngineResults()
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
|
||||
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
|
||||
url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
|
||||
if url is None:
|
||||
continue
|
||||
|
||||
title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
|
||||
content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
|
||||
thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
|
||||
|
||||
item = {
|
||||
"url": url,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"thumbnail": thumbnail,
|
||||
}
|
||||
|
||||
result_list.append(item)
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
def _parse_images(json_resp) -> EngineResults:
|
||||
result_list = EngineResults()
|
||||
|
||||
for result in json_resp["results"]:
|
||||
item = {
|
||||
'url': result['url'],
|
||||
'title': result['title'],
|
||||
'content': result['description'],
|
||||
'template': 'images.html',
|
||||
'resolution': result['properties']['format'],
|
||||
'source': result['source'],
|
||||
'img_src': result['properties']['url'],
|
||||
'thumbnail_src': result['thumbnail']['src'],
|
||||
}
|
||||
result_list.append(item)
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
def _parse_videos(json_resp) -> EngineResults:
|
||||
result_list = EngineResults()
|
||||
|
||||
for result in json_resp["results"]:
|
||||
|
||||
url = result['url']
|
||||
item = {
|
||||
'url': url,
|
||||
'title': result['title'],
|
||||
'content': result['description'],
|
||||
'template': 'videos.html',
|
||||
'length': result['video']['duration'],
|
||||
'duration': result['video']['duration'],
|
||||
'publishedDate': _extract_published_date(result['age']),
|
||||
}
|
||||
|
||||
if result['thumbnail'] is not None:
|
||||
item['thumbnail'] = result['thumbnail']['src']
|
||||
|
||||
iframe_src = get_embeded_stream_url(url)
|
||||
if iframe_src:
|
||||
item['iframe_src'] = iframe_src
|
||||
|
||||
result_list.append(item)
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
|
||||
regions>` from Brave."""
|
||||
|
||||
# pylint: disable=import-outside-toplevel, too-many-branches
|
||||
|
||||
import babel.languages
|
||||
from searx.locales import region_tag, language_tag
|
||||
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
||||
|
||||
engine_traits.custom["ui_lang"] = {}
|
||||
|
||||
headers = {
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
}
|
||||
lang_map = {'no': 'nb'} # norway
|
||||
|
||||
# languages (UI)
|
||||
|
||||
resp = get('https://search.brave.com/settings', headers=headers)
|
||||
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from Brave is not OK.")
|
||||
dom = html.fromstring(resp.text) # type: ignore
|
||||
|
||||
for option in dom.xpath('//section//option[@value="en-us"]/../option'):
|
||||
|
||||
ui_lang = option.get('value')
|
||||
try:
|
||||
l = babel.Locale.parse(ui_lang, sep='-')
|
||||
if l.territory:
|
||||
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
|
||||
else:
|
||||
sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep='-'))
|
||||
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
|
||||
continue
|
||||
|
||||
conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != ui_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
|
||||
continue
|
||||
engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
|
||||
|
||||
# search regions of brave
|
||||
|
||||
resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)
|
||||
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from Brave is not OK.")
|
||||
|
||||
country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore
|
||||
country_js = country_js[: country_js.index("},k={default")]
|
||||
country_tags = js_variable_to_python(country_js)
|
||||
|
||||
for k, v in country_tags.items():
|
||||
if k == 'all':
|
||||
engine_traits.all_locale = 'all'
|
||||
continue
|
||||
country_tag = v['value']
|
||||
|
||||
# add official languages of the country ..
|
||||
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
|
||||
lang_tag = lang_map.get(lang_tag, lang_tag)
|
||||
sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
|
||||
# print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))
|
||||
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != country_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = country_tag
|
||||
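A standalone sketch of the region mapping idea that ``fetch_traits()`` above implements: every official (de-facto) language of a Brave country tag is turned into a SearXNG-style region tag via babel. It assumes only the ``babel`` package; the helper name ``official_regions`` is illustrative.

.. code:: python

    # Standalone sketch of the Brave region -> SearXNG region mapping;
    # requires the `babel` package.
    import babel
    import babel.languages

    def official_regions(country_tag):
        # map each official (de-facto) language of a Brave country tag,
        # e.g. 'ca', to a SearXNG-style region tag like 'en-CA'
        regions = {}
        for lang in babel.languages.get_official_languages(country_tag, de_facto=True):
            locale = babel.Locale.parse(f"{lang}_{country_tag.upper()}")
            regions[f"{locale.language}-{locale.territory}"] = country_tag
        return regions

    print(official_regions("ca"))  # e.g. {'en-CA': 'ca', 'fr-CA': 'ca'}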
118
searx/engines/bt4g.py
Normal file
@@ -0,0 +1,118 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""BT4G_ (bt4g.com) is not a tracker and doesn't store any content; it only
collects torrent metadata (such as file names and file sizes) and a magnet link
(torrent identifier).

This engine does not parse the HTML page because there is an API in XML (RSS).
The RSS feed provides less data than the HTML page, such as the number of
seeders/leechers and the list of files in the torrent.  It's a tradeoff for a
"stable" engine, as the XML from the RSS content will change far less than the
HTML page.

.. _BT4G: https://bt4g.com/

Configuration
=============

The engine has the following additional settings:

- :py:obj:`bt4g_order_by`
- :py:obj:`bt4g_category`

With these options a SearXNG maintainer is able to configure **additional**
engines for specific torrent searches.  For example, an engine to search only
for movies and sort the result list by the count of seeders.

.. code:: yaml

  - name: bt4g.movie
    engine: bt4g
    shortcut: bt4gv
    categories: video
    bt4g_order_by: seeders
    bt4g_category: 'movie'

Implementations
===============

"""

from datetime import datetime
from urllib.parse import quote

from lxml import etree

# about
about = {
    "website": 'https://bt4gprx.com',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'XML',
}

# engine dependent config
categories = ['files']
paging = True
time_range_support = True

# search-url
url = 'https://bt4gprx.com'
search_url = url + '/search?q={search_term}&orderby={order_by}&category={category}&p={pageno}&page=rss'
bt4g_order_by = 'relevance'
"""Result list can be ordered by ``relevance`` (default), ``size``, ``seeders``
or ``time``.

.. hint::

   When *time_range* is activated, the results are always ordered by ``time``.
"""

bt4g_category = 'all'
"""BT4G offers categories: ``all`` (default), ``audio``, ``movie``, ``doc``,
``app`` and ``other``.
"""


def request(query, params):

    order_by = bt4g_order_by
    if params['time_range']:
        order_by = 'time'

    params['url'] = search_url.format(
        search_term=quote(query),
        order_by=order_by,
        category=bt4g_category,
        pageno=params['pageno'],
    )
    return params


def response(resp):
    results = []

    search_results = etree.XML(resp.content)

    # return empty array if nothing is found
    if len(search_results) == 0:
        return []

    for entry in search_results.xpath('./channel/item'):
        title = entry.find("title").text
        link = entry.find("guid").text
        fullDescription = entry.find("description").text.split('<br>')
        magnetlink = entry.find("link").text
        pubDate = entry.find("pubDate").text
        results.append(
            {
                'url': link,
                'title': title,
                'magnetlink': magnetlink,
                'seed': 'N/A',
                'leech': 'N/A',
                'filesize': fullDescription[1],
                'publishedDate': datetime.strptime(pubDate, '%a,%d %b %Y %H:%M:%S %z'),
                'template': 'torrent.html',
            }
        )

    return results
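A standalone sketch of the RSS parsing that ``response()`` above performs with ``lxml.etree``; the sample feed item is made up, only its shape mirrors what the code expects (the ``description`` carries an escaped ``&lt;br&gt;`` separating category and file size).

.. code:: python

    # Standalone sketch of the RSS item parsing above; the sample feed
    # content is fabricated for illustration.
    from datetime import datetime
    from lxml import etree

    SAMPLE = b"""<rss><channel><item>
      <title>Some torrent</title>
      <guid>https://bt4gprx.com/magnet/xyz</guid>
      <link>magnet:?xt=urn:btih:xyz</link>
      <description>movie&lt;br&gt;1.4GB</description>
      <pubDate>Mon,01 Jan 2024 12:00:00 +0000</pubDate>
    </item></channel></rss>"""

    item = etree.XML(SAMPLE).xpath('./channel/item')[0]
    size = item.find("description").text.split('<br>')[1]
    date = datetime.strptime(item.find("pubDate").text, '%a,%d %b %Y %H:%M:%S %z')
    print(item.find("title").text, size, date.isoformat())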
85
searx/engines/btdigg.py
Normal file
@@ -0,0 +1,85 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
BTDigg (Videos, Music, Files)
"""

from urllib.parse import quote, urljoin

from lxml import html
from searx.utils import extract_text

# about
about = {
    "website": 'https://btdig.com',
    "wikidata_id": 'Q4836698',
    "official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['files']
paging = True

# search-url
url = 'https://btdig.com'
search_url = url + '/search?q={search_term}&p={pageno}'


# do search-request
def request(query, params):
    params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@class="one_result"]')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//div[@class="torrent_name"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
        content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
        content = content.strip().replace('\n', ' | ')
        content = ' '.join(content.split())

        filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0]
        files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]

        # convert files to int if possible
        try:
            files = int(files)
        except:  # pylint: disable=bare-except
            files = None

        magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']

        # append result
        results.append(
            {
                'url': href,
                'title': title,
                'content': content,
                'filesize': filesize,
                'files': files,
                'magnetlink': magnetlink,
                'template': 'torrent.html',
            }
        )

    # return results
    return results
59
searx/engines/ccc_media.py
Normal file
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""media.ccc.de"""

import datetime
from urllib.parse import urlencode

from dateutil import parser

about = {
    'website': 'https://media.ccc.de',
    'official_api_documentation': 'https://github.com/voc/voctoweb',
    'use_official_api': True,
    'require_api_key': False,
    'results': 'JSON',
}
categories = ['videos']
paging = True

api_url = "https://api.media.ccc.de"


def request(query, params):
    args = {'q': query, 'page': params['pageno']}
    params['url'] = f"{api_url}/public/events/search?{urlencode(args)}"

    return params


def response(resp):
    results = []

    for item in resp.json()['events']:
        publishedDate = None
        if item.get('date'):
            publishedDate = parser.parse(item['date'])

        iframe_src = None
        for rec in item['recordings']:
            if rec['mime_type'].startswith('video'):
                if not iframe_src:
                    iframe_src = rec['recording_url']
                elif rec['mime_type'] == 'video/mp4':
                    # prefer mp4 (minimal data rates)
                    iframe_src = rec['recording_url']

        results.append(
            {
                'template': 'videos.html',
                'url': item['frontend_link'],
                'title': item['title'],
                'content': item['description'],
                'thumbnail': item['thumb_url'],
                'publishedDate': publishedDate,
                'length': datetime.timedelta(seconds=item['length']),
                'iframe_src': iframe_src,
            }
        )

    return results
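A standalone sketch of the "prefer mp4" recording selection used in ``response()`` above: the first video recording is taken as a fallback, but an mp4 recording replaces it when one appears later in the list. The sample recordings are made up.

.. code:: python

    # Standalone sketch of the recording selection logic above; sample data
    # is fabricated.
    def pick_recording(recordings):
        iframe_src = None
        for rec in recordings:
            if rec['mime_type'].startswith('video'):
                if not iframe_src:
                    iframe_src = rec['recording_url']
                elif rec['mime_type'] == 'video/mp4':
                    # prefer mp4 (minimal data rates)
                    iframe_src = rec['recording_url']
        return iframe_src

    print(pick_recording([
        {'mime_type': 'video/webm', 'recording_url': 'https://example.org/a.webm'},
        {'mime_type': 'video/mp4', 'recording_url': 'https://example.org/a.mp4'},
    ]))  # -> https://example.org/a.mp4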
68
searx/engines/chefkoch.py
Normal file
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Chefkoch is a German database of recipes.
"""

from datetime import datetime
from urllib.parse import urlencode

about = {
    'website': "https://www.chefkoch.de",
    'official_api_documentation': None,
    'use_official_api': False,
    'require_api_key': False,
    'results': 'JSON',
    'language': 'de',
}

paging = True
categories = []

number_of_results = 20
skip_premium = True


base_url = "https://api.chefkoch.de"
thumbnail_format = "crop-240x300"


def request(query, params):
    args = {'query': query, 'limit': number_of_results, 'offset': (params['pageno'] - 1) * number_of_results}
    params['url'] = f"{base_url}/v2/search-gateway/recipes?{urlencode(args)}"
    return params


def response(resp):
    results = []

    json = resp.json()

    for result in json['results']:
        recipe = result['recipe']

        if skip_premium and (recipe['isPremium'] or recipe['isPlus']):
            continue

        publishedDate = None
        if recipe['submissionDate']:
            publishedDate = datetime.strptime(result['recipe']['submissionDate'][:19], "%Y-%m-%dT%H:%M:%S")

        content = [
            f"Schwierigkeitsstufe (1-3): {recipe['difficulty']}",
            f"Zubereitungszeit: {recipe['preparationTime']}min",
            f"Anzahl der Zutaten: {recipe['ingredientCount']}",
        ]

        if recipe['subtitle']:
            content.insert(0, recipe['subtitle'])

        results.append(
            {
                'url': recipe['siteUrl'],
                'title': recipe['title'],
                'content': " | ".join(content),
                'thumbnail': recipe['previewImageUrlTemplate'].replace("<format>", thumbnail_format),
                'publishedDate': publishedDate,
            }
        )

    return results
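A standalone sketch of the offset-based paging and the ``<format>`` thumbnail template substitution used by the chefkoch engine above; the sample query and image URL are illustrative only.

.. code:: python

    # Standalone sketch of the paging offset and thumbnail template handling
    # above; values are illustrative.
    from urllib.parse import urlencode

    number_of_results = 20
    thumbnail_format = "crop-240x300"

    def search_gateway_url(query, pageno):
        args = {'query': query, 'limit': number_of_results, 'offset': (pageno - 1) * number_of_results}
        return f"https://api.chefkoch.de/v2/search-gateway/recipes?{urlencode(args)}"

    print(search_gateway_url("kuchen", pageno=3))  # offset=40

    template = "https://img.chefkoch-cdn.de/rezepte/123/bilder/456/<format>/bild.jpg"
    print(template.replace("<format>", thumbnail_format))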
223
searx/engines/chinaso.py
Normal file
@@ -0,0 +1,223 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ChinaSo_, a search engine for the Chinese language area.

.. attention::

   The ChinaSo engine does not return real URLs; the links from this search
   engine violate the privacy of the users!!

   We try to find a solution for this problem, please follow `issue #4694`_.
   As long as the problem has not been resolved, these engines are
   not active in a standard setup (``inactive: true``).

.. _ChinaSo: https://www.chinaso.com/
.. _issue #4694: https://github.com/searxng/searxng/issues/4694

Configuration
=============

The engine has the following additional settings:

- :py:obj:`chinaso_category` (:py:obj:`ChinasoCategoryType`)
- :py:obj:`chinaso_news_source` (:py:obj:`ChinasoNewsSourceType`)

In the example below, all three ChinaSO engines are using the :ref:`network
<engine network>` from the ``chinaso news`` engine.

.. code:: yaml

  - name: chinaso news
    engine: chinaso
    shortcut: chinaso
    categories: [news]
    chinaso_category: news
    chinaso_news_source: all

  - name: chinaso images
    engine: chinaso
    network: chinaso news
    shortcut: chinasoi
    categories: [images]
    chinaso_category: images

  - name: chinaso videos
    engine: chinaso
    network: chinaso news
    shortcut: chinasov
    categories: [videos]
    chinaso_category: videos


Implementations
===============

"""

import typing

from urllib.parse import urlencode
from datetime import datetime

from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text

about = {
    "website": "https://www.chinaso.com/",
    "wikidata_id": "Q10846064",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
    "language": "zh",
}

paging = True
time_range_support = True
results_per_page = 10
categories = []

ChinasoCategoryType = typing.Literal['news', 'videos', 'images']
"""ChinaSo supports news, videos, images search.

- ``news``: search for news
- ``videos``: search for videos
- ``images``: search for images

In the category ``news`` you can additionally filter by option
:py:obj:`chinaso_news_source`.
"""
chinaso_category = 'news'
"""Configure ChinaSo category (:py:obj:`ChinasoCategoryType`)."""

ChinasoNewsSourceType = typing.Literal['CENTRAL', 'LOCAL', 'BUSINESS', 'EPAPER', 'all']
"""Filtering ChinaSo-News results by source:

- ``CENTRAL``: central publication
- ``LOCAL``: local publication
- ``BUSINESS``: business publication
- ``EPAPER``: E-Paper
- ``all``: all sources
"""
chinaso_news_source: ChinasoNewsSourceType = 'all'
"""Configure ChinaSo-News type (:py:obj:`ChinasoNewsSourceType`)."""

time_range_dict = {'day': '24h', 'week': '1w', 'month': '1m', 'year': '1y'}

base_url = "https://www.chinaso.com"


def init(_):
    if chinaso_category not in ('news', 'videos', 'images'):
        raise ValueError(f"Unsupported category: {chinaso_category}")
    if chinaso_category == 'news' and chinaso_news_source not in typing.get_args(ChinasoNewsSourceType):
        raise ValueError(f"Unsupported news source: {chinaso_news_source}")


def request(query, params):
    query_params = {"q": query}

    if time_range_dict.get(params['time_range']):
        query_params["stime"] = time_range_dict[params['time_range']]
        query_params["etime"] = 'now'

    category_config = {
        'news': {'endpoint': '/v5/general/v1/web/search', 'params': {'pn': params["pageno"], 'ps': results_per_page}},
        'images': {
            'endpoint': '/v5/general/v1/search/image',
            'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
        },
        'videos': {
            'endpoint': '/v5/general/v1/search/video',
            'params': {'start_index': (params["pageno"] - 1) * results_per_page, 'rn': results_per_page},
        },
    }
    if chinaso_news_source != 'all':
        if chinaso_news_source == 'EPAPER':
            category_config['news']['params']["type"] = 'EPAPER'
        else:
            category_config['news']['params']["cate"] = chinaso_news_source

    query_params.update(category_config[chinaso_category]['params'])

    params["url"] = f"{base_url}{category_config[chinaso_category]['endpoint']}?{urlencode(query_params)}"

    return params


def response(resp):
    try:
        data = resp.json()
    except Exception as e:
        raise SearxEngineAPIException(f"Invalid response: {e}") from e

    parsers = {'news': parse_news, 'images': parse_images, 'videos': parse_videos}

    return parsers[chinaso_category](data)


def parse_news(data):
    results = []
    if not data.get("data", {}).get("data"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["data"]:
        published_date = None
        if entry.get("timestamp"):
            try:
                published_date = datetime.fromtimestamp(int(entry["timestamp"]))
            except (ValueError, TypeError):
                pass

        results.append(
            {
                'title': html_to_text(entry["title"]),
                'url': entry["url"],
                'content': html_to_text(entry["snippet"]),
                'publishedDate': published_date,
            }
        )
    return results


def parse_images(data):
    results = []
    if not data.get("data", {}).get("arrRes"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["arrRes"]:
        results.append(
            {
                'url': entry["web_url"],
                'title': html_to_text(entry["title"]),
                'content': html_to_text(entry["ImageInfo"]),
                'template': 'images.html',
                'img_src': entry["url"].replace("http://", "https://"),
                'thumbnail_src': entry["largeimage"].replace("http://", "https://"),
            }
        )
    return results


def parse_videos(data):
    results = []
    if not data.get("data", {}).get("arrRes"):
        raise SearxEngineAPIException("Invalid response")

    for entry in data["data"]["arrRes"]:
        published_date = None
        if entry.get("VideoPubDate"):
            try:
                published_date = datetime.fromtimestamp(int(entry["VideoPubDate"]))
            except (ValueError, TypeError):
                pass

        results.append(
            {
                'url': entry["url"],
                'title': html_to_text(entry["raw_title"]),
                'template': 'videos.html',
                'publishedDate': published_date,
                'thumbnail': entry["image_src"].replace("http://", "https://"),
            }
        )
    return results
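A standalone sketch of how ``request()`` above selects the API endpoint and paging parameters per ``chinaso_category`` (news pages use ``pn``/``ps``, images and videos use a zero-based ``start_index``); the helper name and the example query are illustrative.

.. code:: python

    # Standalone sketch of the endpoint/paging selection above; values are
    # illustrative.
    from urllib.parse import urlencode

    results_per_page = 10

    def chinaso_url(query, pageno, category='news'):
        endpoints = {
            'news': ('/v5/general/v1/web/search', {'pn': pageno, 'ps': results_per_page}),
            'images': ('/v5/general/v1/search/image',
                       {'start_index': (pageno - 1) * results_per_page, 'rn': results_per_page}),
            'videos': ('/v5/general/v1/search/video',
                       {'start_index': (pageno - 1) * results_per_page, 'rn': results_per_page}),
        }
        endpoint, args = endpoints[category]
        return f"https://www.chinaso.com{endpoint}?{urlencode({'q': query, **args})}"

    print(chinaso_url("新闻", pageno=2, category='images'))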
68
searx/engines/cloudflareai.py
Normal file
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cloudflare AI engine"""

from json import loads, dumps
from searx.exceptions import SearxEngineAPIException

about = {
    "website": 'https://ai.cloudflare.com',
    "wikidata_id": None,
    "official_api_documentation": 'https://developers.cloudflare.com/workers-ai',
    "use_official_api": True,
    "require_api_key": True,
    "results": 'JSON',
}

cf_account_id = ''
cf_ai_api = ''
cf_ai_gateway = ''

cf_ai_model = ''
cf_ai_model_display_name = 'Cloudflare AI'

# Assistant messages hint to the AI about the desired output format.  Not all models support this role.
cf_ai_model_assistant = 'Keep your answers as short and effective as possible.'
# System messages define the AI's personality.  You can use them to set rules and how you expect the AI to behave.
cf_ai_model_system = 'You are a self-aware language model who is honest and direct about any question from the user.'


def request(query, params):

    params['query'] = query

    params['url'] = f'https://gateway.ai.cloudflare.com/v1/{cf_account_id}/{cf_ai_gateway}/workers-ai/{cf_ai_model}'

    params['method'] = 'POST'

    params['headers']['Authorization'] = f'Bearer {cf_ai_api}'
    params['headers']['Content-Type'] = 'application/json'

    params['data'] = dumps(
        {
            'messages': [
                {'role': 'assistant', 'content': cf_ai_model_assistant},
                {'role': 'system', 'content': cf_ai_model_system},
                {'role': 'user', 'content': params['query']},
            ]
        }
    ).encode('utf-8')

    return params


def response(resp):
    results = []
    json = loads(resp.text)

    if 'error' in json:
        raise SearxEngineAPIException('Cloudflare AI error: ' + json['error'])

    if 'result' in json:
        results.append(
            {
                'content': json['result']['response'],
                'infobox': cf_ai_model_display_name,
            }
        )

    return results
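A standalone sketch of the JSON body that ``request()`` above POSTs to the Workers AI gateway: assistant and system messages frame the behaviour, the user message carries the query. The account id, gateway and model names in the URL are placeholders.

.. code:: python

    # Standalone sketch of the POST body built in request() above; the
    # account id, gateway and model names are placeholders.
    import json

    def build_payload(query, assistant_hint, system_prompt):
        return json.dumps(
            {
                'messages': [
                    {'role': 'assistant', 'content': assistant_hint},
                    {'role': 'system', 'content': system_prompt},
                    {'role': 'user', 'content': query},
                ]
            }
        ).encode('utf-8')

    url = "https://gateway.ai.cloudflare.com/v1/<account_id>/<gateway>/workers-ai/<model>"
    body = build_payload("What is SearXNG?", "Keep answers short.", "You are a helpful assistant.")
    print(url, body[:60])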
243
searx/engines/command.py
Normal file
@@ -0,0 +1,243 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""With *command engines* administrators can run engines to integrate arbitrary
shell commands.

.. attention::

   When creating and enabling a ``command`` engine on a public instance, you
   must be careful to avoid leaking private data.

The easiest solution is to limit the access by setting ``tokens`` as described
in section :ref:`private engines`.  The engine base is flexible.  Only your
imagination can limit the power of this engine (and maybe security concerns).

Configuration
=============

The following options are available:

``command``:
  A comma separated list of the elements of the command.  A special token
  ``{{QUERY}}`` tells where to put the search terms of the user.  Example:

  .. code:: yaml

    ['ls', '-l', '-h', '{{QUERY}}']

``delimiter``:
  A mapping containing a delimiter ``char`` and the *titles* of each element in
  ``keys``.

``parse_regex``:
  A dict containing the regular expressions for each result key.

``query_type``:

  The expected type of user search terms.  Possible values: ``path`` and
  ``enum``.

  ``path``:
    Checks if the user provided path is inside the working directory.  If not,
    the query is not executed.

  ``enum``:
    Is a list of allowed search terms.  If the user submits something which is
    not included in the list, the query returns an error.

``query_enum``:
  A list containing allowed search terms if ``query_type`` is set to ``enum``.

``working_dir``:
  The directory where the command has to be executed.  Default: ``./``.

``result_separator``:
  The character that separates results.  Default: ``\\n``.

Example
=======

The example engine below can be used to find files with a specific name in the
configured working directory:

.. code:: yaml

  - name: find
    engine: command
    command: ['find', '.', '-name', '{{QUERY}}']
    query_type: path
    shortcut: fnd
    delimiter:
        chars: ' '
        keys: ['line']

Implementations
===============
"""

import re
from os.path import expanduser, isabs, realpath, commonprefix
from shlex import split as shlex_split
from subprocess import Popen, PIPE
from threading import Thread

from searx import logger
from searx.result_types import EngineResults


engine_type = 'offline'
paging = True
command = []
delimiter = {}
parse_regex = {}
query_type = ''
query_enum = []
environment_variables = {}
working_dir = realpath('.')
result_separator = '\n'
timeout = 4.0

_command_logger = logger.getChild('command')
_compiled_parse_regex = {}


def init(engine_settings):
    check_parsing_options(engine_settings)

    if 'command' not in engine_settings:
        raise ValueError('engine command : missing configuration key: command')

    global command, working_dir, delimiter, parse_regex, environment_variables  # pylint: disable=global-statement

    command = engine_settings['command']

    if 'working_dir' in engine_settings:
        working_dir = engine_settings['working_dir']
        if not isabs(engine_settings['working_dir']):
            working_dir = realpath(working_dir)

    if 'parse_regex' in engine_settings:
        parse_regex = engine_settings['parse_regex']
        for result_key, regex in parse_regex.items():
            _compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)
    if 'delimiter' in engine_settings:
        delimiter = engine_settings['delimiter']

    if 'environment_variables' in engine_settings:
        environment_variables = engine_settings['environment_variables']


def search(query, params) -> EngineResults:
    res = EngineResults()
    cmd = _get_command_to_run(query)
    if not cmd:
        return res

    reader_thread = Thread(target=_get_results_from_process, args=(res, cmd, params['pageno']))
    reader_thread.start()
    reader_thread.join(timeout=timeout)

    return res


def _get_command_to_run(query):
    params = shlex_split(query)
    __check_query_params(params)

    cmd = []
    for c in command:
        if c == '{{QUERY}}':
            cmd.extend(params)
        else:
            cmd.append(c)

    return cmd


def _get_results_from_process(res: EngineResults, cmd, pageno):
    leftover = ''
    count = 0
    start, end = __get_results_limits(pageno)
    with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
        line = process.stdout.readline()
        while line:
            buf = leftover + line.decode('utf-8')
            raw_results = buf.split(result_separator)
            if raw_results[-1]:
                leftover = raw_results[-1]
                raw_results = raw_results[:-1]

            for raw_result in raw_results:
                result = __parse_single_result(raw_result)
                if result is None:
                    _command_logger.debug('skipped result: %s', raw_result)
                    continue

                if start <= count and count <= end:  # pylint: disable=chained-comparison
                    res.add(res.types.KeyValue(kvmap=result))

                count += 1
                if end < count:
                    return res

            line = process.stdout.readline()

        return_code = process.wait(timeout=timeout)
        if return_code != 0:
            raise RuntimeError('non-zero return code when running command', cmd, return_code)
    return None


def __get_results_limits(pageno):
    start = (pageno - 1) * 10
    end = start + 9
    return start, end


def __check_query_params(params):
    if not query_type:
        return

    if query_type == 'path':
        query_path = params[-1]
        query_path = expanduser(query_path)
        if commonprefix([realpath(query_path), working_dir]) != working_dir:
            raise ValueError('requested path is outside of configured working directory')
    elif query_type == 'enum' and len(query_enum) > 0:
        for param in params:
            if param not in query_enum:
                raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)


def check_parsing_options(engine_settings):
    """Checks if delimiter based parsing or regex parsing is configured correctly"""

    if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
        raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
    if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
        raise ValueError('failed to init settings for parsing lines: too many settings')

    if 'delimiter' in engine_settings:
        if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
            raise ValueError


def __parse_single_result(raw_result):
    """Parses command line output based on configuration"""

    result = {}

    if delimiter:
        elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
        if len(elements) != len(delimiter['keys']):
            return {}
        for i in range(len(elements)):  # pylint: disable=consider-using-enumerate
            result[delimiter['keys'][i]] = elements[i]

    if parse_regex:
        for result_key, regex in _compiled_parse_regex.items():
            found = regex.search(raw_result)
            if not found:
                return {}
            result[result_key] = raw_result[found.start() : found.end()]

    return result
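A standalone sketch of the delimiter-based line parsing performed by ``__parse_single_result()`` above: the output line is split on the configured character, capped at the number of configured keys, and zipped into a key/value result. The sample line and delimiter configuration are made up.

.. code:: python

    # Standalone sketch of the delimiter based parsing above; the sample
    # line and config are fabricated.
    def parse_line(raw_result, delimiter):
        elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
        if len(elements) != len(delimiter['keys']):
            return {}
        return dict(zip(delimiter['keys'], elements))

    delimiter = {'chars': ' ', 'keys': ['permissions', 'size', 'name']}
    print(parse_line("-rw-r--r-- 4.0K notes.txt", delimiter))
    # {'permissions': '-rw-r--r--', 'size': '4.0K', 'name': 'notes.txt'}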
151
searx/engines/core.py
Normal file
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world’s scholarly literature, collecting and indexing
research from repositories and journals.

.. _CORE: https://core.ac.uk/about

.. _core engine config:

Configuration
=============

The engine has the following additional settings:

- :py:obj:`api_key`

.. code:: yaml

  - name: core.ac.uk
    engine: core
    categories: science
    shortcut: cor
    api_key: "..."
    timeout: 5

Implementations
===============

"""
# pylint: disable=too-many-branches

from datetime import datetime
from urllib.parse import urlencode

from searx.exceptions import SearxEngineAPIException

about = {
    "website": 'https://core.ac.uk',
    "wikidata_id": 'Q22661180',
    "official_api_documentation": 'https://api.core.ac.uk/docs/v3',
    "use_official_api": True,
    "require_api_key": True,
    "results": 'JSON',
}

api_key = 'unset'
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""

categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
base_url = 'https://api.core.ac.uk/v3/search/works/'


def request(query, params):
    if api_key == 'unset':
        raise SearxEngineAPIException('missing CORE API key')

    # API v3 uses different parameters
    search_params = {
        'q': query,
        'offset': (params['pageno'] - 1) * nb_per_page,
        'limit': nb_per_page,
        'sort': 'relevance',
    }

    params['url'] = base_url + '?' + urlencode(search_params)
    params['headers'] = {'Authorization': f'Bearer {api_key}'}

    return params


def response(resp):
    results = []
    json_data = resp.json()

    for result in json_data.get('results', []):
        # Get title
        if not result.get('title'):
            continue

        # Get URL - try the DOI first, then fall back to the CORE id and
        # full-text URLs
        url = None

        doi = result.get('doi')
        if doi:
            url = f'https://doi.org/{doi}'
        elif result.get('id'):
            url = 'https://core.ac.uk/works/' + str(result['id'])
        elif result.get('downloadUrl'):
            url = result['downloadUrl']
        elif result.get('sourceFulltextUrls'):
            url = result['sourceFulltextUrls']
        else:
            continue

        # Published date
        published_date = None

        raw_date = result.get('publishedDate') or result.get('depositedDate')
        if raw_date:
            try:
                published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
            except (ValueError, AttributeError):
                pass

        # Handle journals
        journals = []
        if result.get('journals'):
            journals = [j.get('title') for j in result['journals'] if j.get('title')]

        # Handle publisher
        publisher = result.get('publisher', '').strip("'")

        # Handle authors
        authors = set()
        for i in result.get('authors', []):
            name = i.get("name")
            if name:
                authors.add(name)

        results.append(
            {
                'template': 'paper.html',
                'title': result.get('title'),
                'url': url,
                'content': result.get('fullText', '') or '',
                # 'comments': '',
                'tags': result.get('fieldOfStudy', []),
                'publishedDate': published_date,
                'type': result.get('documentType', '') or '',
                'authors': authors,
                'editor': ', '.join(result.get('contributors', [])),
                'publisher': publisher,
                'journal': ', '.join(journals),
                'doi': result.get('doi'),
                # 'issn' : ''
                # 'isbn' : ''
                'pdf_url': result.get('downloadUrl', {}) or result.get("sourceFulltextUrls", {}),
            }
        )

    return results
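A standalone sketch of the URL fallback chain used in ``response()`` above: a DOI link wins, otherwise the CORE id, download URL or full-text URLs are used. The sample records are made up.

.. code:: python

    # Standalone sketch of the URL fallback chain above; sample records are
    # fabricated.
    def pick_url(result):
        if result.get('doi'):
            return f"https://doi.org/{result['doi']}"
        if result.get('id'):
            return f"https://core.ac.uk/works/{result['id']}"
        return result.get('downloadUrl') or result.get('sourceFulltextUrls')

    print(pick_url({'id': 12345}))                    # core.ac.uk fallback
    print(pick_url({'doi': '10.1000/xyz', 'id': 1}))  # DOI wins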
38
searx/engines/cppreference.py
Normal file
@@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cppreference
"""
from lxml import html
from searx.utils import eval_xpath


about = {
    "website": "https://en.cppreference.com/",
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}


categories = ['it']
url = 'https://en.cppreference.com/'
search_url = url + 'mwiki/index.php?title=Special%3ASearch&search={query}'


def request(query, params):
    params['url'] = search_url.format(query=query)
    return params


def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath(dom, '//div[contains(@class, "mw-search-result-heading")]'):
        results.append(
            {
                'url': url + eval_xpath(result, './/a/@href')[0],
                'title': eval_xpath(result, './/a/text()')[0],
            }
        )
    return results
70
searx/engines/crates.py
Normal file
@@ -0,0 +1,70 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cargo search on crates.io"""

from collections import OrderedDict
from urllib.parse import urlencode

from dateutil import parser

about = {
    "website": "https://crates.io/",
    "wikidata_id": None,
    "official_api_documentation": "https://crates.io/data-access",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

categories = ["it", "packages", "cargo"]


# engine dependent config
paging = True
page_size = 10
search_url = "https://crates.io/api/v1/crates"

linked_terms = OrderedDict(
    [
        ("homepage", "Project homepage"),
        ("documentation", "Documentation"),
        ("repository", "Source code"),
    ]
)


def request(query: str, params):

    args = urlencode({"page": params["pageno"], "q": query, "per_page": page_size})
    params["url"] = f"{search_url}?{args}"
    return params


def response(resp):
    results = []

    for package in resp.json()["crates"]:

        published_date = package.get("updated_at")
        published_date = parser.parse(published_date)

        links = {}
        for k, v in linked_terms.items():
            l = package.get(k)
            if l:
                links[v] = l

        results.append(
            {
                "template": "packages.html",
                "url": f'https://crates.io/crates/{package["name"]}',
                "title": package["name"],
                "package_name": package["name"],
                "tags": package["keywords"],
                "content": package["description"],
                "version": package["newest_version"] or package["max_version"] or package["max_stable_version"],
                "publishedDate": published_date,
                "links": links,
            }
        )

    return results
63
searx/engines/crossref.py
Normal file
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CrossRef"""

from urllib.parse import urlencode
from datetime import datetime

about = {
    "website": "https://www.crossref.org/",
    "wikidata_id": "Q5188229",
    "official_api_documentation": "https://api.crossref.org",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.crossref.org/works"


def request(query, params):
    params["url"] = search_url + "?" + urlencode({"query": query, "offset": 20 * (params["pageno"] - 1)})
    return params


def response(resp):
    results = []
    for record in resp.json()["message"]["items"]:

        if record["type"] == "component":
            # These seem to be files published along with papers. Not something you'd search for
            continue
        result = {
            "template": "paper.html",
            "content": record.get("abstract", ""),
            "doi": record.get("DOI"),
            "pages": record.get("page"),
            "publisher": record.get("publisher"),
            "tags": record.get("subject"),
            "type": record.get("type"),
            "url": record.get("URL"),
            "volume": record.get("volume"),
        }
        if record["type"] == "book-chapter":
            result["title"] = record["container-title"][0]
            if record["title"][0].lower().strip() != result["title"].lower().strip():
                result["title"] += f" ({record['title'][0]})"
        else:
            result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
            result["journal"] = record.get("container-title", [None])[0] if "title" in record else None

        if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
            result["url"] = record["resource"]["primary"]["URL"]
        if "published" in record and "date-parts" in record["published"]:
            result["publishedDate"] = datetime(*(record["published"]["date-parts"][0] + [1, 1])[:3])
        result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
        result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]
        # All the links are not PDFs, even if the URL ends with ".pdf"
        # result["pdf_url"] = record.get("link", [{"URL": None}])[0]["URL"]

        results.append(result)

    return results
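A standalone sketch of the ``date-parts`` handling above: CrossRef may return only a year, or a year and month, so the list is padded with month/day 1 and trimmed to three values before building a ``datetime``.

.. code:: python

    # Standalone sketch of the date-parts padding above.
    from datetime import datetime

    def from_date_parts(date_parts):
        return datetime(*(date_parts + [1, 1])[:3])

    print(from_date_parts([2021]))         # 2021-01-01 00:00:00
    print(from_date_parts([2021, 7]))      # 2021-07-01 00:00:00
    print(from_date_parts([2021, 7, 19]))  # 2021-07-19 00:00:00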
53
searx/engines/currency_convert.py
Normal file
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Currency convert (DuckDuckGo)
"""

import json
from searx.result_types import EngineResults

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSONP',
    "description": "Service from DuckDuckGo.",
}

engine_type = 'online_currency'
categories = []
base_url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100

https_support = True


def request(_query, params):
    params['url'] = base_url.format(params['from'], params['to'])
    return params


def response(resp) -> EngineResults:
    res = EngineResults()

    # remove first and last lines to get only json
    json_resp = resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2]
    try:
        conversion_rate = float(json.loads(json_resp)["to"][0]["mid"])
    except IndexError:
        return res
    answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
        resp.search_params['amount'],
        resp.search_params['from'],
        resp.search_params['amount'] * conversion_rate,
        resp.search_params['to'],
        conversion_rate,
        resp.search_params['from_name'],
        resp.search_params['to_name'],
    )

    url = f"https://duckduckgo.com/?q={resp.search_params['from']}+to+{resp.search_params['to']}"
    res.add(res.types.Answer(answer=answer, url=url))
    return res
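A standalone sketch of the JSONP unwrapping in ``response()`` above; the sample payload only mimics the shape of the DuckDuckGo "spice" wrapper (callback on the first line, JSON followed by ``);`` before the final newline) and is made up for illustration.

.. code:: python

    # Standalone sketch of the JSONP unwrapping above; the sample payload is
    # fabricated and assumes the wrapper layout described in the lead-in.
    import json

    sample = 'ddg_spice_currency(\n{"to": [{"mid": 0.92}]});\n'
    inner = sample[sample.find('\n') + 1 : sample.rfind('\n') - 2]
    rate = float(json.loads(inner)["to"][0]["mid"])
    print(rate)  # 0.92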
251
searx/engines/dailymotion.py
Normal file
251
searx/engines/dailymotion.py
Normal file
@@ -0,0 +1,251 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Dailymotion (Videos)
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. _REST GET: https://developers.dailymotion.com/tools/
|
||||
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
|
||||
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
|
||||
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlencode
|
||||
import time
|
||||
import babel
|
||||
|
||||
from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762
|
||||
from searx.utils import html_to_text
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.locales import region_tag, language_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.dailymotion.com',
|
||||
"wikidata_id": 'Q769222',
|
||||
"official_api_documentation": 'https://www.dailymotion.com/developer',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos']
|
||||
paging = True
|
||||
number_of_results = 10
|
||||
|
||||
time_range_support = True
|
||||
time_delta_dict = {
|
||||
"day": timedelta(days=1),
|
||||
"week": timedelta(days=7),
|
||||
"month": timedelta(days=31),
|
||||
"year": timedelta(days=365),
|
||||
}
|
||||
|
||||
safesearch = True
|
||||
safesearch_params = {
|
||||
2: {'is_created_for_kids': 'true'},
|
||||
1: {'is_created_for_kids': 'true'},
|
||||
0: {},
|
||||
}
|
||||
"""True if this video is "Created for Kids" / intends to target an audience
|
||||
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
|
||||
"""
|
||||
|
||||
family_filter_map = {
|
||||
2: 'true',
|
||||
1: 'true',
|
||||
0: 'false',
|
||||
}
|
||||
"""By default, the family filter is turned on. Setting this parameter to
|
||||
``false`` will stop filtering-out explicit content from searches and global
|
||||
contexts (``family_filter`` in `Global API Parameters`_ ).
|
||||
"""
|
||||
|
||||
result_fields = [
|
||||
'allow_embed',
|
||||
'description',
|
||||
'title',
|
||||
'created_time',
|
||||
'duration',
|
||||
'url',
|
||||
'thumbnail_360_url',
|
||||
'id',
|
||||
]
|
||||
"""`Fields selection`_, by default, a few fields are returned. To request more
|
||||
specific fields, the ``fields`` parameter is used with the list of fields
|
||||
SearXNG needs in the response to build a video result list.
|
||||
"""
|
||||
|
||||
search_url = 'https://api.dailymotion.com/videos?'
|
||||
"""URL to retrieve a list of videos.
|
||||
|
||||
- `REST GET`_
|
||||
- `Global API Parameters`_
|
||||
- `Video filters API`_
|
||||
"""
|
||||
|
||||
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
|
||||
"""URL template to embed video in SearXNG's result list."""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
if not query:
|
||||
return False
|
||||
|
||||
eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore
|
||||
eng_lang = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
args = {
|
||||
'search': query,
|
||||
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
|
||||
'thumbnail_ratio': 'original', # original|widescreen|square
|
||||
# https://developers.dailymotion.com/api/#video-filters
|
||||
'languages': eng_lang,
|
||||
'page': params['pageno'],
|
||||
'password_protected': 'false',
|
||||
'private': 'false',
|
||||
'sort': 'relevance',
|
||||
'limit': number_of_results,
|
||||
'fields': ','.join(result_fields),
|
||||
}
|
||||
|
||||
args.update(safesearch_params.get(params['safesearch'], {}))
|
||||
|
||||
# Don't add localization and country arguments if the user only selects a
|
||||
# language (:de, :en, ..)
|
||||
|
||||
if len(params['searxng_locale'].split('-')) > 1:
|
||||
# https://developers.dailymotion.com/api/#global-parameters
|
||||
args['localization'] = eng_region
|
||||
args['country'] = eng_region.split('_')[1]
|
||||
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
|
||||
# 'ams_country': eng_region.split('_')[1],
|
||||
|
||||
time_delta = time_delta_dict.get(params["time_range"])
|
||||
if time_delta:
|
||||
created_after = datetime.now() - time_delta
|
||||
args['created_after'] = datetime.timestamp(created_after)
|
||||
|
||||
query_str = urlencode(args)
|
||||
params['url'] = search_url + query_str
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_res = resp.json()
|
||||
|
||||
# check for an API error
|
||||
if 'error' in search_res:
|
||||
raise SearxEngineAPIException(search_res['error'].get('message'))
|
||||
|
||||
raise_for_httperror(resp)
|
||||
|
||||
# parse results
|
||||
for res in search_res.get('list', []):
|
||||
|
||||
title = res['title']
|
||||
url = res['url']
|
||||
|
||||
content = html_to_text(res['description'])
|
||||
if len(content) > 300:
|
||||
content = content[:300] + '...'
|
||||
|
||||
publishedDate = datetime.fromtimestamp(res['created_time'], None)
|
||||
|
||||
length = time.gmtime(res.get('duration'))
|
||||
if length.tm_hour:
|
||||
length = time.strftime("%H:%M:%S", length)
|
||||
else:
|
||||
length = time.strftime("%M:%S", length)
|
||||
|
||||
thumbnail = res['thumbnail_360_url']
|
||||
thumbnail = thumbnail.replace("http://", "https://")
|
||||
|
||||
item = {
|
||||
'template': 'videos.html',
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'publishedDate': publishedDate,
|
||||
'length': length,
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
|
||||
# HINT: no matter what the value is, without an API token videos can't be shown
|
||||
# embedded
|
||||
if res['allow_embed']:
|
||||
item['iframe_src'] = iframe_src.format(video_id=res['id'])
|
||||
|
||||
results.append(item)
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch locales & languages from dailymotion.
|
||||
|
||||
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
|
||||
There are duplications in the locale codes returned from Dailymotion which
|
||||
can be ignored::
|
||||
|
||||
en_EN --> en_GB, en_US
|
||||
ar_AA --> ar_EG, ar_AE, ar_SA
|
||||
|
||||
The language list `api/languages <https://api.dailymotion.com/languages>`_
|
||||
contains over 7000 *language* codes (see PR1071_). We use only those
|
||||
language codes that are used in the locales.
|
||||
|
||||
.. _PR1071: https://github.com/searxng/searxng/pull/1071
|
||||
|
||||
"""
|
||||
|
||||
resp = get('https://api.dailymotion.com/locales')
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from dailymotion/locales is not OK.")
|
||||
|
||||
for item in resp.json()['list']: # type: ignore
|
||||
eng_tag = item['locale']
|
||||
if eng_tag in ('en_EN', 'ar_AA'):
|
||||
continue
|
||||
try:
|
||||
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: item unknown --> %s" % item)
|
||||
continue
|
||||
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_tag
|
||||
|
||||
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
|
||||
|
||||
resp = get('https://api.dailymotion.com/languages')
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from dailymotion/languages is not OK.")
|
||||
|
||||
for item in resp.json()['list']: # type: ignore
|
||||
eng_tag = item['code']
|
||||
if eng_tag in locale_lang_list:
|
||||
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
|
||||
engine_traits.languages[sxng_tag] = eng_tag
|
||||
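The time_range support above maps SearXNG's symbolic ranges to a created_after Unix timestamp for the Dailymotion API. A short sketch of that conversion, reusing the same mapping (the helper name is ours, not part of the engine):

from datetime import datetime, timedelta

time_delta_dict = {
    "day": timedelta(days=1),
    "week": timedelta(days=7),
    "month": timedelta(days=31),
    "year": timedelta(days=365),
}

def created_after_arg(time_range: str) -> dict:
    """Extra query argument for a given time range, empty dict if unsupported."""
    delta = time_delta_dict.get(time_range)
    if not delta:
        return {}
    # epoch seconds, e.g. {"created_after": 1718000000.0}
    return {"created_after": datetime.timestamp(datetime.now() - delta)}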
52
searx/engines/deepl.py
Normal file
52
searx/engines/deepl.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Deepl translation engine"""
|
||||
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
about = {
|
||||
"website": 'https://deepl.com',
|
||||
"wikidata_id": 'Q43968444',
|
||||
"official_api_documentation": 'https://www.deepl.com/docs-api',
|
||||
"use_official_api": True,
|
||||
"require_api_key": True,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
engine_type = 'online_dictionary'
|
||||
categories = ['general', 'translate']
|
||||
|
||||
url = 'https://api-free.deepl.com/v2/translate'
|
||||
api_key = None
|
||||
|
||||
|
||||
def request(_query, params):
|
||||
'''pre-request callback
|
||||
|
||||
params<dict>:
|
||||
|
||||
- ``method`` : POST/GET
|
||||
- ``headers``: {}
|
||||
- ``data``: {} # if method == POST
|
||||
- ``url``: ''
|
||||
- ``category``: 'search category'
|
||||
- ``pageno``: 1 # number of the requested page
|
||||
'''
|
||||
|
||||
params['url'] = url
|
||||
params['method'] = 'POST'
|
||||
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
|
||||
res = EngineResults()
|
||||
data = resp.json()
|
||||
if not data.get('translations'):
|
||||
return res
|
||||
|
||||
translations = [res.types.Translations.Item(text=t['text']) for t in data['translations']]
|
||||
res.add(res.types.Translations(translations=translations))
|
||||
|
||||
return res
|
||||
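The response handler above only needs the translations list from DeepL's JSON reply; everything else is ignored. A minimal sketch of that parsing step against a hand-written payload (the payload is shaped to match what the parser expects, not captured from the API):

sample = {
    "translations": [
        {"detected_source_language": "EN", "text": "Hallo Welt"},
    ]
}

# each entry becomes a Translations.Item in the engine
texts = [t["text"] for t in sample.get("translations", [])]  # ['Hallo Welt']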
61
searx/engines/deezer.py
Normal file
61
searx/engines/deezer.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Deezer (Music)
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://deezer.com',
|
||||
"wikidata_id": 'Q602243',
|
||||
"official_api_documentation": 'https://developers.deezer.com/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['music']
|
||||
paging = True
|
||||
|
||||
# search-url
|
||||
url = 'https://api.deezer.com/'
|
||||
search_url = url + 'search?{query}&index={offset}'
|
||||
iframe_src = "https://www.deezer.com/plugins/player?type=tracks&id={audioid}"
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = (params['pageno'] - 1) * 25
|
||||
|
||||
params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_res = loads(resp.text)
|
||||
|
||||
# parse results
|
||||
for result in search_res.get('data', []):
|
||||
if result['type'] == 'track':
|
||||
title = result['title']
|
||||
url = result['link'] # pylint: disable=redefined-outer-name
|
||||
|
||||
if url.startswith('http://'):
|
||||
url = 'https' + url[4:]
|
||||
|
||||
content = '{} - {} - {}'.format(result['artist']['name'], result['album']['title'], result['title'])
|
||||
|
||||
# append result
|
||||
results.append(
|
||||
{'url': url, 'title': title, 'iframe_src': iframe_src.format(audioid=result['id']), 'content': content}
|
||||
)
|
||||
|
||||
# return results
|
||||
return results
|
||||
86
searx/engines/demo_offline.py
Normal file
86
searx/engines/demo_offline.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Within this module we implement a *demo offline engine*. Do not look to
|
||||
closely at the implementation, it's just a simple example. To make use of this
|
||||
*demo* engine add the following entry to your engines list in ``settings.yml``:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: my offline engine
|
||||
engine: demo_offline
|
||||
shortcut: demo
|
||||
disabled: false
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from searx.result_types import EngineResults
|
||||
from searx.enginelib import EngineCache
|
||||
|
||||
engine_type = 'offline'
|
||||
categories = ['general']
|
||||
disabled = True
|
||||
timeout = 2.0
|
||||
|
||||
about = {
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# if there is a need for globals, use a leading underscore
|
||||
_my_offline_engine: str = ""
|
||||
|
||||
CACHE: EngineCache
|
||||
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
|
||||
seconds."""
|
||||
|
||||
|
||||
def init(engine_settings):
|
||||
"""Initialization of the (offline) engine. The origin of this demo engine is a
|
||||
simple json string which is loaded in this example while the engine is
|
||||
initialized."""
|
||||
global _my_offline_engine, CACHE # pylint: disable=global-statement
|
||||
|
||||
CACHE = EngineCache(engine_settings["name"]) # type:ignore
|
||||
|
||||
_my_offline_engine = (
|
||||
'[ {"value": "%s"}'
|
||||
', {"value":"first item"}'
|
||||
', {"value":"second item"}'
|
||||
', {"value":"third item"}'
|
||||
']' % engine_settings.get('name')
|
||||
)
|
||||
|
||||
|
||||
def search(query, request_params) -> EngineResults:
|
||||
"""Query (offline) engine and return results. Assemble the list of results
|
||||
from your local engine. In this demo engine we ignore the 'query' term,
|
||||
usually you would pass the 'query' term to your local engine to filter the
|
||||
results.
|
||||
"""
|
||||
res = EngineResults()
|
||||
count = CACHE.get("count", 0)
|
||||
|
||||
for row in json.loads(_my_offline_engine):
|
||||
count += 1
|
||||
kvmap = {
|
||||
'query': query,
|
||||
'language': request_params['searxng_locale'],
|
||||
'value': row.get("value"),
|
||||
}
|
||||
res.add(
|
||||
res.types.KeyValue(
|
||||
caption=f"Demo Offline Engine Result #{count}",
|
||||
key_title="Name",
|
||||
value_title="Value",
|
||||
kvmap=kvmap,
|
||||
)
|
||||
)
|
||||
res.add(res.types.LegacyResult(number_of_results=count))
|
||||
|
||||
# cache counter value for 20sec
|
||||
CACHE.set("count", count, expire=20)
|
||||
return res
|
||||
106
searx/engines/demo_online.py
Normal file
106
searx/engines/demo_online.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Within this module we implement a *demo online engine*. Do not look to
|
||||
closely at the implementation, it's just a simple example which queries `The Art
|
||||
Institute of Chicago <https://www.artic.edu>`_
|
||||
|
||||
To make use of this *demo* engine add the following entry to your engines
|
||||
list in ``settings.yml``:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: my online engine
|
||||
engine: demo_online
|
||||
shortcut: demo
|
||||
disabled: false
|
||||
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
engine_type = 'online'
|
||||
send_accept_language_header = True
|
||||
categories = ['general']
|
||||
disabled = True
|
||||
timeout = 2.0
|
||||
categories = ['images']
|
||||
paging = True
|
||||
page_size = 20
|
||||
|
||||
search_api = 'https://api.artic.edu/api/v1/artworks/search?'
|
||||
image_api = 'https://www.artic.edu/iiif/2/'
|
||||
|
||||
about = {
|
||||
"website": 'https://www.artic.edu',
|
||||
"wikidata_id": 'Q239303',
|
||||
"official_api_documentation": 'http://api.artic.edu/docs/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
|
||||
# if there is a need for globals, use a leading underscore
|
||||
_my_online_engine = None
|
||||
|
||||
|
||||
def init(engine_settings):
|
||||
"""Initialization of the (online) engine. If no initialization is needed, drop
|
||||
this init function.
|
||||
|
||||
"""
|
||||
global _my_online_engine # pylint: disable=global-statement
|
||||
_my_online_engine = engine_settings.get('name')
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Build up the ``params`` for the online request. In this example we build a
|
||||
URL to fetch images from `artic.edu <https://artic.edu>`__
|
||||
|
||||
"""
|
||||
args = urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'page': params['pageno'],
|
||||
'fields': 'id,title,artist_display,medium_display,image_id,date_display,dimensions,artist_titles',
|
||||
'limit': page_size,
|
||||
}
|
||||
)
|
||||
params['url'] = search_api + args
|
||||
return params
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
"""Parse out the result items from the response. In this example we parse the
|
||||
response from `api.artic.edu <https://artic.edu>`__ and pick out all
|
||||
images.
|
||||
|
||||
"""
|
||||
res = EngineResults()
|
||||
json_data = loads(resp.text)
|
||||
|
||||
res.add(
|
||||
res.types.Answer(
|
||||
answer="this is a dummy answer ..",
|
||||
url="https://example.org",
|
||||
)
|
||||
)
|
||||
|
||||
for result in json_data['data']:
|
||||
|
||||
if not result['image_id']:
|
||||
continue
|
||||
|
||||
res.append(
|
||||
{
|
||||
'url': 'https://artic.edu/artworks/%(id)s' % result,
|
||||
'title': result['title'] + " (%(date_display)s) // %(artist_display)s" % result,
|
||||
'content': "%(medium_display)s // %(dimensions)s" % result,
|
||||
'author': ', '.join(result['artist_titles']),
|
||||
'img_src': image_api + '/%(image_id)s/full/843,/0/default.jpg' % result,
|
||||
'template': 'images.html',
|
||||
}
|
||||
)
|
||||
|
||||
return res
|
||||
67
searx/engines/destatis.py
Normal file
67
searx/engines/destatis.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""DeStatis
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import eval_xpath, eval_xpath_list, extract_text
|
||||
|
||||
about = {
|
||||
'website': 'https://www.destatis.de',
|
||||
'official_api_documentation': 'https://destatis.api.bund.dev/',
|
||||
'use_official_api': False,
|
||||
'require_api_key': False,
|
||||
'results': 'HTML',
|
||||
'language': 'de',
|
||||
}
|
||||
|
||||
categories = []
|
||||
paging = True
|
||||
|
||||
base_url = "https://www.destatis.de"
|
||||
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"
|
||||
|
||||
# pylint: disable-next=line-too-long
|
||||
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
|
||||
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
|
||||
url_xpath = './/a/@href'
|
||||
title_xpath = './/a/text()'
|
||||
date_xpath = './/a/span[contains(@class, "c-result__date")]'
|
||||
content_xpath = './/div[contains(@class, "column")]/p/text()'
|
||||
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
'templateQueryString': query,
|
||||
'gtp': f"474_list%3D{params['pageno']}",
|
||||
}
|
||||
params['url'] = f"{search_url}?{urlencode(args)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# filter out suggested results on further pages because they're the same on each page
|
||||
extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
|
||||
res_xpath = results_xpath.format(extra=extra_xpath)
|
||||
|
||||
for result in eval_xpath_list(dom, res_xpath):
|
||||
doctype = extract_text(eval_xpath(result, doctype_xpath))
|
||||
date = extract_text(eval_xpath(result, date_xpath))
|
||||
|
||||
metadata = [meta for meta in (doctype, date) if meta != ""]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
|
||||
'title': extract_text(eval_xpath(result, title_xpath)),
|
||||
'content': extract_text(eval_xpath(result, content_xpath)),
|
||||
'metadata': ', '.join(metadata),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
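The {extra} placeholder in results_xpath is what lets the engine drop the "recommended" hits that DeStatis repeats on every result page after the first. A condensed sketch of that selection logic (the helper function is ours, only the XPath strings come from the engine):

results_xpath = (
    '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]'
    '/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
)
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"

def build_results_xpath(pageno: int) -> str:
    # page 1 keeps the recommended results, later pages filter out the duplicates
    extra = results_xpath_filter_recommended if pageno > 1 else ''
    return results_xpath.format(extra=extra)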
87
searx/engines/deviantart.py
Normal file
87
searx/engines/deviantart.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Deviantart (Images)
|
||||
|
||||
"""
|
||||
|
||||
import urllib.parse
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.deviantart.com/',
|
||||
"wikidata_id": 'Q46523',
|
||||
"official_api_documentation": 'https://www.deviantart.com/developers/',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images']
|
||||
paging = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.deviantart.com'
|
||||
|
||||
results_xpath = '//div[@class="_2pZkk"]/div/div/a'
|
||||
url_xpath = './@href'
|
||||
thumbnail_src_xpath = './div/img/@src'
|
||||
img_src_xpath = './div/img/@srcset'
|
||||
title_xpath = './@aria-label'
|
||||
premium_xpath = '../div/div/div/text()'
|
||||
premium_keytext = 'Watch the artist to view this deviation'
|
||||
cursor_xpath = '(//a[@class="_1OGeq"]/@href)[last()]'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
# https://www.deviantart.com/search?q=foo
|
||||
|
||||
nextpage_url = params['engine_data'].get('nextpage')
|
||||
# don't use nextpage when user selected to jump back to page 1
|
||||
if params['pageno'] > 1 and nextpage_url is not None:
|
||||
params['url'] = nextpage_url
|
||||
else:
|
||||
params['url'] = f"{base_url}/search?{urllib.parse.urlencode({'q': query})}"
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, results_xpath):
|
||||
# skip images that are blurred
|
||||
_text = extract_text(eval_xpath(result, premium_xpath))
|
||||
if _text and premium_keytext in _text:
|
||||
continue
|
||||
img_src = extract_text(eval_xpath(result, img_src_xpath))
|
||||
if img_src:
|
||||
img_src = img_src.split(' ')[0]
|
||||
parsed_url = urllib.parse.urlparse(img_src)
|
||||
img_src = parsed_url._replace(path=parsed_url.path.split('/v1')[0]).geturl()
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': extract_text(eval_xpath(result, url_xpath)),
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': extract_text(eval_xpath(result, thumbnail_src_xpath)),
|
||||
'title': extract_text(eval_xpath(result, title_xpath)),
|
||||
}
|
||||
)
|
||||
|
||||
nextpage_url = extract_text(eval_xpath(dom, cursor_xpath))
|
||||
if nextpage_url:
|
||||
results.append(
|
||||
{
|
||||
'engine_data': nextpage_url.replace("http://", "https://"),
|
||||
'key': 'nextpage',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
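The img_src handling above reduces a srcset attribute to its first URL and cuts everything from "/v1" onwards so the un-resized image is linked. A standalone sketch with an invented srcset value (the URL is an assumption, only the string handling mirrors the engine):

import urllib.parse

# invented example of a srcset attribute with two resized candidates
srcset = (
    "https://images.example/f/abc/artwork.jpg/v1/fill/w_300,h_400/artwork.jpg 300w, "
    "https://images.example/f/abc/artwork.jpg/v1/fill/w_600,h_800/artwork.jpg 600w"
)

img_src = srcset.split(' ')[0]  # first candidate URL
parsed_url = urllib.parse.urlparse(img_src)
# drop the "/v1/..." resizing suffix from the path
img_src = parsed_url._replace(path=parsed_url.path.split('/v1')[0]).geturl()
# -> https://images.example/f/abc/artwork.jpg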
105
searx/engines/dictzone.py
Normal file
105
searx/engines/dictzone.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Dictzone
|
||||
"""
|
||||
|
||||
import urllib.parse
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import eval_xpath, extract_text
|
||||
from searx.result_types import EngineResults
|
||||
from searx.network import get as http_get # https://github.com/searxng/searxng/issues/762
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://dictzone.com/',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
engine_type = 'online_dictionary'
|
||||
categories = ['general', 'translate']
|
||||
base_url = "https://dictzone.com"
|
||||
weight = 100
|
||||
https_support = True
|
||||
|
||||
|
||||
def request(query, params): # pylint: disable=unused-argument
|
||||
|
||||
from_lang = params["from_lang"][2] # "english"
|
||||
to_lang = params["to_lang"][2] # "german"
|
||||
query = params["query"]
|
||||
|
||||
params["url"] = f"{base_url}/{from_lang}-{to_lang}-dictionary/{urllib.parse.quote_plus(query)}"
|
||||
return params
|
||||
|
||||
|
||||
def _clean_up_node(node):
|
||||
for x in ["./i", "./span", "./button"]:
|
||||
for n in node.xpath(x):
|
||||
n.getparent().remove(n)
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
results = EngineResults()
|
||||
|
||||
item_list = []
|
||||
|
||||
if not resp.ok:
|
||||
return results
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath(dom, ".//table[@id='r']//tr"):
|
||||
|
||||
# each row is a Translations.Item
|
||||
|
||||
td_list = result.xpath("./td")
|
||||
if len(td_list) != 2:
|
||||
# ignore header rows ("tr/th")
|
||||
continue
|
||||
|
||||
col_from, col_to = td_list
|
||||
_clean_up_node(col_from)
|
||||
|
||||
text = f"{extract_text(col_from)}"
|
||||
|
||||
synonyms = []
|
||||
p_list = col_to.xpath(".//p")
|
||||
|
||||
for i, p_item in enumerate(p_list):
|
||||
|
||||
smpl: str = extract_text(p_list[i].xpath("./i[@class='smpl']")) # type: ignore
|
||||
_clean_up_node(p_item)
|
||||
p_text: str = extract_text(p_item) # type: ignore
|
||||
|
||||
if smpl:
|
||||
p_text += " // " + smpl
|
||||
|
||||
if i == 0:
|
||||
text += f" : {p_text}"
|
||||
continue
|
||||
|
||||
synonyms.append(p_text)
|
||||
|
||||
item = results.types.Translations.Item(text=text, synonyms=synonyms)
|
||||
item_list.append(item)
|
||||
|
||||
# the "autotranslate" of dictzone is loaded by the JS from URL:
|
||||
# https://dictzone.com/trans/hello%20world/en_de
|
||||
|
||||
from_lang = resp.search_params["from_lang"][1] # "en"
|
||||
to_lang = resp.search_params["to_lang"][1] # "de"
|
||||
query = resp.search_params["query"]
|
||||
|
||||
# works only sometimes?
|
||||
autotranslate = http_get(f"{base_url}/trans/{query}/{from_lang}_{to_lang}", timeout=1.0)
|
||||
if autotranslate.ok and autotranslate.text:
|
||||
item_list.insert(0, results.types.Translations.Item(text=autotranslate.text))
|
||||
|
||||
if item_list:
|
||||
results.add(results.types.Translations(translations=item_list, url=resp.search_params["url"]))
|
||||
return results
|
||||
64
searx/engines/digbt.py
Normal file
64
searx/engines/digbt.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DigBT (Videos, Music, Files)
|
||||
"""
|
||||
|
||||
from urllib.parse import urljoin
|
||||
from lxml import html
|
||||
from searx.utils import extract_text
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://digbt.org',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['videos', 'music', 'files']
|
||||
paging = True
|
||||
|
||||
URL = 'https://digbt.org'
|
||||
SEARCH_URL = URL + '/search/{query}-time-{pageno}'
|
||||
FILESIZE = 3
|
||||
FILESIZE_MULTIPLIER = 4
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
dom = html.fromstring(resp.text)
|
||||
search_res = dom.xpath('.//td[@class="x-item"]')
|
||||
|
||||
if not search_res:
|
||||
return []
|
||||
|
||||
results = []
|
||||
for result in search_res:
|
||||
url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
|
||||
title = extract_text(result.xpath('.//a[@title]'))
|
||||
content = extract_text(result.xpath('.//div[@class="files"]'))
|
||||
files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
|
||||
filesize = f"{files_data[FILESIZE]} {files_data[FILESIZE_MULTIPLIER]}"
|
||||
magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'filesize': filesize,
|
||||
'magnetlink': magnetlink,
|
||||
'seed': 'N/A',
|
||||
'leech': 'N/A',
|
||||
'template': 'torrent.html',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
181
searx/engines/discourse.py
Normal file
181
searx/engines/discourse.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
""".. sidebar:: info
|
||||
|
||||
- `builtwith.com Discourse <https://trends.builtwith.com/websitelist/Discourse>`_
|
||||
|
||||
Discourse is an open source Internet forum system. To search in a forum this
|
||||
engine offers some additional settings:
|
||||
|
||||
- :py:obj:`base_url`
|
||||
- :py:obj:`api_order`
|
||||
- :py:obj:`search_endpoint`
|
||||
- :py:obj:`show_avatar`
|
||||
- :py:obj:`api_key`
|
||||
- :py:obj:`api_username`
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
To search in your favorite Discourse forum, add a configuration like shown here
|
||||
for the ``paddling.com`` forum:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: paddling
|
||||
engine: discourse
|
||||
shortcut: paddle
|
||||
base_url: 'https://forums.paddling.com/'
|
||||
api_order: views
|
||||
categories: ['social media', 'sports']
|
||||
show_avatar: true
|
||||
|
||||
If the forum is private, you need to add an API key and username for the search:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: paddling
|
||||
engine: discourse
|
||||
shortcut: paddle
|
||||
base_url: 'https://forums.paddling.com/'
|
||||
api_order: views
|
||||
categories: ['social media', 'sports']
|
||||
show_avatar: true
|
||||
api_key: '<KEY>'
|
||||
api_username: 'system'
|
||||
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime, timedelta
|
||||
import html
|
||||
|
||||
from dateutil import parser
|
||||
|
||||
from flask_babel import gettext
|
||||
|
||||
about = {
|
||||
"website": "https://discourse.org/",
|
||||
"wikidata_id": "Q15054354",
|
||||
"official_api_documentation": "https://docs.discourse.org/",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
base_url: str = None # type: ignore
|
||||
"""URL of the Discourse forum."""
|
||||
|
||||
search_endpoint = '/search.json'
|
||||
"""URL path of the `search endpoint`_.
|
||||
|
||||
.. _search endpoint: https://docs.discourse.org/#tag/Search
|
||||
"""
|
||||
|
||||
api_order = 'likes'
|
||||
"""Order method, valid values are: ``latest``, ``likes``, ``views``, ``latest_topic``"""
|
||||
|
||||
show_avatar = False
|
||||
"""Show avatar of the user who send the post."""
|
||||
|
||||
api_key = ''
|
||||
"""API key of the Discourse forum."""
|
||||
|
||||
api_username = ''
|
||||
"""API username of the Discourse forum."""
|
||||
|
||||
paging = True
|
||||
time_range_support = True
|
||||
|
||||
AGO_TIMEDELTA = {
|
||||
'day': timedelta(days=1),
|
||||
'week': timedelta(days=7),
|
||||
'month': timedelta(days=31),
|
||||
'year': timedelta(days=365),
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
if len(query) <= 2:
|
||||
return None
|
||||
|
||||
q = [query, f'order:{api_order}']
|
||||
time_range = params.get('time_range')
|
||||
if time_range:
|
||||
after_date = datetime.now() - AGO_TIMEDELTA[time_range]
|
||||
q.append('after:' + after_date.strftime('%Y-%m-%d'))
|
||||
|
||||
args = {
|
||||
'q': ' '.join(q),
|
||||
'page': params['pageno'],
|
||||
}
|
||||
|
||||
params['url'] = f'{base_url}{search_endpoint}?{urlencode(args)}'
|
||||
params['headers'] = {
|
||||
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
}
|
||||
|
||||
if api_key != '':
|
||||
params['headers']['Api-Key'] = api_key
|
||||
|
||||
if api_username != '':
|
||||
params['headers']['Api-Username'] = api_username
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
json_data = resp.json()
|
||||
|
||||
if 'topics' not in json_data or 'posts' not in json_data:
|
||||
return []
|
||||
|
||||
topics = {}
|
||||
|
||||
for item in json_data['topics']:
|
||||
topics[item['id']] = item
|
||||
|
||||
for post in json_data['posts']:
|
||||
result = topics.get(post['topic_id'], {})
|
||||
|
||||
url = f"{base_url}/p/{post['id']}"
|
||||
status = gettext("closed") if result.get('closed', '') else gettext("open")
|
||||
comments = result.get('posts_count', 0)
|
||||
publishedDate = parser.parse(result['created_at'])
|
||||
|
||||
metadata = []
|
||||
metadata.append('@' + post.get('username', ''))
|
||||
|
||||
if int(comments) > 1:
|
||||
metadata.append(f'{gettext("comments")}: {comments}')
|
||||
|
||||
if result.get('has_accepted_answer'):
|
||||
metadata.append(gettext("answered"))
|
||||
elif int(comments) > 1:
|
||||
metadata.append(status)
|
||||
|
||||
result = {
|
||||
'url': url,
|
||||
'title': html.unescape(result['title']),
|
||||
'content': html.unescape(post.get('blurb', '')),
|
||||
'metadata': ' | '.join(metadata),
|
||||
'publishedDate': publishedDate,
|
||||
'upstream': {'topics': result},
|
||||
}
|
||||
|
||||
avatar = post.get('avatar_template', '').replace('{size}', '96')
|
||||
if show_avatar and avatar:
|
||||
result['thumbnail'] = base_url + avatar
|
||||
|
||||
results.append(result)
|
||||
|
||||
results.append({'number_of_results': len(json_data['topics'])})
|
||||
|
||||
return results
|
||||
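The request above folds both the ordering and the time filter into Discourse's q string instead of sending them as separate parameters. A short sketch of how that query string is assembled (the helper and the example values are ours):

from datetime import datetime, timedelta

AGO_TIMEDELTA = {
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=31),
    'year': timedelta(days=365),
}

def build_search_args(query: str, api_order: str, time_range: str | None, pageno: int) -> dict:
    q = [query, f'order:{api_order}']
    if time_range:
        after_date = datetime.now() - AGO_TIMEDELTA[time_range]
        q.append('after:' + after_date.strftime('%Y-%m-%d'))
    return {'q': ' '.join(q), 'page': pageno}

# build_search_args('sea kayak', 'views', 'month', 1)
# -> {'q': 'sea kayak order:views after:<YYYY-MM-DD>', 'page': 1}  (date depends on "today")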
71
searx/engines/docker_hub.py
Normal file
71
searx/engines/docker_hub.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Docker Hub (IT)
|
||||
|
||||
"""
|
||||
# pylint: disable=use-dict-literal
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
about = {
|
||||
"website": 'https://hub.docker.com',
|
||||
"wikidata_id": 'Q100769064',
|
||||
"official_api_documentation": 'https://docs.docker.com/registry/spec/api/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
categories = ['it', 'packages'] # optional
|
||||
paging = True
|
||||
|
||||
base_url = "https://hub.docker.com"
|
||||
page_size = 10
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
"query": query,
|
||||
"from": page_size * (params['pageno'] - 1),
|
||||
"size": page_size,
|
||||
}
|
||||
params['url'] = f"{base_url}/api/search/v3/catalog/search?{urlencode(args)}"
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
'''post-response callback
|
||||
resp: requests response object
|
||||
'''
|
||||
results = []
|
||||
json_resp = resp.json()
|
||||
|
||||
for item in json_resp.get("results", []):
|
||||
image_source = item.get("source")
|
||||
is_official = image_source in ["store", "official"]
|
||||
|
||||
popularity_infos = [f"{item.get('star_count', 0)} stars"]
|
||||
|
||||
architectures = []
|
||||
for rate_plan in item.get("rate_plans", []):
|
||||
pull_count = rate_plan.get("repositories", [{}])[0].get("pull_count")
|
||||
if pull_count:
|
||||
popularity_infos.insert(0, f"{pull_count} pulls")
|
||||
architectures.extend(arch['name'] for arch in rate_plan.get("architectures", []) if arch['name'])
|
||||
|
||||
result = {
|
||||
'template': 'packages.html',
|
||||
'url': base_url + ("/_/" if is_official else "/r/") + item.get("slug", ""),
|
||||
'title': item.get("name"),
|
||||
'content': item.get("short_description"),
|
||||
'thumbnail': item["logo_url"].get("large") or item["logo_url"].get("small"),
|
||||
'package_name': item.get("name"),
|
||||
'maintainer': item["publisher"].get("name"),
|
||||
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
|
||||
'popularity': ', '.join(popularity_infos),
|
||||
'tags': architectures,
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
87
searx/engines/doku.py
Normal file
87
searx/engines/doku.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Doku Wiki
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from urllib.parse import urljoin
|
||||
from lxml.html import fromstring
|
||||
from searx.utils import extract_text, eval_xpath
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.dokuwiki.org/',
|
||||
"wikidata_id": 'Q851864',
|
||||
"official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general'] # 'images', 'music', 'videos', 'files'
|
||||
paging = False
|
||||
number_of_results = 5
|
||||
|
||||
# search-url
|
||||
# Doku is OpenSearch compatible
|
||||
base_url = 'http://localhost:8090'
|
||||
search_url = (
|
||||
# fmt: off
|
||||
'/?do=search'
|
||||
'&{query}'
|
||||
# fmt: on
|
||||
)
|
||||
# '&startRecord={offset}'
|
||||
# '&maximumRecords={limit}'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
|
||||
params['url'] = base_url + search_url.format(query=urlencode({'id': query}))
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
doc = fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
# Quickhits
|
||||
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
|
||||
try:
|
||||
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
|
||||
except: # pylint: disable=bare-except
|
||||
continue
|
||||
|
||||
if not res_url:
|
||||
continue
|
||||
|
||||
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
|
||||
|
||||
# append result
|
||||
results.append({'title': title, 'content': "", 'url': urljoin(base_url, res_url)})
|
||||
|
||||
# Search results
|
||||
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
|
||||
try:
|
||||
if r.tag == "dt":
|
||||
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
|
||||
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
|
||||
elif r.tag == "dd":
|
||||
content = extract_text(eval_xpath(r, '.'))
|
||||
|
||||
# append result
|
||||
results.append({'title': title, 'content': content, 'url': urljoin(base_url, res_url)})
|
||||
except: # pylint: disable=bare-except
|
||||
continue
|
||||
|
||||
if not res_url:
|
||||
continue
|
||||
|
||||
# return results
|
||||
return results
|
||||
496
searx/engines/duckduckgo.py
Normal file
496
searx/engines/duckduckgo.py
Normal file
@@ -0,0 +1,496 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo WEB
|
||||
~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import typing
|
||||
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
import babel
|
||||
import lxml.html
|
||||
|
||||
from searx import (
|
||||
locales,
|
||||
external_bang,
|
||||
)
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_getindex,
|
||||
extr,
|
||||
extract_text,
|
||||
)
|
||||
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.enginelib import EngineCache
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
about = {
|
||||
"website": 'https://lite.duckduckgo.com/lite/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
"""DuckDuckGo-Lite tries to guess user's preferred language from the HTTP
|
||||
``Accept-Language`` header. Optionally the user can select a region filter (but not a
|
||||
language).
|
||||
"""
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
safesearch = True # user can't select but the results are filtered
|
||||
|
||||
url = "https://html.duckduckgo.com/html/"
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
||||
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
|
||||
|
||||
_CACHE: EngineCache = None # type: ignore
|
||||
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
|
||||
seconds."""
|
||||
|
||||
|
||||
def get_cache():
|
||||
global _CACHE # pylint: disable=global-statement
|
||||
if _CACHE is None:
|
||||
_CACHE = EngineCache("duckduckgo") # type:ignore
|
||||
return _CACHE
|
||||
|
||||
|
||||
def get_vqd(query: str, region: str, force_request: bool = False) -> str:
|
||||
"""Returns the ``vqd`` that fits to the *query*.
|
||||
|
||||
:param query: The query term
|
||||
:param region: DDG's region code
|
||||
:param force_request: force a request to get a vqd value from DDG
|
||||
|
||||
TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
|
||||
by all requests to DDG:
|
||||
|
||||
- DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data)
|
||||
- DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
|
||||
- DuckDuckGo Images: ``https://duckduckgo.com/i.js??q=...&vqd=...``
|
||||
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...``
|
||||
- DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
|
||||
|
||||
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
|
||||
(such as extremely long search terms that are often sent by bots), no ``vqd``
|
||||
value can be determined.
|
||||
|
||||
If SearXNG cannot determine a ``vqd`` value, then no request should go out
|
||||
to DDG.
|
||||
|
||||
.. attention::
|
||||
|
||||
A request with a wrong ``vqd`` value leads to DDG temporarily putting
|
||||
SearXNG's IP on a block list.
|
||||
|
||||
Requests from IPs in this block list run into timeouts. Not sure, but it
|
||||
seems the block list is a sliding window: to get my IP off the bot list
|
||||
I had to cool down my IP for 1h (send no requests from that IP to DDG).
|
||||
"""
|
||||
cache = get_cache()
|
||||
key = cache.secret_hash(f"{query}//{region}")
|
||||
value = cache.get(key=key)
|
||||
if value is not None and not force_request:
|
||||
logger.debug("vqd: re-use cached value: %s", value)
|
||||
return value
|
||||
|
||||
logger.debug("vqd: request value from from duckduckgo.com")
|
||||
resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
|
||||
if resp.status_code == 200: # type: ignore
|
||||
value = extr(resp.text, 'vqd="', '"') # type: ignore
|
||||
if value:
|
||||
logger.debug("vqd value from duckduckgo.com request: '%s'", value)
|
||||
else:
|
||||
logger.error("vqd: can't parse value from ddg response (return empty string)")
|
||||
return ""
|
||||
else:
|
||||
logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
|
||||
|
||||
if value:
|
||||
cache.set(key=key, value=value)
|
||||
else:
|
||||
logger.error("vqd value from duckduckgo.com ", resp.status_code)
|
||||
return value
|
||||
|
||||
|
||||
def set_vqd(query: str, region: str, value: str):
|
||||
cache = get_cache()
|
||||
key = cache.secret_hash(f"{query}//{region}")
|
||||
cache.set(key=key, value=value, expire=3600)
|
||||
|
||||
|
||||
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
|
||||
"""Get DuckDuckGo's language identifier from SearXNG's locale.
|
||||
|
||||
DuckDuckGo defines its languages by region codes (see
|
||||
:py:obj:`fetch_traits`).
|
||||
|
||||
To get region and language of a DDG service use:
|
||||
|
||||
.. code: python
|
||||
|
||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
It might be confusing, but the ``l`` value of the cookie is what SearXNG calls
|
||||
the *region*:
|
||||
|
||||
.. code:: python
|
||||
|
||||
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
|
||||
params['cookies']['ad'] = eng_lang
|
||||
params['cookies']['ah'] = eng_region
|
||||
params['cookies']['l'] = eng_region
|
||||
|
||||
.. hint::
|
||||
|
||||
`DDG-lite <https://lite.duckduckgo.com/lite>`__ and the *no Javascript*
|
||||
page https://html.duckduckgo.com/html do not offer a language selection
|
||||
to the user, only a region can be selected by the user (``eng_region``
|
||||
from the example above). DDG-lite and *no Javascript* store the selected
|
||||
region in a cookie::
|
||||
|
||||
params['cookies']['kl'] = eng_region # 'ar-es'
|
||||
|
||||
"""
|
||||
return eng_traits.custom['lang_region'].get( # type: ignore
|
||||
sxng_locale, eng_traits.get_language(sxng_locale, default)
|
||||
)
|
||||
|
||||
|
||||
ddg_reg_map = {
|
||||
'tw-tzh': 'zh_TW',
|
||||
'hk-tzh': 'zh_HK',
|
||||
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
|
||||
'es-ca': 'ca_ES',
|
||||
'id-en': 'id_ID',
|
||||
'no-no': 'nb_NO',
|
||||
'jp-jp': 'ja_JP',
|
||||
'kr-kr': 'ko_KR',
|
||||
'xa-ar': 'ar_SA',
|
||||
'sl-sl': 'sl_SI',
|
||||
'th-en': 'th_TH',
|
||||
'vn-en': 'vi_VN',
|
||||
}
|
||||
|
||||
ddg_lang_map = {
|
||||
# use ar --> ar_EG (Egypt's arabic)
|
||||
"ar_DZ": 'lang_region',
|
||||
"ar_JO": 'lang_region',
|
||||
"ar_SA": 'lang_region',
|
||||
# use bn --> bn_BD
|
||||
'bn_IN': 'lang_region',
|
||||
# use de --> de_DE
|
||||
'de_CH': 'lang_region',
|
||||
# use en --> en_US,
|
||||
'en_AU': 'lang_region',
|
||||
'en_CA': 'lang_region',
|
||||
'en_GB': 'lang_region',
|
||||
# Esperanto
|
||||
'eo_XX': 'eo',
|
||||
# use es --> es_ES,
|
||||
'es_AR': 'lang_region',
|
||||
'es_CL': 'lang_region',
|
||||
'es_CO': 'lang_region',
|
||||
'es_CR': 'lang_region',
|
||||
'es_EC': 'lang_region',
|
||||
'es_MX': 'lang_region',
|
||||
'es_PE': 'lang_region',
|
||||
'es_UY': 'lang_region',
|
||||
'es_VE': 'lang_region',
|
||||
# use fr --> fr_FR
|
||||
'fr_CA': 'lang_region',
|
||||
'fr_CH': 'lang_region',
|
||||
'fr_BE': 'lang_region',
|
||||
# use nl --> nl_NL
|
||||
'nl_BE': 'lang_region',
|
||||
# use pt --> pt_PT
|
||||
'pt_BR': 'lang_region',
|
||||
# skip these languages
|
||||
'od_IN': 'skip',
|
||||
'io_XX': 'skip',
|
||||
'tokipona_XX': 'skip',
|
||||
}
|
||||
|
||||
|
||||
def quote_ddg_bangs(query):
|
||||
# quote ddg bangs
|
||||
query_parts = []
|
||||
|
||||
# for val in re.split(r'(\s+)', query):
|
||||
for val in re.split(r'(\s+)', query):
|
||||
if not val.strip():
|
||||
continue
|
||||
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
|
||||
val = f"'{val}'"
|
||||
query_parts.append(val)
|
||||
return ' '.join(query_parts)
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query = quote_ddg_bangs(query)
|
||||
|
||||
if len(query) >= 500:
|
||||
# DDG does not accept queries with more than 499 chars
|
||||
params["url"] = None
|
||||
return
|
||||
|
||||
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
|
||||
# Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
|
||||
# (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
|
||||
# and breaking changes in the future.
|
||||
#
|
||||
# The params['data'] dictionary can have the following key parameters, in this order:
|
||||
# - q (str): Search query string
|
||||
# - b (str): Beginning parameter - empty string for first page requests
|
||||
# - s (int): Search offset for pagination
|
||||
# - nextParams (str): Continuation parameters from previous page response, typically empty
|
||||
# - v (str): Typically 'l' for subsequent pages
|
||||
# - o (str): Output format, typically 'json'
|
||||
# - dc (int): Display count - value equal to offset (s) + 1
|
||||
# - api (str): API endpoint identifier, typically 'd.js'
|
||||
# - vqd (str): Validation query digest
|
||||
# - kl (str): Keyboard language/region code (e.g., 'en-us')
|
||||
# - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
|
||||
|
||||
params['data']['q'] = query
|
||||
|
||||
if params['pageno'] == 1:
|
||||
params['data']['b'] = ""
|
||||
elif params['pageno'] >= 2:
|
||||
offset = 10 + (params['pageno'] - 2) * 15 # Page 2 = 10, Page 3+ = 10 + n*15
|
||||
params['data']['s'] = offset
|
||||
params['data']['nextParams'] = form_data.get('nextParams', '')
|
||||
params['data']['v'] = form_data.get('v', 'l')
|
||||
params['data']['o'] = form_data.get('o', 'json')
|
||||
params['data']['dc'] = offset + 1
|
||||
params['data']['api'] = form_data.get('api', 'd.js')
|
||||
|
||||
# vqd is required to request other pages after the first one
|
||||
vqd = get_vqd(query, eng_region, force_request=False)
|
||||
if vqd:
|
||||
params['data']['vqd'] = vqd
|
||||
else:
|
||||
# Don't try to call follow up pages without a vqd value.
|
||||
# DDG recognizes this as a request from a bot. This lowers the
|
||||
# reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
|
||||
params["url"] = None
|
||||
return
|
||||
|
||||
if params['searxng_locale'].startswith("zh"):
|
||||
# Some locales (at least China) do not have a "next page" button and DDG
|
||||
# will return a HTTP/2 403 Forbidden for a request of such a page.
|
||||
params["url"] = None
|
||||
return
|
||||
|
||||
# Put empty kl in form data if language/region set to all
|
||||
if eng_region == "wt-wt":
|
||||
params['data']['kl'] = ""
|
||||
else:
|
||||
params['data']['kl'] = eng_region
|
||||
|
||||
params['data']['df'] = ''
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['data']['df'] = time_range_dict[params['time_range']]
|
||||
params['cookies']['df'] = time_range_dict[params['time_range']]
|
||||
|
||||
params['cookies']['kl'] = eng_region
|
||||
|
||||
params['url'] = url
|
||||
params['method'] = 'POST'
|
||||
|
||||
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
|
||||
params['headers']['Referer'] = url
|
||||
params['headers']['Sec-Fetch-Dest'] = "document"
|
||||
params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
|
||||
params['headers']['Sec-Fetch-Site'] = "same-origin"
|
||||
params['headers']['Sec-Fetch-User'] = "?1"
|
||||
|
||||
logger.debug("param headers: %s", params['headers'])
|
||||
logger.debug("param data: %s", params['data'])
|
||||
logger.debug("param cookies: %s", params['cookies'])
|
||||
|
||||
|
||||
def is_ddg_captcha(dom):
|
||||
"""In case of CAPTCHA ddg response its own *not a Robot* dialog and is not
|
||||
redirected to a CAPTCHA page."""
|
||||
|
||||
return bool(eval_xpath(dom, "//form[@id='challenge-form']"))
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
results = EngineResults()
|
||||
|
||||
if resp.status_code == 303:
|
||||
return results
|
||||
|
||||
doc = lxml.html.fromstring(resp.text)
|
||||
|
||||
if is_ddg_captcha(doc):
|
||||
# setting suspend time to zero is OK --> ddg does not block the IP
|
||||
raise SearxEngineCaptchaException(suspended_time=0, message=f"CAPTCHA ({resp.search_params['data'].get('kl')})")
|
||||
|
||||
form = eval_xpath(doc, '//input[@name="vqd"]/..')
|
||||
if len(form):
|
||||
# some locales (at least China) do not have a "next page" button
|
||||
form = form[0]
|
||||
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
|
||||
set_vqd(
|
||||
query=resp.search_params['data']['q'],
|
||||
region=resp.search_params['data']['kl'],
|
||||
value=str(form_vqd),
|
||||
)
|
||||
|
||||
# just select "web-result" and ignore results of class "result--ad result--ad--small"
|
||||
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
|
||||
|
||||
item = {}
|
||||
title = eval_xpath(div_result, './/h2/a')
|
||||
if not title:
|
||||
# this is the "No results." item in the result list
|
||||
continue
|
||||
item["title"] = extract_text(title)
|
||||
item["url"] = eval_xpath(div_result, './/h2/a/@href')[0]
|
||||
item["content"] = extract_text(
|
||||
eval_xpath_getindex(div_result, './/a[contains(@class, "result__snippet")]', 0, [])
|
||||
)
|
||||
results.append(item)
|
||||
|
||||
zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
|
||||
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() # type: ignore
|
||||
|
||||
if zero_click and (
|
||||
"Your IP address is" not in zero_click
|
||||
and "Your user agent:" not in zero_click
|
||||
and "URL Decoded:" not in zero_click
|
||||
):
|
||||
results.add(
|
||||
results.types.Answer(
|
||||
answer=zero_click,
|
||||
url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0), # type: ignore
|
||||
)
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages & regions from DuckDuckGo.
|
||||
|
||||
SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
|
||||
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
|
||||
sense in a SearXNG request since SearXNG's ``all`` will not add a
|
||||
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
|
||||
is ``wt-wt`` (the region).
|
||||
|
||||
Besides regions, DuckDuckGo also defines its languages by region codes. For
|
||||
example, these are the English languages in DuckDuckGo:
|
||||
|
||||
- en_US
|
||||
- en_AU
|
||||
- en_CA
|
||||
- en_GB
|
||||
|
||||
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
|
||||
SearXNG's locale.
|
||||
|
||||
"""
|
||||
# pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel
|
||||
from searx.utils import js_variable_to_python
|
||||
|
||||
# fetch regions
|
||||
|
||||
engine_traits.all_locale = 'wt-wt'
|
||||
|
||||
# updated from u661.js to u.7669f071a13a7daa57cb / should be updated automatically?
|
||||
resp = get('https://duckduckgo.com/dist/util/u.7669f071a13a7daa57cb.js')
|
||||
|
||||
if not resp.ok: # type: ignore
|
||||
print("ERROR: response from DuckDuckGo is not OK.")
|
||||
|
||||
js_code = extr(resp.text, 'regions:', ',snippetLengths') # type: ignore
|
||||
|
||||
regions = json.loads(js_code)
|
||||
for eng_tag, name in regions.items():
|
||||
|
||||
if eng_tag == 'wt-wt':
|
||||
engine_traits.all_locale = 'wt-wt'
|
||||
continue
|
||||
|
||||
region = ddg_reg_map.get(eng_tag)
|
||||
if region == 'skip':
|
||||
continue
|
||||
|
||||
if not region:
|
||||
eng_territory, eng_lang = eng_tag.split('-')
|
||||
region = eng_lang + '_' + eng_territory.upper()
|
||||
|
||||
try:
|
||||
sxng_tag = locales.region_tag(babel.Locale.parse(region))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
|
||||
continue
|
||||
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_tag
|
||||
|
||||
# fetch languages
|
||||
|
||||
engine_traits.custom['lang_region'] = {}
|
||||
|
||||
js_code = extr(resp.text, 'languages:', ',regions') # type: ignore
|
||||
|
||||
languages = js_variable_to_python(js_code)
|
||||
for eng_lang, name in languages.items():
|
||||
|
||||
if eng_lang == 'wt_WT':
|
||||
continue
|
||||
|
||||
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
|
||||
if babel_tag == 'skip':
|
||||
continue
|
||||
|
||||
try:
|
||||
|
||||
if babel_tag == 'lang_region':
|
||||
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
|
||||
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
|
||||
continue
|
||||
|
||||
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
|
||||
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
|
||||
continue
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = eng_lang
|
||||
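The pagination in request() above is irregular: page 2 starts at offset 10 and every later page adds 15 results. A small sketch of that offset formula and the dc value derived from it (the helper is ours, not part of the engine):

def ddg_offset(pageno: int) -> int:
    """Result offset sent as 's' in the html.duckduckgo.com form data."""
    if pageno <= 1:
        return 0
    # page 2 = 10, page 3 = 25, page 4 = 40, ...
    return 10 + (pageno - 2) * 15

# 'dc' (display count) is always offset + 1
assert [(p, ddg_offset(p), ddg_offset(p) + 1) for p in (2, 3, 4)] == [(2, 10, 11), (3, 25, 26), (4, 40, 41)]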
264
searx/engines/duckduckgo_definitions.py
Normal file
264
searx/engines/duckduckgo_definitions.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo Instant Answer API
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
|
||||
reverse engineering we can see that some services (e.g. instant answers) are still
|
||||
in use by the DDG search engine.
|
||||
|
||||
As far as we can say, the *instant answers* API does not support languages, or at
|
||||
least we could not find out how language support should work. It seems that
|
||||
most of the features are based on English terms.
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode, urlparse, urljoin
|
||||
from lxml import html
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx.utils import extract_text, html_to_text, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"official_api_documentation": 'https://duckduckgo.com/api',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
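# For illustration only: request() below urlencodes the search term into the
# ``{query}`` slot, so a search for "python" ends up requesting
#   https://api.duckduckgo.com/?q=python&format=json&pretty=0&no_redirect=1&d=1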
|
||||
|
||||
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
|
||||
|
||||
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
|
||||
|
||||
|
||||
def is_broken_text(text):
|
||||
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
|
||||
|
||||
The href URL is broken, the "Related website" may contains some HTML.
|
||||
|
||||
The best solution seems to ignore these results.
|
||||
"""
|
||||
return text.startswith('http') and ' ' in text
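# Minimal illustration (made-up inputs) of what is_broken_text() filters out:
#
#   is_broken_text('http://somewhere Related website')  # True  -> result is dropped
#   is_broken_text('Related website')                   # False -> result is kept
#   is_broken_text('http://somewhere')                  # False -> result is kept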
|
||||
|
||||
|
||||
def result_to_text(text, htmlResult):
|
||||
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
|
||||
result = None
|
||||
dom = html.fromstring(htmlResult)
|
||||
a = dom.xpath('//a')
|
||||
if len(a) >= 1:
|
||||
result = extract_text(a[0])
|
||||
else:
|
||||
result = text
|
||||
if not is_broken_text(result):
|
||||
return result
|
||||
return None
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = URL.format(query=urlencode({'q': query}))
|
||||
return params
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
|
||||
results = EngineResults()
|
||||
|
||||
search_res = resp.json()
|
||||
|
||||
# search_res.get('Entity') possible values (not exhaustive) :
|
||||
# * continent / country / department / location / waterfall
|
||||
# * actor / musician / artist
|
||||
# * book / performing art / film / television / media franchise / concert tour / playwright
|
||||
# * prepared food
|
||||
# * website / software / os / programming language / file format / software engineer
|
||||
# * company
|
||||
|
||||
content = ''
|
||||
heading = search_res.get('Heading', '')
|
||||
attributes = []
|
||||
urls = []
|
||||
infobox_id = None
|
||||
relatedTopics = []
|
||||
|
||||
# add answer if there is one
|
||||
answer = search_res.get('Answer', '')
|
||||
if answer:
|
||||
answer_type = search_res.get('AnswerType')
|
||||
logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
|
||||
if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
|
||||
results.add(
|
||||
results.types.Answer(
|
||||
answer=html_to_text(answer),
|
||||
url=search_res.get('AbstractURL', ''),
|
||||
)
|
||||
)
|
||||
|
||||
# add infobox
|
||||
if 'Definition' in search_res:
|
||||
content = content + search_res.get('Definition', '')
|
||||
|
||||
if 'Abstract' in search_res:
|
||||
content = content + search_res.get('Abstract', '')
|
||||
|
||||
# image
|
||||
image = search_res.get('Image')
|
||||
image = None if image == '' else image
|
||||
if image is not None and urlparse(image).netloc == '':
|
||||
image = urljoin('https://duckduckgo.com', image)
|
||||
|
||||
# urls
|
||||
# Official website, Wikipedia page
|
||||
for ddg_result in search_res.get('Results', []):
|
||||
firstURL = ddg_result.get('FirstURL')
|
||||
text = ddg_result.get('Text')
|
||||
if firstURL is not None and text is not None:
|
||||
urls.append({'title': text, 'url': firstURL})
|
||||
results.append({'title': heading, 'url': firstURL})
|
||||
|
||||
# related topics
|
||||
for ddg_result in search_res.get('RelatedTopics', []):
|
||||
if 'FirstURL' in ddg_result:
|
||||
firstURL = ddg_result.get('FirstURL')
|
||||
text = ddg_result.get('Text')
|
||||
if not is_broken_text(text):
|
||||
suggestion = result_to_text(text, ddg_result.get('Result'))
|
||||
if suggestion != heading and suggestion is not None:
|
||||
results.append({'suggestion': suggestion})
|
||||
elif 'Topics' in ddg_result:
|
||||
suggestions = []
|
||||
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
|
||||
for topic_result in ddg_result.get('Topics', []):
|
||||
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
|
||||
if suggestion != heading and suggestion is not None:
|
||||
suggestions.append(suggestion)
|
||||
|
||||
# abstract
|
||||
abstractURL = search_res.get('AbstractURL', '')
|
||||
if abstractURL != '':
|
||||
# add as result ? problem always in english
|
||||
infobox_id = abstractURL
|
||||
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
|
||||
results.append({'url': abstractURL, 'title': heading})
|
||||
|
||||
# definition
|
||||
definitionURL = search_res.get('DefinitionURL', '')
|
||||
if definitionURL != '':
|
||||
# add as result ? as answer ? problem always in english
|
||||
infobox_id = definitionURL
|
||||
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
|
||||
|
||||
# to merge with wikidata's infobox
|
||||
if infobox_id:
|
||||
infobox_id = replace_http_by_https(infobox_id)
|
||||
|
||||
# attributes
|
||||
# some will be converted to urls
|
||||
if 'Infobox' in search_res:
|
||||
infobox = search_res.get('Infobox')
|
||||
if 'content' in infobox:
|
||||
osm_zoom = 17
|
||||
coordinates = None
|
||||
for info in infobox.get('content'):
|
||||
data_type = info.get('data_type')
|
||||
data_label = info.get('label')
|
||||
data_value = info.get('value')
|
||||
|
||||
# Workaround: ddg may return a double quote
|
||||
if data_value == '""':
|
||||
continue
|
||||
|
||||
# Is it an external URL ?
|
||||
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
|
||||
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
|
||||
# * netflix_id
|
||||
external_url = get_external_url(data_type, data_value)
|
||||
if external_url is not None:
|
||||
urls.append({'title': data_label, 'url': external_url})
|
||||
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
|
||||
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
|
||||
# ignore wiki_maps_trigger: reference to a javascript
|
||||
# ignore google_play_artist_id: service shutdown
|
||||
pass
|
||||
elif data_type == 'string' and data_label == 'Website':
|
||||
# There is already an URL for the website
|
||||
pass
|
||||
elif data_type == 'area':
|
||||
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
|
||||
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
|
||||
elif data_type == 'coordinates':
|
||||
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
|
||||
# coordinate on Earth
|
||||
# get the zoom information from the area
|
||||
coordinates = info
|
||||
else:
|
||||
# coordinate NOT on Earth
|
||||
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
|
||||
elif data_type == 'string':
|
||||
attributes.append({'label': data_label, 'value': data_value})
|
||||
|
||||
if coordinates:
|
||||
data_label = coordinates.get('label')
|
||||
data_value = coordinates.get('value')
|
||||
latitude = data_value.get('latitude')
|
||||
longitude = data_value.get('longitude')
|
||||
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
|
||||
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
|
||||
|
||||
if len(heading) > 0:
|
||||
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
|
||||
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
|
||||
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
|
||||
else:
|
||||
results.append(
|
||||
{
|
||||
'infobox': heading,
|
||||
'id': infobox_id,
|
||||
'content': content,
|
||||
'img_src': image,
|
||||
'attributes': attributes,
|
||||
'urls': urls,
|
||||
'relatedTopics': relatedTopics,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def unit_to_str(unit):
|
||||
for prefix in WIKIDATA_PREFIX:
|
||||
if unit.startswith(prefix):
|
||||
wikidata_entity = unit[len(prefix) :]
|
||||
real_unit = WIKIDATA_UNITS.get(wikidata_entity)
|
||||
if real_unit is None:
|
||||
return unit
|
||||
return real_unit['symbol']
|
||||
return unit
|
||||
|
||||
|
||||
def area_to_str(area):
|
||||
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
|
||||
unit = unit_to_str(area.get('unit'))
|
||||
if unit is not None:
|
||||
try:
|
||||
amount = float(area.get('amount'))
|
||||
return '{} {}'.format(amount, unit)
|
||||
except ValueError:
|
||||
pass
|
||||
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
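# Example (the unit symbol depends on WIKIDATA_UNITS; the shown value is an assumption):
#
#   area_to_str({'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'})
#   # -> something like '20.99 km²'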
|
||||
149
searx/engines/duckduckgo_extra.py
Normal file
@@ -0,0 +1,149 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo Extra (images, videos, news)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import get_embeded_stream_url, html_to_text
|
||||
|
||||
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON (site requires js to get images)',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images', 'web']
|
||||
ddg_category = 'images'
|
||||
"""The category must be any of ``images``, ``videos`` and ``news``
|
||||
"""
|
||||
paging = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
|
||||
safesearch_args = {0: '1', 1: None, 2: '1'}
|
||||
|
||||
search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
|
||||
# request needs a vqd argument
|
||||
vqd = get_vqd(query, eng_region, force_request=True)
|
||||
|
||||
if not vqd:
|
||||
# some search terms do not have results and therefore no vqd value
|
||||
params['url'] = None
|
||||
return params
|
||||
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
args = {
|
||||
'q': query,
|
||||
'o': 'json',
|
||||
# 'u': 'bing',
|
||||
'l': eng_region,
|
||||
'f': ',,,,,',
|
||||
'vqd': vqd,
|
||||
}
|
||||
|
||||
if params['pageno'] > 1:
|
||||
args['s'] = (params['pageno'] - 1) * 100
|
||||
|
||||
params['cookies']['ad'] = eng_lang # zh_CN
|
||||
params['cookies']['ah'] = eng_region # "us-en,de-de"
|
||||
params['cookies']['l'] = eng_region # "hk-tzh"
|
||||
|
||||
safe_search = safesearch_cookies.get(params['safesearch'])
|
||||
if safe_search is not None:
|
||||
params['cookies']['p'] = safe_search # "-2", "1"
|
||||
safe_search = safesearch_args.get(params['safesearch'])
|
||||
if safe_search is not None:
|
||||
args['p'] = safe_search # "-1", "1"
|
||||
|
||||
logger.debug("cookies: %s", params['cookies'])
|
||||
|
||||
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
|
||||
|
||||
# sending these two headers prevents rate limiting for the query
|
||||
params['headers'] = {
|
||||
'Referer': 'https://duckduckgo.com/',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
}
|
||||
|
||||
return params
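# For illustration only: with ddg_category = 'images' and a "us-en" region the
# request URL built above looks roughly like
#   https://duckduckgo.com/i.js?q=<query>&o=json&l=us-en&f=,,,,,&vqd=<vqd token>
# where the vqd token is the per-query value returned by get_vqd().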
|
||||
|
||||
|
||||
def _image_result(result):
|
||||
return {
|
||||
'template': 'images.html',
|
||||
'url': result['url'],
|
||||
'title': result['title'],
|
||||
'content': '',
|
||||
'thumbnail_src': result['thumbnail'],
|
||||
'img_src': result['image'],
|
||||
'resolution': '%s x %s' % (result['width'], result['height']),
|
||||
'source': result['source'],
|
||||
}
|
||||
|
||||
|
||||
def _video_result(result):
|
||||
return {
|
||||
'template': 'videos.html',
|
||||
'url': result['content'],
|
||||
'title': result['title'],
|
||||
'content': result['description'],
|
||||
'thumbnail': result['images'].get('small') or result['images'].get('medium'),
|
||||
'iframe_src': get_embeded_stream_url(result['content']),
|
||||
'source': result['provider'],
|
||||
'length': result['duration'],
|
||||
'metadata': result.get('uploader'),
|
||||
}
|
||||
|
||||
|
||||
def _news_result(result):
|
||||
return {
|
||||
'url': result['url'],
|
||||
'title': result['title'],
|
||||
'content': html_to_text(result['excerpt']),
|
||||
'source': result['source'],
|
||||
'publishedDate': datetime.fromtimestamp(result['date']),
|
||||
}
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
res_json = resp.json()
|
||||
|
||||
for result in res_json['results']:
|
||||
if ddg_category == 'images':
|
||||
results.append(_image_result(result))
|
||||
elif ddg_category == 'videos':
|
||||
results.append(_video_result(result))
|
||||
elif ddg_category == 'news':
|
||||
results.append(_news_result(result))
|
||||
else:
|
||||
raise ValueError(f"Invalid duckduckgo category: {ddg_category}")
|
||||
|
||||
return results
|
||||
158
searx/engines/duckduckgo_weather.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo Weather
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from json import loads
|
||||
from urllib.parse import quote
|
||||
|
||||
from dateutil import parser as date_parser
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.duckduckgo import get_ddg_lang
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
# engine dependent config
|
||||
categories = ["weather"]
|
||||
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
||||
|
||||
|
||||
def generate_condition_table(condition):
|
||||
res = ""
|
||||
|
||||
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['conditionCode']}</b></td></tr>"
|
||||
|
||||
res += (
|
||||
f"<tr><td><b>{gettext('Temperature')}</b></td>"
|
||||
f"<td><b>{condition['temperature']}°C / {c_to_f(condition['temperature'])}°F</b></td></tr>"
|
||||
)
|
||||
|
||||
res += (
|
||||
f"<tr><td>{gettext('Feels like')}</td><td>{condition['temperatureApparent']}°C / "
|
||||
f"{c_to_f(condition['temperatureApparent'])}°F</td></tr>"
|
||||
)
|
||||
|
||||
res += (
|
||||
f"<tr><td>{gettext('Wind')}</td><td>{condition['windDirection']}° — "
|
||||
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
|
||||
)
|
||||
|
||||
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} m</td>"
|
||||
|
||||
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def generate_day_table(day):
|
||||
res = ""
|
||||
|
||||
res += (
|
||||
f"<tr><td>{gettext('Min temp.')}</td><td>{day['temperatureMin']}°C / "
|
||||
f"{c_to_f(day['temperatureMin'])}°F</td></tr>"
|
||||
)
|
||||
res += (
|
||||
f"<tr><td>{gettext('Max temp.')}</td><td>{day['temperatureMax']}°C / "
|
||||
f"{c_to_f(day['temperatureMax'])}°F</td></tr>"
|
||||
)
|
||||
res += f"<tr><td>{gettext('UV index')}</td><td>{day['maxUvIndex']}</td></tr>"
|
||||
res += f"<tr><td>{gettext('Sunrise')}</td><td>{date_parser.parse(day['sunrise']).strftime('%H:%M')}</td></tr>"
|
||||
res += f"<tr><td>{gettext('Sunset')}</td><td>{date_parser.parse(day['sunset']).strftime('%H:%M')}</td></tr>"
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
|
||||
params['cookies']['ad'] = eng_lang
|
||||
params['cookies']['ah'] = eng_region
|
||||
params['cookies']['l'] = eng_region
|
||||
logger.debug("cookies: %s", params['cookies'])
|
||||
|
||||
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
|
||||
return params
|
||||
|
||||
|
||||
def c_to_f(temperature):
|
||||
return "%.2f" % ((temperature * 1.8) + 32)
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
if resp.text.strip() == "ddg_spice_forecast();":
|
||||
return []
|
||||
|
||||
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
|
||||
|
||||
current = result["currentWeather"]
|
||||
|
||||
title = result['location']
|
||||
|
||||
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
|
||||
|
||||
infobox += generate_condition_table(current)
|
||||
|
||||
infobox += "</tbody></table>"
|
||||
|
||||
last_date = None
|
||||
|
||||
for time in result['forecastHourly']['hours']:
|
||||
current_time = date_parser.parse(time['forecastStart'])
|
||||
|
||||
if last_date != current_time.date():
|
||||
if last_date is not None:
|
||||
infobox += "</tbody></table>"
|
||||
|
||||
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
|
||||
|
||||
infobox += "<table><tbody>"
|
||||
|
||||
for day in result['forecastDaily']['days']:
|
||||
if date_parser.parse(day['forecastStart']).date() == current_time.date():
|
||||
infobox += generate_day_table(day)
|
||||
|
||||
infobox += "</tbody></table><table><tbody>"
|
||||
|
||||
last_date = current_time.date()
|
||||
|
||||
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
|
||||
|
||||
infobox += generate_condition_table(time)
|
||||
|
||||
infobox += "</tbody></table>"
|
||||
|
||||
results.append(
|
||||
{
|
||||
"infobox": title,
|
||||
"content": infobox,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
71
searx/engines/duden.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Duden
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
from urllib.parse import quote, urljoin
|
||||
from lxml import html
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.network import raise_for_httperror
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.duden.de',
|
||||
"wikidata_id": 'Q73624591',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
"language": 'de',
|
||||
}
|
||||
|
||||
categories = ['dictionaries']
|
||||
paging = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.duden.de/'
|
||||
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
offset = params['pageno'] - 1
|
||||
if offset == 0:
|
||||
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
|
||||
params['url'] = search_url_fmt.format(query=quote(query))
|
||||
else:
|
||||
params['url'] = search_url.format(offset=offset, query=quote(query))
|
||||
# after the last page of results, spelling corrections are returned after an HTTP redirect
|
||||
# whatever the page number is
|
||||
params['soft_max_redirects'] = 1
|
||||
params['raise_for_httperror'] = False
|
||||
return params
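# For illustration: a search for "Wort" requests
#   https://www.duden.de/suchen/dudenonline/Wort                                (page 1)
#   https://www.duden.de/suchen/dudenonline/Wort?search_api_fulltext=&page=1    (page 2)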
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
if resp.status_code == 404:
|
||||
return results
|
||||
|
||||
raise_for_httperror(resp)
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
number_of_results_element = eval_xpath_getindex(
|
||||
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
|
||||
)
|
||||
if number_of_results_element is not None:
|
||||
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
|
||||
results.append({'number_of_results': int(number_of_results_string)})
|
||||
|
||||
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
|
||||
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
|
||||
url = urljoin(base_url, url)
|
||||
title = eval_xpath(result, 'string(.//h2/a)').strip()
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
# append result
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
|
||||
return results
|
||||
23
searx/engines/dummy-offline.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=invalid-name
|
||||
"""Dummy Offline
|
||||
|
||||
"""
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
|
||||
def search(query, request_params): # pylint: disable=unused-argument
|
||||
return [
|
||||
{
|
||||
'result': 'this is what you get',
|
||||
}
|
||||
]
|
||||
24
searx/engines/dummy.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Dummy
|
||||
|
||||
"""
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": None,
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'empty array',
|
||||
}
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params): # pylint: disable=unused-argument
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp): # pylint: disable=unused-argument
|
||||
return []
|
||||
77
searx/engines/ebay.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Ebay (Shopping)
|
||||
"""
|
||||
|
||||
from urllib.parse import quote
|
||||
|
||||
from lxml import html
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.ebay.com',
|
||||
"wikidata_id": 'Q58024',
|
||||
"official_api_documentation": 'https://developer.ebay.com/',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['shopping']
|
||||
paging = True
|
||||
|
||||
# Set base_url in settings.yml in order to
|
||||
# have the desired local TLD.
|
||||
base_url = None
|
||||
search_url = '/sch/i.html?_nkw={query}&_sacat={pageno}'
|
||||
|
||||
results_xpath = '//li[contains(@class, "s-item")]'
|
||||
url_xpath = './/a[@class="s-item__link"]/@href'
|
||||
title_xpath = './/h3[@class="s-item__title"]'
|
||||
content_xpath = './/div[@span="SECONDARY_INFO"]'
|
||||
price_xpath = './/div[contains(@class, "s-item__detail")]/span[@class="s-item__price"][1]/text()'
|
||||
shipping_xpath = './/span[contains(@class, "s-item__shipping")]/text()'
|
||||
source_country_xpath = './/span[contains(@class, "s-item__location")]/text()'
|
||||
thumbnail_xpath = './/img[@class="s-item__image-img"]/@src'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = f'{base_url}' + search_url.format(query=quote(query), pageno=params['pageno'])
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
results_dom = dom.xpath(results_xpath)
|
||||
if not results_dom:
|
||||
return []
|
||||
|
||||
for result_dom in results_dom:
|
||||
url = extract_text(result_dom.xpath(url_xpath))
|
||||
title = extract_text(result_dom.xpath(title_xpath))
|
||||
content = extract_text(result_dom.xpath(content_xpath))
|
||||
price = extract_text(result_dom.xpath(price_xpath))
|
||||
shipping = extract_text(result_dom.xpath(shipping_xpath))
|
||||
source_country = extract_text(result_dom.xpath(source_country_xpath))
|
||||
thumbnail = extract_text(result_dom.xpath(thumbnail_xpath))
|
||||
|
||||
if title == "":
|
||||
continue
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'price': price,
|
||||
'shipping': shipping,
|
||||
'source_country': source_country,
|
||||
'thumbnail': thumbnail,
|
||||
'template': 'products.html',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
194
searx/engines/elasticsearch.py
Normal file
@@ -0,0 +1,194 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
""".. sidebar:: info
|
||||
|
||||
- :origin:`elasticsearch.py <searx/engines/elasticsearch.py>`
|
||||
- `Elasticsearch <https://www.elastic.co/elasticsearch/>`_
|
||||
- `Elasticsearch Guide
|
||||
<https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html>`_
|
||||
- `Install Elasticsearch
|
||||
<https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html>`_
|
||||
|
||||
Elasticsearch_ supports numerous ways to query the data it is storing. At the
|
||||
moment the engine supports the most popular search methods (``query_type``):
|
||||
|
||||
- ``match``,
|
||||
- ``simple_query_string``,
|
||||
- ``term`` and
|
||||
- ``terms``.
|
||||
|
||||
If none of the methods fit your use case, you can select ``custom`` query type
|
||||
and provide the JSON payload to submit to Elasticsearch in
|
||||
``custom_query_json``.
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
The following is an example configuration for an Elasticsearch_ instance with
|
||||
authentication configured to read from ``my-index`` index.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: elasticsearch
|
||||
shortcut: els
|
||||
engine: elasticsearch
|
||||
base_url: http://localhost:9200
|
||||
username: elastic
|
||||
password: changeme
|
||||
index: my-index
|
||||
query_type: match
|
||||
# custom_query_json: '{ ... }'
|
||||
enable_http: true
|
||||
|
||||
"""
|
||||
|
||||
from json import loads, dumps
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.result_types import EngineResults
|
||||
from searx.extended_types import SXNG_Response
|
||||
|
||||
categories = ['general']
|
||||
paging = True
|
||||
|
||||
about = {
|
||||
'website': 'https://www.elastic.co',
|
||||
'wikidata_id': 'Q3050461',
|
||||
'official_api_documentation': 'https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html',
|
||||
'use_official_api': True,
|
||||
'require_api_key': False,
|
||||
'format': 'JSON',
|
||||
}
|
||||
|
||||
base_url = 'http://localhost:9200'
|
||||
username = ''
|
||||
password = ''
|
||||
index = ''
|
||||
query_type = 'match'
|
||||
custom_query_json = {}
|
||||
show_metadata = False
|
||||
page_size = 10
|
||||
|
||||
|
||||
def init(engine_settings):
|
||||
if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types:
|
||||
raise ValueError('unsupported query type', engine_settings['query_type'])
|
||||
|
||||
if index == '':
|
||||
raise ValueError('index cannot be empty')
|
||||
|
||||
|
||||
def request(query, params):
|
||||
if query_type not in _available_query_types:
|
||||
return params
|
||||
|
||||
if username and password:
|
||||
params['auth'] = (username, password)
|
||||
|
||||
args = {
|
||||
'from': (params['pageno'] - 1) * page_size,
|
||||
'size': page_size,
|
||||
}
|
||||
data = _available_query_types[query_type](query)
|
||||
data.update(args)
|
||||
|
||||
params['url'] = f"{base_url}/{index}/_search"
|
||||
params['method'] = 'GET'
|
||||
params['data'] = dumps(data)
|
||||
params['headers']['Content-Type'] = 'application/json'
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def _match_query(query):
|
||||
"""
|
||||
The standard for full text queries.
|
||||
searx format: "key:value" e.g. city:berlin
|
||||
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
|
||||
"""
|
||||
|
||||
try:
|
||||
key, value = query.split(':')
|
||||
except Exception as e:
|
||||
raise ValueError('query format must be "key:value"') from e
|
||||
|
||||
return {"query": {"match": {key: {'query': value}}}}
|
||||
|
||||
|
||||
def _simple_query_string_query(query):
|
||||
"""
|
||||
Accepts query strings, but it is less strict than query_string
|
||||
The field used can be specified in index.query.default_field in Elasticsearch.
|
||||
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
|
||||
"""
|
||||
|
||||
return {'query': {'simple_query_string': {'query': query}}}
|
||||
|
||||
|
||||
def _term_query(query):
|
||||
"""
|
||||
Accepts one term and the name of the field.
|
||||
searx format: "key:value" e.g. city:berlin
|
||||
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
|
||||
"""
|
||||
|
||||
try:
|
||||
key, value = query.split(':')
|
||||
except Exception as e:
|
||||
raise ValueError('query format must be key:value') from e
|
||||
|
||||
return {'query': {'term': {key: value}}}
|
||||
|
||||
|
||||
def _terms_query(query):
|
||||
"""
|
||||
Accepts multiple terms and the name of the field.
|
||||
searx format: "key:value1,value2" e.g. city:berlin,paris
|
||||
REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
|
||||
"""
|
||||
|
||||
try:
|
||||
key, values = query.split(':')
|
||||
except Exception as e:
|
||||
raise ValueError('query format must be key:value1,value2') from e
|
||||
|
||||
return {'query': {'terms': {key: values.split(',')}}}
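# Illustration (made-up field names) of how the searx query string is translated
# into an Elasticsearch request body by the helpers above:
#
#   _match_query('city:berlin')       -> {'query': {'match': {'city': {'query': 'berlin'}}}}
#   _term_query('city:berlin')        -> {'query': {'term': {'city': 'berlin'}}}
#   _terms_query('city:berlin,paris') -> {'query': {'terms': {'city': ['berlin', 'paris']}}}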
|
||||
|
||||
|
||||
def _custom_query(query):
|
||||
key, value = query.split(':')
|
||||
custom_query = custom_query_json
|
||||
for query_key, query_value in custom_query.items():
|
||||
if query_key == '{{KEY}}':
|
||||
custom_query[key] = custom_query.pop(query_key)
|
||||
if query_value == '{{VALUE}}':
|
||||
custom_query[query_key] = value
|
||||
return custom_query
|
||||
|
||||
|
||||
def response(resp: SXNG_Response) -> EngineResults:
|
||||
res = EngineResults()
|
||||
|
||||
resp_json = loads(resp.text)
|
||||
if 'error' in resp_json:
|
||||
raise SearxEngineAPIException(resp_json["error"])
|
||||
|
||||
for result in resp_json["hits"]["hits"]:
|
||||
kvmap = {key: str(value) if not key.startswith("_") else value for key, value in result["_source"].items()}
|
||||
if show_metadata:
|
||||
kvmap["metadata"] = {"index": result["_index"], "id": result["_id"], "score": result["_score"]}
|
||||
res.add(res.types.KeyValue(kvmap=kvmap))
|
||||
|
||||
return res
|
||||
|
||||
|
||||
_available_query_types = {
|
||||
# Full text queries
|
||||
# https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html
|
||||
'match': _match_query,
|
||||
'simple_query_string': _simple_query_string_query,
|
||||
# Term-level queries
|
||||
# https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html
|
||||
'term': _term_query,
|
||||
'terms': _terms_query,
|
||||
# Query JSON defined by the instance administrator.
|
||||
'custom': _custom_query,
|
||||
}
|
||||
53
searx/engines/emojipedia.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Emojipedia
|
||||
|
||||
Emojipedia is an emoji reference website which documents the meaning and
|
||||
common usage of emoji characters in the Unicode Standard. It has been owned by Zedge
|
||||
since 2021. Emojipedia is a voting member of The Unicode Consortium.[1]
|
||||
|
||||
[1] https://en.wikipedia.org/wiki/Emojipedia
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import (
|
||||
eval_xpath_list,
|
||||
extract_text,
|
||||
)
|
||||
|
||||
about = {
|
||||
"website": 'https://emojipedia.org',
|
||||
"wikidata_id": 'Q22908129',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = []
|
||||
|
||||
base_url = 'https://emojipedia.org'
|
||||
search_url = base_url + '/search?{query}'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = search_url.format(
|
||||
query=urlencode({'q': query}),
|
||||
)
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, '//div[starts-with(@class, "EmojisList")]/a'):
|
||||
|
||||
url = base_url + result.attrib.get('href')
|
||||
res = {'url': url, 'title': extract_text(result), 'content': ''}
|
||||
|
||||
results.append(res)
|
||||
|
||||
return results
|
||||
54
searx/engines/fdroid.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
F-Droid (a repository of FOSS applications for Android)
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import extract_text
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://f-droid.org/',
|
||||
"wikidata_id": 'Q1386210',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['files', 'apps']
|
||||
paging = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://search.f-droid.org/'
|
||||
search_url = base_url + '?{query}'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
|
||||
params['url'] = search_url.format(query=query)
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for app in dom.xpath('//a[@class="package-header"]'):
|
||||
app_url = app.xpath('./@href')[0]
|
||||
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
|
||||
app_content = (
|
||||
extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
|
||||
+ ' - '
|
||||
+ extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
|
||||
)
|
||||
thumbnail = app.xpath('./img[@class="package-icon"]/@src')[0]
|
||||
|
||||
results.append({'url': app_url, 'title': app_title, 'content': app_content, 'thumbnail': thumbnail})
|
||||
|
||||
return results
|
||||
54
searx/engines/findthatmeme.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""FindThatMeme (Images)"""
|
||||
|
||||
from json import dumps
|
||||
from datetime import datetime
|
||||
from searx.utils import humanize_bytes
|
||||
|
||||
about = {
|
||||
"website": 'https://findthatmeme.com',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
base_url = "https://findthatmeme.com/api/v1/search"
|
||||
categories = ['images']
|
||||
paging = True
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
start_index = (params["pageno"] - 1) * 50
|
||||
data = {"search": query, "offset": start_index}
|
||||
params["url"] = base_url
|
||||
params["method"] = 'POST'
|
||||
params['headers']['content-type'] = "application/json"
|
||||
params['data'] = dumps(data)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
search_res = resp.json()
|
||||
results = []
|
||||
|
||||
for item in search_res:
|
||||
img = 'https://s3.thehackerblog.com/findthatmeme/' + item['image_path']
|
||||
thumb = 'https://s3.thehackerblog.com/findthatmeme/thumb/' + item.get('thumbnail', '')
|
||||
date = datetime.strptime(item["updated_at"].split("T")[0], "%Y-%m-%d")
|
||||
formatted_date = datetime.fromtimestamp(date.timestamp())
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': item['source_page_url'],
|
||||
'title': item['source_site'],
|
||||
'img_src': img if item['type'] == 'IMAGE' else thumb,
|
||||
'filesize': humanize_bytes(item['meme_file_size']),
|
||||
'publishedDate': formatted_date,
|
||||
'template': 'images.html',
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
95
searx/engines/flickr.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Flickr (Images)
|
||||
|
||||
More info on api-key : https://www.flickr.com/services/apps/create/
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.flickr.com',
|
||||
"wikidata_id": 'Q103204',
|
||||
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
|
||||
"use_official_api": True,
|
||||
"require_api_key": True,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
categories = ['images']
|
||||
|
||||
nb_per_page = 15
|
||||
paging = True
|
||||
api_key = None
|
||||
|
||||
|
||||
url = (
|
||||
'https://api.flickr.com/services/rest/?method=flickr.photos.search'
|
||||
+ '&api_key={api_key}&{text}&sort=relevance'
|
||||
+ '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z'
|
||||
+ '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
|
||||
)
|
||||
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
|
||||
|
||||
paging = True
|
||||
|
||||
|
||||
def build_flickr_url(user_id, photo_id):
|
||||
return photo_url.format(userid=user_id, photoid=photo_id)
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = url.format(
|
||||
text=urlencode({'text': query}), api_key=api_key, nb_per_page=nb_per_page, page=params['pageno']
|
||||
)
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_results = loads(resp.text)
|
||||
|
||||
# return empty array if there are no results
|
||||
if 'photos' not in search_results:
|
||||
return []
|
||||
|
||||
if 'photo' not in search_results['photos']:
|
||||
return []
|
||||
|
||||
photos = search_results['photos']['photo']
|
||||
|
||||
# parse results
|
||||
for photo in photos:
|
||||
if 'url_o' in photo:
|
||||
img_src = photo['url_o']
|
||||
elif 'url_z' in photo:
|
||||
img_src = photo['url_z']
|
||||
else:
|
||||
continue
|
||||
|
||||
# Prefer the small preview (url_n) as thumbnail and fall back to url_z, then to the full image
|
||||
if 'url_n' in photo:
|
||||
thumbnail_src = photo['url_n']
|
||||
elif 'url_z' in photo:
|
||||
thumbnail_src = photo['url_z']
|
||||
else:
|
||||
thumbnail_src = img_src
|
||||
|
||||
# append result
|
||||
results.append(
|
||||
{
|
||||
'url': build_flickr_url(photo['owner'], photo['id']),
|
||||
'title': photo['title'],
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'content': photo['description']['_content'],
|
||||
'author': photo['ownername'],
|
||||
'template': 'images.html',
|
||||
}
|
||||
)
|
||||
|
||||
# return results
|
||||
return results
|
||||
142
searx/engines/flickr_noapi.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Flickr (Images)
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import json
|
||||
from time import time
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import ecma_unescape, html_to_text
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.flickr.com',
|
||||
"wikidata_id": 'Q103204',
|
||||
"official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
safesearch = False
|
||||
|
||||
time_range_dict = {
|
||||
'day': 60 * 60 * 24,
|
||||
'week': 60 * 60 * 24 * 7,
|
||||
'month': 60 * 60 * 24 * 7 * 4,
|
||||
'year': 60 * 60 * 24 * 7 * 52,
|
||||
}
|
||||
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')
|
||||
|
||||
search_url = 'https://www.flickr.com/search?{query}&page={page}'
|
||||
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
|
||||
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
|
||||
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
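# The search page embeds its data as a JavaScript assignment, roughly
#   modelExport: { ... },
# modelexport_re captures the braced object so response() can feed it to json.loads().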
|
||||
|
||||
|
||||
def build_flickr_url(user_id, photo_id):
|
||||
return photo_url.format(userid=user_id, photoid=photo_id)
|
||||
|
||||
|
||||
def _get_time_range_url(time_range):
|
||||
if time_range in time_range_dict:
|
||||
return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
|
||||
return ''
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = search_url.format(query=urlencode({'text': query}), page=params['pageno']) + _get_time_range_url(
|
||||
params['time_range']
|
||||
)
|
||||
return params
|
||||
|
||||
|
||||
def response(resp): # pylint: disable=too-many-branches
|
||||
results = []
|
||||
|
||||
matches = modelexport_re.search(resp.text)
|
||||
if matches is None:
|
||||
return results
|
||||
|
||||
match = matches.group(1)
|
||||
model_export = json.loads(match)
|
||||
|
||||
if 'legend' not in model_export:
|
||||
return results
|
||||
legend = model_export['legend']
|
||||
|
||||
# handle empty page
|
||||
if not legend or not legend[0]:
|
||||
return results
|
||||
|
||||
for x, index in enumerate(legend):
|
||||
if len(index) != 8:
|
||||
logger.debug("skip legend enty %s : %s", x, index)
|
||||
continue
|
||||
|
||||
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][
|
||||
index[7]
|
||||
]
|
||||
author = ecma_unescape(photo.get('realname', ''))
|
||||
source = ecma_unescape(photo.get('username', ''))
|
||||
if source:
|
||||
source += ' @ Flickr'
|
||||
title = ecma_unescape(photo.get('title', ''))
|
||||
content = html_to_text(ecma_unescape(photo.get('description', '')))
|
||||
img_src = None
|
||||
|
||||
# Try the image sizes from the biggest to the smallest format
|
||||
size_data = None
|
||||
for image_size in image_sizes:
|
||||
if image_size in photo['sizes']['data']:
|
||||
size_data = photo['sizes']['data'][image_size]['data']
|
||||
break
|
||||
|
||||
if not size_data:
|
||||
logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data'])))
|
||||
continue
|
||||
|
||||
img_src = size_data['url']
|
||||
resolution = f"{size_data['width']} x {size_data['height']}"
|
||||
|
||||
# Prefer the 'n' size as thumbnail and fall back to 'z', then to the full image
|
||||
if 'n' in photo['sizes']['data']:
|
||||
thumbnail_src = photo['sizes']['data']['n']['data']['url']
|
||||
elif 'z' in photo['sizes']['data']:
|
||||
thumbnail_src = photo['sizes']['data']['z']['data']['url']
|
||||
else:
|
||||
thumbnail_src = img_src
|
||||
|
||||
if 'ownerNsid' not in photo:
|
||||
# should not happen, disowned photo? Show it anyway
|
||||
url = img_src
|
||||
else:
|
||||
url = build_flickr_url(photo['ownerNsid'], photo['id'])
|
||||
|
||||
result = {
|
||||
'url': url,
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'source': source,
|
||||
'resolution': resolution,
|
||||
'template': 'images.html',
|
||||
}
|
||||
result['author'] = author.encode(errors='ignore').decode()
|
||||
result['source'] = source.encode(errors='ignore').decode()
|
||||
result['title'] = title.encode(errors='ignore').decode()
|
||||
result['content'] = content.encode(errors='ignore').decode()
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
65
searx/engines/freesound.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Freesound (Sound)
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
disabled = True
|
||||
api_key = ""
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": "https://freesound.org",
|
||||
"wikidata_id": "Q835703",
|
||||
"official_api_documentation": "https://freesound.org/docs/api",
|
||||
"use_official_api": True,
|
||||
"require_api_key": True,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
paging = True
|
||||
|
||||
# search url
|
||||
url = "https://freesound.org/apiv2/"
|
||||
search_url = (
|
||||
url + "search/text/?query={query}&page={page}&fields=name,url,download,created,description,type&token={api_key}"
|
||||
)
|
||||
|
||||
|
||||
# search request
|
||||
def request(query, params):
|
||||
params["url"] = search_url.format(
|
||||
query=urlencode({"q": query}),
|
||||
page=params["pageno"],
|
||||
api_key=api_key,
|
||||
)
|
||||
return params
|
||||
|
||||
|
||||
# get response from search request
|
||||
def response(resp):
|
||||
results = []
|
||||
search_res = loads(resp.text)
|
||||
# parse results
|
||||
for result in search_res.get("results", []):
|
||||
title = result["name"]
|
||||
content = result["description"][:128]
|
||||
publishedDate = datetime.fromisoformat(result["created"])
|
||||
uri = result["download"]
|
||||
|
||||
# append result
|
||||
results.append(
|
||||
{
|
||||
"url": result["url"],
|
||||
"title": title,
|
||||
"publishedDate": publishedDate,
|
||||
"audio_src": uri,
|
||||
"content": content,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
51
searx/engines/frinkiac.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Frinkiac (Images)
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://frinkiac.com',
|
||||
"wikidata_id": 'Q24882614',
|
||||
"official_api_documentation": {'url': None, 'comment': 'see https://github.com/MitchellAW/CompuGlobal'},
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
categories = ['images']
|
||||
|
||||
BASE = 'https://frinkiac.com/'
|
||||
SEARCH_URL = '{base}api/search?{query}'
|
||||
RESULT_URL = '{base}?{query}'
|
||||
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
|
||||
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
response_data = loads(resp.text)
|
||||
for result in response_data:
|
||||
episode = result['Episode']
|
||||
timestamp = result['Timestamp']
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': RESULT_URL.format(base=BASE, query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
|
||||
'title': episode,
|
||||
'content': '',
|
||||
'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
|
||||
'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
49
searx/engines/fyyd.py
Normal file
@@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fyyd (podcasts)
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode
|
||||
|
||||
about = {
|
||||
'website': 'https://fyyd.de',
|
||||
'official_api_documentation': 'https://github.com/eazyliving/fyyd-api',
|
||||
'use_official_api': True,
|
||||
'require_api_key': False,
|
||||
'results': 'JSON',
|
||||
}
|
||||
categories = []
|
||||
paging = True
|
||||
|
||||
base_url = "https://api.fyyd.de"
|
||||
page_size = 10
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
'term': query,
|
||||
'count': page_size,
|
||||
'page': params['pageno'] - 1,
|
||||
}
|
||||
params['url'] = f"{base_url}/0.2/search/podcast?{urlencode(args)}"
|
||||
return params
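# e.g. page 1 of a search for "python" requests
#   https://api.fyyd.de/0.2/search/podcast?term=python&count=10&page=0
# (fyyd counts pages from zero, hence ``params['pageno'] - 1``)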
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
json_results = resp.json()['data']
|
||||
|
||||
for result in json_results:
|
||||
results.append(
|
||||
{
|
||||
'url': result['htmlURL'],
|
||||
'title': result['title'],
|
||||
'content': result['description'],
|
||||
'thumbnail': result['smallImageURL'],
|
||||
'publishedDate': datetime.strptime(result['status_since'], '%Y-%m-%d %H:%M:%S'),
|
||||
'metadata': f"Rank: {result['rank']} || {result['episode_count']} episodes",
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
97
searx/engines/geizhals.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Geizhals is a German website to compare the price of a product on the
|
||||
most common German shopping sites and find the lowest price.
|
||||
|
||||
The sorting of the search results can be influenced by the following additions
|
||||
to the search term:
|
||||
|
||||
``asc`` or ``price``
|
||||
To sort by price in ascending order.
|
||||
|
||||
``desc``
|
||||
To sort by price in descending order.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import eval_xpath, eval_xpath_list, extract_text
|
||||
|
||||
about = {
|
||||
'website': 'https://geizhals.de',
|
||||
'wikidata_id': 'Q15977657',
|
||||
'use_official_api': False,
|
||||
'official_api_documentation': None,
|
||||
'require_api_key': False,
|
||||
'results': 'HTML',
|
||||
'language': 'de',
|
||||
}
|
||||
paging = True
|
||||
categories = ['shopping']
|
||||
|
||||
base_url = "https://geizhals.de"
|
||||
sort_order = 'relevance'
|
||||
|
||||
SORT_RE = re.compile(r"sort:(\w+)")
|
||||
sort_order_map = {
|
||||
'relevance': None,
|
||||
'price': 'p',
|
||||
'asc': 'p',
|
||||
'desc': '-p',
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
sort = None
|
||||
|
||||
sort_order_path = SORT_RE.search(query)
|
||||
if sort_order_path:
|
||||
sort = sort_order_map.get(sort_order_path.group(1))
|
||||
query = SORT_RE.sub("", query)
|
||||
logger.debug(query)
|
||||
|
||||
args = {
|
||||
'fs': query,
|
||||
'pg': params['pageno'],
|
||||
'toggle_all': 1, # load item specs
|
||||
'sort': sort,
|
||||
}
|
||||
params['url'] = f"{base_url}/?{urlencode(args)}"
|
||||
|
||||
return params
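# Illustration (made-up search term): "ryzen 7600 sort:asc" is split into the
# search term "ryzen 7600 " and the sort key 'p' (price ascending), so the
# request URL becomes something like
#   https://geizhals.de/?fs=ryzen+7600+&pg=1&toggle_all=1&sort=p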
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
for result in eval_xpath_list(dom, "//article[contains(@class, 'listview__item')]"):
|
||||
content = []
|
||||
for spec in eval_xpath_list(result, ".//div[contains(@class, 'specs-grid__item')]"):
|
||||
content.append(f"{extract_text(eval_xpath(spec, './dt'))}: {extract_text(eval_xpath(spec, './dd'))}")
|
||||
|
||||
metadata = [
|
||||
extract_text(eval_xpath(result, ".//div[contains(@class, 'stars-rating-label')]")),
|
||||
extract_text(eval_xpath(result, ".//div[contains(@class, 'listview__offercount')]")),
|
||||
]
|
||||
|
||||
item = {
|
||||
'template': 'products.html',
|
||||
'url': (
|
||||
base_url + "/" + extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__name-link')]/@href"))
|
||||
),
|
||||
'title': extract_text(eval_xpath(result, ".//h3[contains(@class, 'listview__name')]")),
|
||||
'content': ' | '.join(content),
|
||||
'thumbnail': extract_text(eval_xpath(result, ".//img[contains(@class, 'listview__image')]/@src")),
|
||||
'metadata': ', '.join(item for item in metadata if item),
|
||||
}
|
||||
|
||||
best_price = extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__price-link')]")).split(" ")
|
||||
if len(best_price) > 1:
|
||||
item["price"] = f"Bestes Angebot: {best_price[1]}€"
|
||||
results.append(item)
|
||||
|
||||
return results
|
||||
102
searx/engines/genius.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=invalid-name
|
||||
"""Genius
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://genius.com/',
|
||||
"wikidata_id": 'Q3419343',
|
||||
"official_api_documentation": 'https://docs.genius.com/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['music', 'lyrics']
|
||||
paging = True
|
||||
page_size = 5
|
||||
|
||||
url = 'https://genius.com/api/'
|
||||
search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'
|
||||
music_player = 'https://genius.com{api_path}/apple_music_player'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = search_url.format(
|
||||
query=urlencode({'q': query}),
|
||||
index='multi',
|
||||
page_size=page_size,
|
||||
pageno=params['pageno'],
|
||||
)
|
||||
return params
|
||||
|
||||
|
||||
def parse_lyric(hit):
|
||||
content = ''
|
||||
highlights = hit['highlights']
|
||||
if highlights:
|
||||
content = hit['highlights'][0]['value']
|
||||
else:
|
||||
content = hit['result'].get('title_with_featured', '')
|
||||
|
||||
timestamp = hit['result']['lyrics_updated_at']
|
||||
result = {
|
||||
'url': hit['result']['url'],
|
||||
'title': hit['result']['full_title'],
|
||||
'content': content,
|
||||
'thumbnail': hit['result']['song_art_image_thumbnail_url'],
|
||||
}
|
||||
if timestamp:
|
||||
result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
|
||||
api_path = hit['result'].get('api_path')
|
||||
if api_path:
|
||||
# The players only play a 30 sec preview of the title. Some of the players
# will be blocked because of a cross-origin request and some players will
# link to Apple when you press the play button.
|
||||
result['iframe_src'] = music_player.format(api_path=api_path)
|
||||
return result
|
||||
|
||||
|
||||
def parse_artist(hit):
|
||||
result = {
|
||||
'url': hit['result']['url'],
|
||||
'title': hit['result']['name'],
|
||||
'content': '',
|
||||
'thumbnail': hit['result']['image_url'],
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
def parse_album(hit):
|
||||
res = hit['result']
|
||||
content = res.get('name_with_artist', res.get('name', ''))
|
||||
x = res.get('release_date_components')
|
||||
if x:
|
||||
x = x.get('year')
|
||||
if x:
|
||||
content = "%s / %s" % (x, content)
|
||||
return {
|
||||
'url': res['url'],
|
||||
'title': res['full_title'],
|
||||
'thumbnail': res['cover_art_url'],
|
||||
'content': content.strip(),
|
||||
}
|
||||
|
||||
|
||||
parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
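# response() below dispatches on hit['type']: 'lyric' and 'song' hits are rendered by
# parse_lyric(), 'artist' hits by parse_artist() and 'album' hits by parse_album();
# unknown types are silently skipped.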
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
for section in resp.json()['response']['sections']:
|
||||
for hit in section['hits']:
|
||||
func = parse.get(hit['type'])
|
||||
if func:
|
||||
results.append(func(hit))
|
||||
return results
|
||||
116
searx/engines/gitea.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Engine to search in collaborative software platforms based on Gitea_ or Forgejo_.
|
||||
|
||||
.. _Gitea: https://about.gitea.com/
|
||||
.. _Forgejo: https://forgejo.org/
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
The engine has the following mandatory setting:
|
||||
|
||||
- :py:obj:`base_url`
|
||||
|
||||
Optional settings are:
|
||||
|
||||
- :py:obj:`sort`
|
||||
- :py:obj:`order`
|
||||
- :py:obj:`page_size`
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: gitea.com
|
||||
engine: gitea
|
||||
base_url: https://gitea.com
|
||||
shortcut: gitea
|
||||
|
||||
- name: forgejo.com
|
||||
engine: gitea
|
||||
base_url: https://code.forgejo.org
|
||||
shortcut: forgejo
|
||||
|
||||
If you would like to use additional instances, just configure new engines in the
|
||||
:ref:`settings <settings engines>` and set the ``base_url``.
|
||||
|
||||
|
||||
Implementation
|
||||
==============
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
about = {
|
||||
"website": 'https://about.gitea.com',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": 'https://docs.gitea.com/next/development/api-usage',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
categories = ['it', 'repos']
|
||||
paging = True
|
||||
|
||||
base_url: str = ''
|
||||
"""URL of the Gitea_ instance."""
|
||||
|
||||
sort: str = "updated"
|
||||
"""Sort criteria, possible values:
|
||||
|
||||
- ``updated`` (default)
|
||||
- ``alpha``
|
||||
- ``created``
|
||||
- ``size``
|
||||
- ``id``
|
||||
"""
|
||||
|
||||
order = "desc"
|
||||
"""Sort order, possible values:
|
||||
|
||||
- ``desc`` (default)
|
||||
- ``asc``
|
||||
"""
|
||||
|
||||
page_size: int = 10
|
||||
"""Maximum number of results per page (default 10)."""
|
||||
|
||||
|
||||
def init(_):
|
||||
if not base_url:
|
||||
raise ValueError('gitea engine: base_url is unset')
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {'q': query, 'limit': page_size, 'sort': sort, 'order': order, 'page': params['pageno']}
|
||||
params['url'] = f"{base_url}/api/v1/repos/search?{urlencode(args)}"
|
||||
|
||||
return params
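# Illustration only: with base_url = "https://gitea.com", the default sort, order
# and page_size values and the query "searxng" on page 1, the URL assembled above
# is (parameter order as produced by urlencode):
#
#   https://gitea.com/api/v1/repos/search?q=searxng&limit=10&sort=updated&order=desc&page=1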
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
for item in resp.json().get('data', []):
|
||||
content = [item.get(i) for i in ['language', 'description'] if item.get(i)]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'packages.html',
|
||||
'url': item.get('html_url'),
|
||||
'title': item.get('full_name'),
|
||||
'content': ' / '.join(content),
|
||||
# Use Repository Avatar and fall back to Owner Avatar if not set.
|
||||
'thumbnail': item.get('avatar_url') or item.get('owner', {}).get('avatar_url'),
|
||||
'package_name': item.get('name'),
|
||||
'maintainer': item.get('owner', {}).get('username'),
|
||||
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
|
||||
'tags': item.get('topics', []),
|
||||
'popularity': item.get('stars_count'),
|
||||
'homepage': item.get('website'),
|
||||
'source_code_url': item.get('clone_url'),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
67
searx/engines/github.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Github (IT)
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://github.com/',
|
||||
"wikidata_id": 'Q364',
|
||||
"official_api_documentation": 'https://developer.github.com/v3/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['it', 'repos']
|
||||
|
||||
# search-url
|
||||
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'
|
||||
accept_header = 'application/vnd.github.preview.text-match+json'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
params['url'] = search_url.format(query=urlencode({'q': query}))
|
||||
params['headers']['Accept'] = accept_header
|
||||
|
||||
return params
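# Illustration only: for the query "searxng" the request assembled above is
#
#   GET https://api.github.com/search/repositories?sort=stars&order=desc&q=searxng
#   Accept: application/vnd.github.preview.text-match+json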
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
for item in resp.json().get('items', []):
|
||||
content = [item.get(i) for i in ['language', 'description'] if item.get(i)]
|
||||
|
||||
# license can be None
|
||||
lic = item.get('license') or {}
|
||||
lic_url = None
|
||||
if lic.get('spdx_id'):
|
||||
lic_url = f"https://spdx.org/licenses/{lic.get('spdx_id')}.html"
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'packages.html',
|
||||
'url': item.get('html_url'),
|
||||
'title': item.get('full_name'),
|
||||
'content': ' / '.join(content),
|
||||
'thumbnail': item.get('owner', {}).get('avatar_url'),
|
||||
'package_name': item.get('name'),
|
||||
# 'version': item.get('updated_at'),
|
||||
'maintainer': item.get('owner', {}).get('login'),
|
||||
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
|
||||
'tags': item.get('topics', []),
|
||||
'popularity': item.get('stargazers_count'),
|
||||
'license_name': lic.get('name'),
|
||||
'license_url': lic_url,
|
||||
'homepage': item.get('homepage'),
|
||||
'source_code_url': item.get('clone_url'),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
95
searx/engines/gitlab.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Engine to search in collaborative software platforms based on GitLab_ with
|
||||
the `GitLab REST API`_.
|
||||
|
||||
.. _GitLab: https://about.gitlab.com/install/
|
||||
.. _GitLab REST API: https://docs.gitlab.com/ee/api/
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
The engine has the following mandatory setting:
|
||||
|
||||
- :py:obj:`base_url`
|
||||
|
||||
Optional settings are:
|
||||
|
||||
- :py:obj:`api_path`
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: gitlab
|
||||
engine: gitlab
|
||||
base_url: https://gitlab.com
|
||||
shortcut: gl
|
||||
about:
|
||||
website: https://gitlab.com/
|
||||
wikidata_id: Q16639197
|
||||
|
||||
- name: gnome
|
||||
engine: gitlab
|
||||
base_url: https://gitlab.gnome.org
|
||||
shortcut: gn
|
||||
about:
|
||||
website: https://gitlab.gnome.org
|
||||
wikidata_id: Q44316
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
about = {
|
||||
"website": None,
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": "https://docs.gitlab.com/ee/api/",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
categories = ['it', 'repos']
|
||||
paging = True
|
||||
|
||||
base_url: str = ""
|
||||
"""Base URL of the GitLab host."""
|
||||
|
||||
api_path: str = 'api/v4/projects'
|
||||
"""The path the `project API <https://docs.gitlab.com/ee/api/projects.html>`_.
|
||||
|
||||
The default path usually works fine.
|
||||
"""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {'search': query, 'page': params['pageno']}
|
||||
params['url'] = f"{base_url}/{api_path}?{urlencode(args)}"
|
||||
|
||||
return params
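# Illustration only: with base_url = "https://gitlab.com", the default api_path
# and the query "inkscape" on page 1, the URL assembled above is
#
#   https://gitlab.com/api/v4/projects?search=inkscape&page=1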
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
for item in resp.json():
|
||||
results.append(
|
||||
{
|
||||
'template': 'packages.html',
|
||||
'url': item.get('web_url'),
|
||||
'title': item.get('name'),
|
||||
'content': item.get('description'),
|
||||
'thumbnail': item.get('avatar_url'),
|
||||
'package_name': item.get('name'),
|
||||
'maintainer': item.get('namespace', {}).get('name'),
|
||||
'publishedDate': parser.parse(item.get('last_activity_at') or item.get("created_at")),
|
||||
'tags': item.get('tag_list', []),
|
||||
'popularity': item.get('star_count'),
|
||||
'homepage': item.get('readme_url'),
|
||||
'source_code_url': item.get('http_url_to_repo'),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
57
searx/engines/goodreads.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Goodreads (books)
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
|
||||
about = {
|
||||
'website': 'https://www.goodreads.com',
|
||||
'wikidata_id': 'Q2359213',
|
||||
'official_api_documentation': None,
|
||||
'use_official_api': False,
|
||||
'require_api_key': False,
|
||||
'results': 'HTML',
|
||||
}
|
||||
categories = []
|
||||
paging = True
|
||||
|
||||
base_url = "https://www.goodreads.com"
|
||||
|
||||
results_xpath = "//table//tr"
|
||||
thumbnail_xpath = ".//img[contains(@class, 'bookCover')]/@src"
|
||||
url_xpath = ".//a[contains(@class, 'bookTitle')]/@href"
|
||||
title_xpath = ".//a[contains(@class, 'bookTitle')]"
|
||||
author_xpath = ".//a[contains(@class, 'authorName')]"
|
||||
info_text_xpath = ".//span[contains(@class, 'uitext')]"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
'q': query,
|
||||
'page': params['pageno'],
|
||||
}
|
||||
|
||||
params['url'] = f"{base_url}/search?{urlencode(args)}"
|
||||
return params
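# Illustration only: for the query "dune" on page 1 the URL assembled above is
#
#   https://www.goodreads.com/search?q=dune&page=1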
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, results_xpath):
|
||||
results.append(
|
||||
{
|
||||
'url': base_url + extract_text(eval_xpath(result, url_xpath)),
|
||||
'title': extract_text(eval_xpath(result, title_xpath)),
|
||||
'thumbnail': extract_text(eval_xpath(result, thumbnail_xpath)),
|
||||
'content': extract_text(eval_xpath(result, info_text_xpath)),
|
||||
'metadata': extract_text(eval_xpath(result, author_xpath)),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
534
searx/engines/google.py
Normal file
@@ -0,0 +1,534 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Google WEB engine. Some of this
|
||||
implementations (mainly the :py:obj:`get_google_info`) are shared by other
|
||||
engines:
|
||||
|
||||
- :ref:`google images engine`
|
||||
- :ref:`google news engine`
|
||||
- :ref:`google videos engine`
|
||||
- :ref:`google scholar engine`
|
||||
- :ref:`google autocomplete`
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import re
|
||||
import random
|
||||
import string
|
||||
import time
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
import babel
|
||||
import babel.core
|
||||
import babel.languages
|
||||
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.locales import language_tag, region_tag, get_official_locales
|
||||
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.google.com',
|
||||
"wikidata_id": 'Q9366',
|
||||
"official_api_documentation": 'https://developers.google.com/custom-search/',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
max_page = 50
|
||||
"""`Google max 50 pages`_
|
||||
|
||||
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
|
||||
"""
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
||||
|
||||
# Filter results. 0: None, 1: Moderate, 2: Strict
|
||||
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
|
||||
|
||||
# specific xpath variables
|
||||
# ------------------------
|
||||
|
||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||
# from the links, not the links themselves.
|
||||
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
||||
|
||||
|
||||
_arcid_range = string.ascii_letters + string.digits + "_-"
|
||||
_arcid_random: tuple[str, int] | None = None
|
||||
|
||||
|
||||
def ui_async(start: int) -> str:
|
||||
"""Format of the response from UI's async request.
|
||||
|
||||
- ``arc_id:<...>,use_ac:true,_fmt:prog``
|
||||
|
||||
The arc_id is randomly generated every hour.
|
||||
"""
|
||||
global _arcid_random # pylint: disable=global-statement
|
||||
|
||||
use_ac = "use_ac:true"
|
||||
# _fmt:html returns an HTTP 500 when users search for celebrities like
|
||||
# '!google natasha allegri' or '!google chris evans'
|
||||
_fmt = "_fmt:prog"
|
||||
|
||||
# create a new random arc_id every hour
|
||||
if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
|
||||
_arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
|
||||
arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"
|
||||
|
||||
return ",".join([arc_id, use_ac, _fmt])
|
||||
|
||||
|
||||
def get_google_info(params, eng_traits):
|
||||
"""Composing various (language) properties for the google engines (:ref:`google
|
||||
API`).
|
||||
|
||||
This function is called by the various google engines (:ref:`google web
|
||||
engine`, :ref:`google images engine`, :ref:`google news engine` and
|
||||
:ref:`google videos engine`).
|
||||
|
||||
:param dict param: Request parameters of the engine. At least
|
||||
a ``searxng_locale`` key should be in the dictionary.
|
||||
|
||||
:param eng_traits: Engine's traits fetched from google preferences
|
||||
(:py:obj:`searx.enginelib.traits.EngineTraits`)
|
||||
|
||||
:rtype: dict
|
||||
:returns:
|
||||
Py-Dictionary with the key/value pairs:
|
||||
|
||||
language:
|
||||
The language code that is used by google (e.g. ``lang_en`` or
|
||||
``lang_zh-TW``)
|
||||
|
||||
country:
|
||||
The country code that is used by google (e.g. ``US`` or ``TW``)
|
||||
|
||||
locale:
|
||||
An instance of :py:obj:`babel.core.Locale` built from the
|
||||
``searxng_locale`` value.
|
||||
|
||||
subdomain:
|
||||
Google subdomain :py:obj:`google_domains` that fits to the country
|
||||
code.
|
||||
|
||||
params:
|
||||
Py-Dictionary with additional request arguments (can be passed to
|
||||
:py:func:`urllib.parse.urlencode`).
|
||||
|
||||
- ``hl`` parameter: specifies the interface language of the user interface.
|
||||
- ``lr`` parameter: restricts search results to documents written in
|
||||
a particular language.
|
||||
- ``cr`` parameter: restricts search results to documents
|
||||
originating in a particular country.
|
||||
- ``ie`` parameter: sets the character encoding scheme that should
|
||||
be used to interpret the query string ('utf8').
|
||||
- ``oe`` parameter: sets the character encoding scheme that should
|
||||
be used to decode the XML result ('utf8').
|
||||
|
||||
headers:
|
||||
Py-Dictionary with additional HTTP headers (can be passed to
|
||||
request's headers)
|
||||
|
||||
- ``Accept: '*/*``
|
||||
|
||||
"""
|
||||
|
||||
ret_val = {
|
||||
'language': None,
|
||||
'country': None,
|
||||
'subdomain': None,
|
||||
'params': {},
|
||||
'headers': {},
|
||||
'cookies': {},
|
||||
'locale': None,
|
||||
}
|
||||
|
||||
sxng_locale = params.get('searxng_locale', 'all')
|
||||
try:
|
||||
locale = babel.Locale.parse(sxng_locale, sep='-')
|
||||
except babel.core.UnknownLocaleError:
|
||||
locale = None
|
||||
|
||||
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
|
||||
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
|
||||
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
|
||||
|
||||
# Test zh_hans & zh_hant --> in the topmost links in the result list of list
|
||||
# TW and HK you should find a wiktionary.org zh_hant link. In the result
|
||||
# list of zh-CN there should be no hant link; instead you should find
|
||||
# zh.m.wikipedia.org/zh somewhere in the top.
|
||||
|
||||
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
|
||||
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
|
||||
|
||||
ret_val['language'] = eng_lang
|
||||
ret_val['country'] = country
|
||||
ret_val['locale'] = locale
|
||||
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
|
||||
|
||||
# hl parameter:
|
||||
# The hl parameter specifies the interface language (host language) of
|
||||
# your user interface. To improve the performance and the quality of your
|
||||
# search results, you are strongly encouraged to set this parameter
|
||||
# explicitly.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#hlsp
|
||||
# The Interface Language:
|
||||
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
|
||||
|
||||
# https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
|
||||
ret_val['params']['hl'] = f'{lang_code}-{country}'
|
||||
|
||||
# lr parameter:
|
||||
# The lr (language restrict) parameter restricts search results to
|
||||
# documents written in a particular language.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#lrsp
|
||||
# Language Collection Values:
|
||||
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
|
||||
#
|
||||
# To select 'all' languages an empty 'lr' value is used.
|
||||
#
|
||||
# Different from other google services, Google Scholar supports selecting more
|
||||
# than one language. The languages are separated by a pipe '|' (logical OR).
|
||||
# By example: &lr=lang_zh-TW%7Clang_de selects articles written in
|
||||
# traditional chinese OR german language.
|
||||
|
||||
ret_val['params']['lr'] = eng_lang
|
||||
if sxng_locale == 'all':
|
||||
ret_val['params']['lr'] = ''
|
||||
|
||||
# cr parameter:
|
||||
# The cr parameter restricts search results to documents originating in a
|
||||
# particular country.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#crsp
|
||||
|
||||
# specify a region (country) only if a region is given in the selected
|
||||
# locale --> https://github.com/searxng/searxng/issues/2672
|
||||
ret_val['params']['cr'] = ''
|
||||
if len(sxng_locale.split('-')) > 1:
|
||||
ret_val['params']['cr'] = 'country' + country
|
||||
|
||||
# gl parameter: (mandatory by Google News)
|
||||
# The gl parameter value is a two-letter country code. For WebSearch
|
||||
# results, the gl parameter boosts search results whose country of origin
|
||||
# matches the parameter value. See the Country Codes section for a list of
|
||||
# valid values.
|
||||
# Specifying a gl parameter value in WebSearch requests should improve the
|
||||
# relevance of results. This is particularly true for international
|
||||
# customers and, even more specifically, for customers in English-speaking
|
||||
# countries other than the United States.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#glsp
|
||||
|
||||
# https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
|
||||
# ret_val['params']['gl'] = country
|
||||
|
||||
# ie parameter:
|
||||
# The ie parameter sets the character encoding scheme that should be used
|
||||
# to interpret the query string. The default ie value is latin1.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#iesp
|
||||
|
||||
ret_val['params']['ie'] = 'utf8'
|
||||
|
||||
# oe parameter:
|
||||
# The oe parameter sets the character encoding scheme that should be used
|
||||
# to decode the XML result. The default oe value is latin1.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#oesp
|
||||
|
||||
ret_val['params']['oe'] = 'utf8'
|
||||
|
||||
# num parameter:
|
||||
# The num parameter identifies the number of search results to return.
|
||||
# The default num value is 10, and the maximum value is 20. If you request
|
||||
# more than 20 results, only 20 results will be returned.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#numsp
|
||||
|
||||
# HINT: seems to have no effect (tested in google WEB & Images)
|
||||
# ret_val['params']['num'] = 20
|
||||
|
||||
# HTTP headers
|
||||
|
||||
ret_val['headers']['Accept'] = '*/*'
|
||||
|
||||
# Cookies
|
||||
|
||||
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
|
||||
# - https://github.com/searxng/searxng/issues/1555
|
||||
ret_val['cookies']['CONSENT'] = "YES+"
|
||||
|
||||
return ret_val
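# Illustration only (made-up example, the actual values depend on the fetched
# engine traits): for the SearXNG locale 'de-AT' the dictionary returned above
# would look roughly like
#
#   {
#       'language': 'lang_de', 'country': 'AT', 'subdomain': 'www.google.at',
#       'locale': babel.Locale('de', territory='AT'),
#       'params': {'hl': 'de-AT', 'lr': 'lang_de', 'cr': 'countryAT',
#                  'ie': 'utf8', 'oe': 'utf8'},
#       'headers': {'Accept': '*/*'},
#       'cookies': {'CONSENT': 'YES+'},
#   }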
|
||||
|
||||
|
||||
def detect_google_sorry(resp):
|
||||
if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
|
||||
raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google search request"""
|
||||
# pylint: disable=line-too-long
|
||||
start = (params['pageno'] - 1) * 10
|
||||
str_async = ui_async(start)
|
||||
google_info = get_google_info(params, traits)
|
||||
logger.debug("ARC_ID: %s", str_async)
|
||||
|
||||
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
|
||||
query_url = (
|
||||
'https://'
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
**google_info['params'],
|
||||
'filter': '0',
|
||||
'start': start,
|
||||
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
|
||||
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
|
||||
# 'cs' : 1,
|
||||
# 'sa': 'N',
|
||||
# 'yv': 3,
|
||||
# 'prmd': 'vin',
|
||||
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
|
||||
# 'sa': 'N',
|
||||
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
|
||||
# formerly known as use_mobile_ui
|
||||
'asearch': 'arc',
|
||||
'async': str_async,
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
if params['time_range'] in time_range_dict:
|
||||
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
|
||||
if params['safesearch']:
|
||||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
|
||||
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
|
||||
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
|
||||
RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$')
|
||||
|
||||
|
||||
def parse_data_images(text: str):
|
||||
data_image_map = {}
|
||||
|
||||
for img_id, data_image in RE_DATA_IMAGE.findall(text):
|
||||
end_pos = data_image.rfind('=')
|
||||
if end_pos > 0:
|
||||
data_image = data_image[: end_pos + 1]
|
||||
data_image_map[img_id] = data_image
|
||||
last = RE_DATA_IMAGE_end.search(text)
|
||||
if last:
|
||||
data_image_map[last.group(1)] = last.group(2)
|
||||
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
|
||||
return data_image_map
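# Illustration only (made-up id, shortened payload): a response fragment like
#
#   =26;[3,"dimg_abc"]a87;data:image/jpeg;base64,/9j/4AAQ...z//2Q==26;
#
# ends up in the returned map as
#
#   {'dimg_abc': 'data:image/jpeg;base64,/9j/4AAQ...z//2Q=='}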
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
"""Get response from google's search request"""
|
||||
# pylint: disable=too-many-branches, too-many-statements
|
||||
detect_google_sorry(resp)
|
||||
data_image_map = parse_data_images(resp.text)
|
||||
|
||||
results = EngineResults()
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# results --> answer
|
||||
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
||||
for item in answer_list:
|
||||
for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
|
||||
bubble.drop_tree()
|
||||
results.add(
|
||||
results.types.Answer(
|
||||
answer=extract_text(item),
|
||||
url=(eval_xpath(item, '../..//a/@href') + [None])[0],
|
||||
)
|
||||
)
|
||||
|
||||
# parse results
|
||||
|
||||
for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
|
||||
# pylint: disable=too-many-nested-blocks
|
||||
|
||||
try:
|
||||
title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
|
||||
if title_tag is None:
|
||||
# this is not one of the common google results *sections*
|
||||
logger.debug('ignoring item from the result_xpath list: missing title')
|
||||
continue
|
||||
title = extract_text(title_tag)
|
||||
|
||||
url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
|
||||
if url is None:
|
||||
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
|
||||
continue
|
||||
|
||||
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
|
||||
for item in content_nodes:
|
||||
for script in item.xpath(".//script"):
|
||||
script.getparent().remove(script)
|
||||
|
||||
content = extract_text(content_nodes)
|
||||
|
||||
if not content:
|
||||
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
|
||||
continue
|
||||
|
||||
thumbnail = content_nodes[0].xpath('.//img/@src')
|
||||
if thumbnail:
|
||||
thumbnail = thumbnail[0]
|
||||
if thumbnail.startswith('data:image'):
|
||||
img_id = content_nodes[0].xpath('.//img/@id')
|
||||
if img_id:
|
||||
thumbnail = data_image_map.get(img_id[0])
|
||||
else:
|
||||
thumbnail = None
|
||||
|
||||
results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
|
||||
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
logger.error(e, exc_info=True)
|
||||
continue
|
||||
|
||||
# parse suggestion
|
||||
for suggestion in eval_xpath_list(dom, suggestion_xpath):
|
||||
# append suggestion
|
||||
results.append({'suggestion': extract_text(suggestion)})
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
|
||||
|
||||
skip_countries = [
|
||||
# official language of google-country not in google-languages
|
||||
'AL', # Albania (sq)
|
||||
'AZ', # Azerbaijan (az)
|
||||
'BD', # Bangladesh (bn)
|
||||
'BN', # Brunei Darussalam (ms)
|
||||
'BT', # Bhutan (dz)
|
||||
'ET', # Ethiopia (am)
|
||||
'GE', # Georgia (ka, os)
|
||||
'GL', # Greenland (kl)
|
||||
'KH', # Cambodia (km)
|
||||
'LA', # Laos (lo)
|
||||
'LK', # Sri Lanka (si, ta)
|
||||
'ME', # Montenegro (sr)
|
||||
'MK', # North Macedonia (mk, sq)
|
||||
'MM', # Myanmar (my)
|
||||
'MN', # Mongolia (mn)
|
||||
'MV', # Maldives (dv) // dv_MV is unknown by babel
|
||||
'MY', # Malaysia (ms)
|
||||
'NP', # Nepal (ne)
|
||||
'TJ', # Tajikistan (tg)
|
||||
'TM', # Turkmenistan (tk)
|
||||
'UZ', # Uzbekistan (uz)
|
||||
]
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
|
||||
"""Fetch languages from Google."""
|
||||
# pylint: disable=import-outside-toplevel, too-many-branches
|
||||
|
||||
engine_traits.custom['supported_domains'] = {}
|
||||
|
||||
resp = get('https://www.google.com/preferences')
|
||||
if not resp.ok: # type: ignore
|
||||
raise RuntimeError("Response from Google's preferences is not OK.")
|
||||
|
||||
dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
|
||||
|
||||
# supported language codes
|
||||
|
||||
lang_map = {'no': 'nb'}
|
||||
for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
|
||||
eng_lang = x.get("value")
|
||||
try:
|
||||
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
|
||||
except babel.UnknownLocaleError:
|
||||
print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
|
||||
continue
|
||||
sxng_lang = language_tag(locale)
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_lang)
|
||||
if conflict:
|
||||
if conflict != eng_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
|
||||
continue
|
||||
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
|
||||
|
||||
# alias languages
|
||||
engine_traits.languages['zh'] = 'lang_zh-CN'
|
||||
|
||||
# supported region codes
|
||||
|
||||
for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
|
||||
eng_country = x.get("value")
|
||||
|
||||
if eng_country in skip_countries:
|
||||
continue
|
||||
if eng_country == 'ZZ':
|
||||
engine_traits.all_locale = 'ZZ'
|
||||
continue
|
||||
|
||||
sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
|
||||
|
||||
if not sxng_locales:
|
||||
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
|
||||
continue
|
||||
|
||||
for sxng_locale in sxng_locales:
|
||||
engine_traits.regions[region_tag(sxng_locale)] = eng_country
|
||||
|
||||
# alias regions
|
||||
engine_traits.regions['zh-CN'] = 'HK'
|
||||
|
||||
# supported domains
|
||||
|
||||
if add_domains:
|
||||
resp = get('https://www.google.com/supported_domains')
|
||||
if not resp.ok: # type: ignore
|
||||
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
|
||||
|
||||
for domain in resp.text.split(): # type: ignore
|
||||
domain = domain.strip()
|
||||
if not domain or domain in [
|
||||
'.google.com',
|
||||
]:
|
||||
continue
|
||||
region = domain.split('.')[-1].upper()
|
||||
engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
|
||||
if region == 'HK':
|
||||
# There is no google.cn, we use .com.hk for zh-CN
|
||||
engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore
|
||||
132
searx/engines/google_images.py
Normal file
@@ -0,0 +1,132 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Google Images engine using the internal
|
||||
Google API used by the Google Go Android app.
|
||||
|
||||
This internal API offers results in
|
||||
|
||||
- JSON (``_fmt:json``)
|
||||
- Protobuf_ (``_fmt:pb``)
|
||||
- Protobuf_ compressed? (``_fmt:pc``)
|
||||
- HTML (``_fmt:html``)
|
||||
- Protobuf_ encoded in JSON (``_fmt:jspb``).
|
||||
|
||||
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from json import loads
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
detect_google_sorry,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
logger: logging.Logger
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://images.google.com',
|
||||
"wikidata_id": 'Q521550',
|
||||
"official_api_documentation": 'https://developers.google.com/custom-search',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images', 'web']
|
||||
paging = True
|
||||
max_page = 50
|
||||
"""`Google max 50 pages`_
|
||||
|
||||
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
|
||||
"""
|
||||
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-Image search request"""
|
||||
|
||||
google_info = get_google_info(params, traits)
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ '?'
|
||||
+ urlencode({'q': query, 'tbm': "isch", **google_info['params'], 'asearch': 'isch'})
|
||||
# don't urlencode this because it yields wildly different AND bad results
|
||||
# pagination uses Zero-based numbering
|
||||
+ f'&async=_fmt:json,p:1,ijn:{params["pageno"] - 1}'
|
||||
)
|
||||
|
||||
if params['time_range'] in time_range_dict:
|
||||
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
|
||||
if params['safesearch']:
|
||||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
# this ua will allow getting ~50 results instead of 10. #1641
|
||||
params['headers']['User-Agent'] = (
|
||||
'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12;' f' {google_info.get("country", "US")}) gzip'
|
||||
)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from google's search request"""
|
||||
results = []
|
||||
|
||||
detect_google_sorry(resp)
|
||||
|
||||
json_start = resp.text.find('{"ischj":')
|
||||
json_data = loads(resp.text[json_start:])
|
||||
|
||||
for item in json_data["ischj"].get("metadata", []):
|
||||
result_item = {
|
||||
'url': item["result"]["referrer_url"],
|
||||
'title': item["result"]["page_title"],
|
||||
'content': item["text_in_grid"]["snippet"],
|
||||
'source': item["result"]["site_title"],
|
||||
'resolution': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
|
||||
'img_src': item["original_image"]["url"],
|
||||
'thumbnail_src': item["thumbnail"]["url"],
|
||||
'template': 'images.html',
|
||||
}
|
||||
|
||||
author = item["result"].get('iptc', {}).get('creator')
|
||||
if author:
|
||||
result_item['author'] = ', '.join(author)
|
||||
|
||||
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
|
||||
if copyright_notice:
|
||||
result_item['source'] += ' | ' + copyright_notice
|
||||
|
||||
freshness_date = item["result"].get("freshness_date")
|
||||
if freshness_date:
|
||||
result_item['source'] += ' | ' + freshness_date
|
||||
|
||||
file_size = item.get('gsa', {}).get('file_size')
|
||||
if file_size:
|
||||
result_item['source'] += ' (%s)' % file_size
|
||||
|
||||
results.append(result_item)
|
||||
|
||||
return results
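# Illustration only: the fields accessed above come from "ischj" metadata items
# shaped roughly like (values made up)
#
#   {"result": {"referrer_url": "https://...", "page_title": "...",
#               "site_title": "...", "iptc": {"creator": ["Jane Doe"]}},
#    "text_in_grid": {"snippet": "..."},
#    "original_image": {"url": "https://...", "width": 1920, "height": 1080},
#    "thumbnail": {"url": "https://..."}}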
|
||||
304
searx/engines/google_news.py
Normal file
@@ -0,0 +1,304 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Google News engine.
|
||||
|
||||
Google News has a different region handling compared to Google WEB.
|
||||
|
||||
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
|
||||
- the hl_ argument has to be set correctly (and differently from Google WEB)
|
||||
- the gl_ argument is mandatory
|
||||
|
||||
If one of these arguments is not set correctly, the request is redirected to the
|
||||
CONSENT dialog::
|
||||
|
||||
https://consent.google.com/m?continue=
|
||||
|
||||
The google news API ignores some parameters from the common :ref:`google API`:
|
||||
|
||||
- num_ : the number of search results is ignored / there is no paging; all
|
||||
results for a query term are in the first response.
|
||||
- save_ : is ignored / Google-News results are always *SafeSearch*
|
||||
|
||||
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
|
||||
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
|
||||
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
|
||||
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode
|
||||
import base64
|
||||
from lxml import html
|
||||
import babel
|
||||
|
||||
from searx import locales
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_list,
|
||||
eval_xpath_getindex,
|
||||
extract_text,
|
||||
)
|
||||
|
||||
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_google_info,
|
||||
detect_google_sorry,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://news.google.com',
|
||||
"wikidata_id": 'Q12020',
|
||||
"official_api_documentation": 'https://developers.google.com/custom-search',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['news']
|
||||
paging = False
|
||||
time_range_support = False
|
||||
|
||||
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
|
||||
# False here, otherwise checker will report safesearch-errors::
|
||||
#
|
||||
# safesearch : results are identical for safesearch=0 and safesearch=2
|
||||
safesearch = True
|
||||
# send_accept_language_header = True
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-News search request"""
|
||||
|
||||
sxng_locale = params.get('searxng_locale', 'en-US')
|
||||
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
|
||||
google_info = get_google_info(params, traits)
|
||||
google_info['subdomain'] = 'news.google.com' # google news has only one domain
|
||||
|
||||
ceid_region, ceid_lang = ceid.split(':')
|
||||
ceid_lang, ceid_suffix = (
|
||||
ceid_lang.split('-')
|
||||
+ [
|
||||
None,
|
||||
]
|
||||
)[:2]
|
||||
|
||||
google_info['params']['hl'] = ceid_lang
|
||||
|
||||
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
|
||||
|
||||
if ceid_region.lower() == ceid_lang:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
|
||||
else:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
|
||||
|
||||
elif ceid_region.lower() != ceid_lang:
|
||||
|
||||
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
|
||||
google_info['params']['hl'] = ceid_lang
|
||||
else:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
|
||||
|
||||
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
|
||||
google_info['params']['gl'] = ceid_region
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ google_info['subdomain']
|
||||
+ "/search?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
**google_info['params'],
|
||||
}
|
||||
)
|
||||
# ceid includes a ':' character which must not be urlencoded
|
||||
+ ('&ceid=%s' % ceid)
|
||||
)
|
||||
|
||||
params['url'] = query_url
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
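# Illustration only, tracing the branches above for a few ceid values:
#
#   'DE:de'      --> hl='de',     lr='lang_de', gl='DE'
#   'BE:fr'      --> hl='fr',     lr='lang_fr', gl='BE'   (BE is in the exception list)
#   'TW:zh-Hant' --> hl='zh-TW',  lr='lang_zh', gl='TW'
#   'US:es-419'  --> hl='es-419', lr='lang_es', gl='US'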
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from google's search request"""
|
||||
results = []
|
||||
detect_google_sorry(resp)
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
|
||||
|
||||
# The first <a> tag in the <article> contains the link to the article
|
||||
# The href attribute of the <a> tag is a google internal link, we have
|
||||
# to decode
|
||||
|
||||
href = eval_xpath_getindex(result, './article/a/@href', 0)
|
||||
href = href.split('?')[0]
|
||||
href = href.split('/')[-1]
|
||||
href = base64.urlsafe_b64decode(href + '====')
|
||||
href = href[href.index(b'http') :].split(b'\xd2')[0]
|
||||
href = href.decode()
|
||||
|
||||
title = extract_text(eval_xpath(result, './article/h3[1]'))
|
||||
|
||||
# The pub_date is mostly a string like 'yesterday', not a real
|
||||
# timezone date or time. Therefore we can't use publishedDate.
|
||||
pub_date = extract_text(eval_xpath(result, './article//time'))
|
||||
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
|
||||
|
||||
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
|
||||
|
||||
# The image URL is located in a preceding sibling <img> tag, e.g.:
|
||||
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
|
||||
# These URLs are long but not personalized (double-checked via Tor).
|
||||
|
||||
thumbnail = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': href,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
)
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
ceid_list = [
|
||||
'AE:ar',
|
||||
'AR:es-419',
|
||||
'AT:de',
|
||||
'AU:en',
|
||||
'BD:bn',
|
||||
'BE:fr',
|
||||
'BE:nl',
|
||||
'BG:bg',
|
||||
'BR:pt-419',
|
||||
'BW:en',
|
||||
'CA:en',
|
||||
'CA:fr',
|
||||
'CH:de',
|
||||
'CH:fr',
|
||||
'CL:es-419',
|
||||
'CN:zh-Hans',
|
||||
'CO:es-419',
|
||||
'CU:es-419',
|
||||
'CZ:cs',
|
||||
'DE:de',
|
||||
'EG:ar',
|
||||
'ES:es',
|
||||
'ET:en',
|
||||
'FR:fr',
|
||||
'GB:en',
|
||||
'GH:en',
|
||||
'GR:el',
|
||||
'HK:zh-Hant',
|
||||
'HU:hu',
|
||||
'ID:en',
|
||||
'ID:id',
|
||||
'IE:en',
|
||||
'IL:en',
|
||||
'IL:he',
|
||||
'IN:bn',
|
||||
'IN:en',
|
||||
'IN:hi',
|
||||
'IN:ml',
|
||||
'IN:mr',
|
||||
'IN:ta',
|
||||
'IN:te',
|
||||
'IT:it',
|
||||
'JP:ja',
|
||||
'KE:en',
|
||||
'KR:ko',
|
||||
'LB:ar',
|
||||
'LT:lt',
|
||||
'LV:en',
|
||||
'LV:lv',
|
||||
'MA:fr',
|
||||
'MX:es-419',
|
||||
'MY:en',
|
||||
'NA:en',
|
||||
'NG:en',
|
||||
'NL:nl',
|
||||
'NO:no',
|
||||
'NZ:en',
|
||||
'PE:es-419',
|
||||
'PH:en',
|
||||
'PK:en',
|
||||
'PL:pl',
|
||||
'PT:pt-150',
|
||||
'RO:ro',
|
||||
'RS:sr',
|
||||
'RU:ru',
|
||||
'SA:ar',
|
||||
'SE:sv',
|
||||
'SG:en',
|
||||
'SI:sl',
|
||||
'SK:sk',
|
||||
'SN:fr',
|
||||
'TH:th',
|
||||
'TR:tr',
|
||||
'TW:zh-Hant',
|
||||
'TZ:en',
|
||||
'UA:ru',
|
||||
'UA:uk',
|
||||
'UG:en',
|
||||
'US:en',
|
||||
'US:es-419',
|
||||
'VE:es-419',
|
||||
'VN:vi',
|
||||
'ZA:en',
|
||||
'ZW:en',
|
||||
]
|
||||
"""List of region/language combinations supported by Google News. Values of the
|
||||
``ceid`` argument of the Google News REST API."""
|
||||
|
||||
|
||||
_skip_values = [
|
||||
'ET:en', # english (ethiopia)
|
||||
'ID:en', # english (indonesia)
|
||||
'LV:en', # english (latvia)
|
||||
]
|
||||
|
||||
_ceid_locale_map = {'NO:no': 'nb-NO'}
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
_fetch_traits(engine_traits, add_domains=False)
|
||||
|
||||
engine_traits.custom['ceid'] = {}
|
||||
|
||||
for ceid in ceid_list:
|
||||
if ceid in _skip_values:
|
||||
continue
|
||||
|
||||
region, lang = ceid.split(':')
|
||||
x = lang.split('-')
|
||||
if len(x) > 1:
|
||||
if x[1] not in ['Hant', 'Hans']:
|
||||
lang = x[0]
|
||||
|
||||
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
|
||||
try:
|
||||
locale = babel.Locale.parse(sxng_locale, sep='-')
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
|
||||
continue
|
||||
|
||||
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
|
||||
115
searx/engines/google_play.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Google Play Apps & Google Play Movies
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
extract_url,
|
||||
extract_text,
|
||||
eval_xpath_list,
|
||||
eval_xpath_getindex,
|
||||
)
|
||||
|
||||
about = {
|
||||
"website": "https://play.google.com/",
|
||||
"wikidata_id": "Q79576",
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "HTML",
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
play_categ = None # apps|movies
|
||||
base_url = 'https://play.google.com'
|
||||
search_url = base_url + "/store/search?{query}&c={play_categ}"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
if play_categ not in ('movies', 'apps'):
|
||||
raise ValueError(f"unknown google play category: {play_categ}")
|
||||
|
||||
params["url"] = search_url.format(
|
||||
query=urlencode({"q": query}),
|
||||
play_categ=play_categ,
|
||||
)
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
|
||||
return params
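# Illustration only: with play_categ = 'apps' and the query "maps" the URL
# assembled above is
#
#   https://play.google.com/store/search?q=maps&c=apps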
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
if play_categ == 'movies':
|
||||
return response_movies(resp)
|
||||
if play_categ == 'apps':
|
||||
return response_apps(resp)
|
||||
|
||||
raise ValueError(f"Unsupported play category: {play_categ}")
|
||||
|
||||
|
||||
def response_movies(resp):
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for section in eval_xpath(dom, '//c-wiz/section/header/..'):
|
||||
sec_name = extract_text(eval_xpath(section, './header'))
|
||||
for item in eval_xpath(section, './/a'):
|
||||
url = base_url + item.get('href')
|
||||
div_1, div_2 = eval_xpath(item, './div')[:2]
|
||||
title = extract_text(eval_xpath(div_2, './div[@title]'))
|
||||
metadata = extract_text(eval_xpath(div_2, './div[@class]'))
|
||||
img = eval_xpath(div_1, './/img')[0]
|
||||
thumbnail = img.get('src')
|
||||
results.append(
|
||||
{
|
||||
"url": url,
|
||||
"title": title,
|
||||
"content": sec_name,
|
||||
"thumbnail": thumbnail,
|
||||
'metadata': metadata,
|
||||
'template': 'videos.html',
|
||||
}
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def response_apps(resp):
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
if eval_xpath(dom, '//div[@class="v6DsQb"]'):
|
||||
return []
|
||||
|
||||
spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
|
||||
if spot is not None:
|
||||
url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
|
||||
title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
|
||||
content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
|
||||
img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
|
||||
|
||||
results.append({"url": url, "title": title, "content": content, "img_src": img})
|
||||
|
||||
more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
|
||||
for result in more:
|
||||
url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
|
||||
title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
|
||||
content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
|
||||
img = extract_text(
|
||||
eval_xpath(
|
||||
result,
|
||||
'.//img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
|
||||
)
|
||||
)
|
||||
|
||||
results.append({"url": url, "title": title, "content": content, "img_src": img})
|
||||
|
||||
for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
|
||||
results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
|
||||
|
||||
return results
|
||||
221
searx/engines/google_scholar.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Google Scholar engine.
|
||||
|
||||
Compared to other Google services the Scholar engine has a simple GET REST-API
|
||||
and there does not exist an `async` API. Even though the API is slightly vintage, we
|
||||
can make use of the :ref:`google API` to assemble the arguments of the GET
|
||||
request.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Optional
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_getindex,
|
||||
eval_xpath_list,
|
||||
extract_text,
|
||||
)
|
||||
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://scholar.google.com',
|
||||
"wikidata_id": 'Q494817',
|
||||
"official_api_documentation": 'https://developers.google.com/custom-search',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['science', 'scientific publications']
|
||||
paging = True
|
||||
max_page = 50
|
||||
"""`Google max 50 pages`_
|
||||
|
||||
.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
|
||||
"""
|
||||
language_support = True
|
||||
time_range_support = True
|
||||
safesearch = False
|
||||
send_accept_language_header = True
|
||||
|
||||
|
||||
def time_range_args(params):
|
||||
"""Returns a dictionary with a time range arguments based on
|
||||
``params['time_range']``.
|
||||
|
||||
Google Scholar supports a detailed search by year. Searching by *last
|
||||
month* or *last week* (as offered by SearXNG) is uncommon for scientific
|
||||
publications and is not supported by Google Scholar.
|
||||
|
||||
To limit the result list when the user selects a range, all the SearXNG
|
||||
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
|
||||
is set, an empty dictionary of arguments is returned. Example: when the
|
||||
user selects a time range (current year minus one in 2022):
|
||||
|
||||
.. code:: python
|
||||
|
||||
{ 'as_ylo' : 2021 }
|
||||
|
||||
"""
|
||||
ret_val = {}
|
||||
if params['time_range'] in time_range_dict:
|
||||
ret_val['as_ylo'] = datetime.now().year - 1
|
||||
return ret_val
|
||||
|
||||
|
||||
def detect_google_captcha(dom):
|
||||
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
|
||||
not redirected to ``sorry.google.com``.
|
||||
"""
|
||||
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
|
||||
raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-Scholar search request"""
|
||||
|
||||
google_info = get_google_info(params, traits)
|
||||
# subdomain is: scholar.google.xy
|
||||
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
|
||||
|
||||
args = {
|
||||
'q': query,
|
||||
**google_info['params'],
|
||||
'start': (params['pageno'] - 1) * 10,
|
||||
'as_sdt': '2007', # include patents / to disable set '0,5'
|
||||
'as_vis': '0', # include citations / to disable set '1'
|
||||
}
|
||||
args.update(time_range_args(params))
|
||||
|
||||
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
def parse_gs_a(text: Optional[str]):
|
||||
"""Parse the text written in green.
|
||||
|
||||
Possible formats:
|
||||
* "{authors} - {journal}, {year} - {publisher}"
|
||||
* "{authors} - {year} - {publisher}"
|
||||
* "{authors} - {publisher}"
|
||||
"""
|
||||
if text is None or text == "":
|
||||
return None, None, None, None
|
||||
|
||||
s_text = text.split(' - ')
|
||||
authors = s_text[0].split(', ')
|
||||
publisher = s_text[-1]
|
||||
if len(s_text) != 3:
|
||||
return authors, None, publisher, None
|
||||
|
||||
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
|
||||
# get journal and year
|
||||
journal_year = s_text[1].split(', ')
|
||||
# journal is optional and may contain commas
|
||||
if len(journal_year) > 1:
|
||||
journal = ', '.join(journal_year[0:-1])
|
||||
if journal == '…':
|
||||
journal = None
|
||||
else:
|
||||
journal = None
|
||||
# year
|
||||
year = journal_year[-1]
|
||||
try:
|
||||
publishedDate = datetime.strptime(year.strip(), '%Y')
|
||||
except ValueError:
|
||||
publishedDate = None
|
||||
return authors, journal, publisher, publishedDate
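# Illustration only (made-up "green line"):
#
#   parse_gs_a("A Einstein, B Podolsky - Physical Review, 1935 - APS")
#   --> (['A Einstein', 'B Podolsky'], 'Physical Review', 'APS', datetime(1935, 1, 1))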
|
||||
|
||||
|
||||
def response(resp): # pylint: disable=too-many-locals
|
||||
"""Parse response from Google Scholar"""
|
||||
results = []
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
detect_google_captcha(dom)
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, '//div[@data-rp]'):
|
||||
|
||||
title = extract_text(eval_xpath(result, './/h3[1]//a'))
|
||||
|
||||
if not title:
|
||||
# this is a [ZITATION] block
|
||||
continue
|
||||
|
||||
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
|
||||
if pub_type:
|
||||
pub_type = pub_type[1:-1].lower()
|
||||
|
||||
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
|
||||
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
|
||||
authors, journal, publisher, publishedDate = parse_gs_a(
|
||||
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
|
||||
)
|
||||
if publisher in url:
|
||||
publisher = None
|
||||
|
||||
# cited by
|
||||
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
|
||||
|
||||
# link to the html or pdf document
|
||||
html_url = None
|
||||
pdf_url = None
|
||||
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
|
||||
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
|
||||
if doc_type == "[PDF]":
|
||||
pdf_url = doc_url
|
||||
else:
|
||||
html_url = doc_url
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'paper.html',
|
||||
'type': pub_type,
|
||||
'url': url,
|
||||
'title': title,
|
||||
'authors': authors,
|
||||
'publisher': publisher,
|
||||
'journal': journal,
|
||||
'publishedDate': publishedDate,
|
||||
'content': content,
|
||||
'comments': comments,
|
||||
'html_url': html_url,
|
||||
'pdf_url': pdf_url,
|
||||
}
|
||||
)
|
||||
|
||||
# parse suggestion
|
||||
for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
|
||||
# append suggestion
|
||||
results.append({'suggestion': extract_text(suggestion)})
|
||||
|
||||
for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
|
||||
results.append({'correction': extract_text(correction)})
|
||||
|
||||
return results
|
||||
153
searx/engines/google_videos.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This is the implementation of the Google Videos engine.
|
||||
|
||||
.. admonition:: Content-Security-Policy (CSP)
|
||||
|
||||
This engine needs to allow images from the `data URLs`_ (prefixed with the
|
||||
``data:`` scheme)::
|
||||
|
||||
Header set Content-Security-Policy "img-src 'self' data: ;"
|
||||
|
||||
.. _data URLs:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_list,
|
||||
eval_xpath_getindex,
|
||||
extract_text,
|
||||
)
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
filter_mapping,
|
||||
suggestion_xpath,
|
||||
detect_google_sorry,
|
||||
ui_async,
|
||||
parse_data_images,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.utils import get_embeded_stream_url
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.google.com',
|
||||
"wikidata_id": 'Q219885',
|
||||
"official_api_documentation": 'https://developers.google.com/custom-search',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
|
||||
categories = ['videos', 'web']
|
||||
paging = True
|
||||
max_page = 50
|
||||
"""`Google: max 50 pages`
|
||||
|
||||
.. _Google: max 50 pages: https://github.com/searxng/searxng/issues/2982
|
||||
"""
|
||||
language_support = True
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-Video search request"""
|
||||
|
||||
google_info = get_google_info(params, traits)
|
||||
start = (params['pageno'] - 1) * 10
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'tbm': "vid",
|
||||
'start': 10 * params['pageno'],
|
||||
**google_info['params'],
|
||||
'asearch': 'arc',
|
||||
'async': ui_async(start),
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
if params['time_range'] in time_range_dict:
|
||||
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
|
||||
if 'safesearch' in params:
|
||||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from google's search request"""
|
||||
results = []
|
||||
|
||||
detect_google_sorry(resp)
|
||||
data_image_map = parse_data_images(resp.text)
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
|
||||
|
||||
thumbnail = eval_xpath_getindex(result, './/img/@src', 0, None)
|
||||
if thumbnail:
|
||||
if thumbnail.startswith('data:image'):
|
||||
img_id = eval_xpath_getindex(result, './/img/@id', 0, None)
|
||||
if img_id:
|
||||
thumbnail = data_image_map.get(img_id)
|
||||
else:
|
||||
thumbnail = None
|
||||
|
||||
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
|
||||
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
|
||||
|
||||
c_node = eval_xpath_getindex(result, './/div[contains(@class, "ITZIwc")]', 0)
|
||||
content = extract_text(c_node)
|
||||
pub_info = extract_text(eval_xpath(result, './/div[contains(@class, "gqF9jc")]'))
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'author': pub_info,
|
||||
'thumbnail': thumbnail,
|
||||
'iframe_src': get_embeded_stream_url(url),
|
||||
'template': 'videos.html',
|
||||
}
|
||||
)
|
||||
|
||||
# parse suggestion
|
||||
for suggestion in eval_xpath_list(dom, suggestion_xpath):
|
||||
# append suggestion
|
||||
results.append({'suggestion': extract_text(suggestion)})
|
||||
|
||||
return results
|
||||
94
searx/engines/hackernews.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Hackernews
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
from flask_babel import gettext
|
||||
|
||||
# Engine metadata
|
||||
about = {
|
||||
"website": "https://news.ycombinator.com/",
|
||||
"wikidata_id": "Q686797",
|
||||
"official_api_documentation": "https://hn.algolia.com/api",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
# Engine configuration
|
||||
paging = True
|
||||
time_range_support = True
|
||||
categories = ["it"]
|
||||
results_per_page = 30
|
||||
|
||||
# Search URL
|
||||
base_url = "https://hn.algolia.com/api/v1"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
search_type = 'search'
|
||||
if not query:
|
||||
# if search query is empty show results from HN's front page
|
||||
search_type = 'search_by_date'
|
||||
query_params = {
|
||||
"tags": "front_page",
|
||||
"page": (params["pageno"] - 1),
|
||||
}
|
||||
else:
|
||||
query_params = {
|
||||
"query": query,
|
||||
"page": (params["pageno"] - 1),
|
||||
"hitsPerPage": results_per_page,
|
||||
"minWordSizefor1Typo": 4,
|
||||
"minWordSizefor2Typos": 8,
|
||||
"advancedSyntax": "true",
|
||||
"ignorePlurals": "false",
|
||||
"minProximity": 7,
|
||||
"numericFilters": '[]',
|
||||
"tagFilters": '["story",[]]',
|
||||
"typoTolerance": "true",
|
||||
"queryType": "prefixLast",
|
||||
"restrictSearchableAttributes": '["title","comment_text","url","story_text","author"]',
|
||||
"getRankingInfo": "true",
|
||||
}
|
||||
|
||||
if params['time_range']:
|
||||
search_type = 'search_by_date'
|
||||
timestamp = (
|
||||
# pylint: disable=unexpected-keyword-arg
|
||||
datetime.now()
|
||||
- relativedelta(**{f"{params['time_range']}s": 1}) # type: ignore
|
||||
).timestamp()
|
||||
query_params["numericFilters"] = f"created_at_i>{timestamp}"
|
||||
|
||||
params["url"] = f"{base_url}/{search_type}?{urlencode(query_params)}"
|
||||
return params
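# Illustration (hypothetical values): a non-empty query such as "rust" with
# time_range == "week" is sent to the Algolia endpoint roughly as
#   https://hn.algolia.com/api/v1/search_by_date?query=rust&page=0&...&numericFilters=created_at_i%3E1700000000
# i.e. the relativedelta above only computes the epoch cut-off that Algolia
# filters on via ``created_at_i``.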
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
data = resp.json()
|
||||
|
||||
for hit in data["hits"]:
|
||||
object_id = hit["objectID"]
|
||||
points = hit.get("points") or 0
|
||||
num_comments = hit.get("num_comments") or 0
|
||||
|
||||
metadata = ""
|
||||
if points != 0 or num_comments != 0:
|
||||
metadata = f"{gettext('points')}: {points}" f" | {gettext('comments')}: {num_comments}"
|
||||
results.append(
|
||||
{
|
||||
"title": hit.get("title") or f"{gettext('author')}: {hit['author']}",
|
||||
"url": f"https://news.ycombinator.com/item?id={object_id}",
|
||||
"content": hit.get("url") or hit.get("comment_text") or hit.get("story_text") or "",
|
||||
"metadata": metadata,
|
||||
"author": hit["author"],
|
||||
"publishedDate": datetime.fromtimestamp(hit["created_at_i"]),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
81
searx/engines/hex.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""hex.pm"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
|
||||
about = {
|
||||
# pylint: disable=line-too-long
|
||||
"website": "https://hex.pm/",
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": "https://github.com/hexpm/hexpm/blob/main/lib/hexpm_web/controllers/api/package_controller.ex",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
categories = ["it", "packages"]
|
||||
|
||||
|
||||
# engine dependent config
|
||||
paging = True
|
||||
search_url = "https://hex.pm/api/packages/"
|
||||
# Valid values: name inserted_at updated_at total_downloads recent_downloads
|
||||
sort_criteria = "recent_downloads"
|
||||
page_size = 10
|
||||
|
||||
linked_terms = {
|
||||
# lower-case : replacement
|
||||
"author": "Author",
|
||||
"bitbucket": "Bitbucket",
|
||||
"bug tracker": "Issue tracker",
|
||||
"changelog": "Changelog",
|
||||
"doc": "Documentation",
|
||||
"docs": "Documentation",
|
||||
"documentation": "Documentation",
|
||||
"github repository": "GitHub",
|
||||
"github": "GitHub",
|
||||
"gitlab": "GitLab",
|
||||
"issues": "Issue tracker",
|
||||
"project source code": "Source code",
|
||||
"repository": "Source code",
|
||||
"scm": "Source code",
|
||||
"sourcehut": "SourceHut",
|
||||
"sources": "Source code",
|
||||
"sponsor": "Sponsors",
|
||||
"sponsors": "Sponsors",
|
||||
"website": "Homepage",
|
||||
}
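# Illustration: with the mapping above, a package whose hex.pm metadata links
# are {"GitHub": "https://github.com/...", "Docs": "https://hexdocs.pm/..."}
# is rendered as {"GitHub": ..., "Documentation": ...}; labels that are not in
# ``linked_terms`` are passed through unchanged (see the dict comprehension in
# response() below).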
|
||||
|
||||
|
||||
def request(query: str, params):
|
||||
args = urlencode({"page": params["pageno"], "per_page": page_size, "sort": sort_criteria, "search": query})
|
||||
params["url"] = f"{search_url}?{args}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
for package in resp.json():
|
||||
meta = package["meta"]
|
||||
published_date = package.get("updated_at")
|
||||
published_date = parser.parse(published_date)
|
||||
links = {linked_terms.get(k.lower(), k): v for k, v in meta.get("links").items()}
|
||||
results.append(
|
||||
{
|
||||
"template": "packages.html",
|
||||
"url": package["html_url"],
|
||||
"title": package["name"],
|
||||
"package_name": package["name"],
|
||||
"content": meta.get("description", ""),
|
||||
"version": meta.get("latest_version"),
|
||||
"maintainer": ", ".join(meta.get("maintainers", [])),
|
||||
"publishedDate": published_date,
|
||||
"license_name": ", ".join(meta.get("licenses", [])),
|
||||
"homepage": package["docs_html_url"],
|
||||
"links": links,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
116
searx/engines/huggingface.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""`Hugging Face`_ search engine for SearXNG.
|
||||
|
||||
.. _Hugging Face: https://huggingface.co
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
The engine has the following additional settings:
|
||||
|
||||
- :py:obj:`huggingface_endpoint`
|
||||
|
||||
Configurations for endpoints:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: huggingface
|
||||
engine: huggingface
|
||||
shortcut: hf
|
||||
|
||||
- name: huggingface datasets
|
||||
huggingface_endpoint: datasets
|
||||
engine: huggingface
|
||||
shortcut: hfd
|
||||
|
||||
- name: huggingface spaces
|
||||
huggingface_endpoint: spaces
|
||||
engine: huggingface
|
||||
shortcut: hfs
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.utils import html_to_text
|
||||
from searx.result_types import EngineResults, MainResult
|
||||
|
||||
about = {
|
||||
"website": "https://huggingface.co/",
|
||||
"wikidata_id": "Q108943604",
|
||||
"official_api_documentation": "https://huggingface.co/docs/hub/en/api",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
categories = ['it', 'repos']
|
||||
|
||||
base_url = "https://huggingface.co"
|
||||
|
||||
huggingface_endpoint = 'models'
|
||||
"""Hugging Face supports datasets, models, spaces as search endpoint.
|
||||
|
||||
- ``datasets``: search for datasets
|
||||
- ``models``: search for models
|
||||
- ``spaces``: search for spaces
|
||||
"""
|
||||
|
||||
|
||||
def init(_):
|
||||
if huggingface_endpoint not in ('datasets', 'models', 'spaces'):
|
||||
raise SearxEngineAPIException(f"Unsupported Hugging Face endpoint: {huggingface_endpoint}")
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query_params = {
|
||||
"direction": -1,
|
||||
"search": query,
|
||||
}
|
||||
|
||||
params["url"] = f"{base_url}/api/{huggingface_endpoint}?{urlencode(query_params)}"
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
results = EngineResults()
|
||||
|
||||
data = resp.json()
|
||||
|
||||
for entry in data:
|
||||
if huggingface_endpoint != 'models':
|
||||
url = f"{base_url}/{huggingface_endpoint}/{entry['id']}"
|
||||
else:
|
||||
url = f"{base_url}/{entry['id']}"
|
||||
|
||||
published_date = None
|
||||
try:
|
||||
published_date = datetime.strptime(entry["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
contents = []
|
||||
if entry.get("likes"):
|
||||
contents.append(f"Likes: {entry['likes']}")
|
||||
if entry.get("downloads"):
|
||||
contents.append(f"Downloads: {entry['downloads']:,}")
|
||||
if entry.get("tags"):
|
||||
contents.append(f"Tags: {', '.join(entry['tags'])}")
|
||||
if entry.get("description"):
|
||||
contents.append(f"Description: {entry['description']}")
|
||||
|
||||
item = MainResult(
|
||||
title=entry["id"],
|
||||
content=html_to_text(" | ".join(contents)),
|
||||
url=url,
|
||||
publishedDate=published_date,
|
||||
)
|
||||
results.add(item)
|
||||
|
||||
return results
|
||||
71
searx/engines/il_post.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Engine for Il Post, a largely independent online Italian newspaper.
|
||||
|
||||
To use this engine add the following entry to your engines
|
||||
list in ``settings.yml``:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: il post
|
||||
engine: il_post
|
||||
shortcut: pst
|
||||
disabled: false
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
engine_type = "online"
|
||||
language_support = False
|
||||
categories = ["news"]
|
||||
paging = True
|
||||
page_size = 10
|
||||
|
||||
time_range_support = True
|
||||
time_range_args = {"month": "pub_date:ultimi_30_giorni", "year": "pub_date:ultimo_anno"}
|
||||
|
||||
search_api = "https://api.ilpost.org/search/api/site_search/?"
|
||||
|
||||
about = {
|
||||
"website": "https://www.ilpost.it",
|
||||
"wikidata_id": "Q3792882",
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
"language": "it",
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query_params = {
|
||||
"qs": query,
|
||||
"pg": params["pageno"],
|
||||
"sort": "date_d",
|
||||
"filters": "ctype:articoli",
|
||||
}
|
||||
|
||||
if params["time_range"]:
|
||||
if params["time_range"] not in time_range_args:
|
||||
return None
|
||||
query_params["filters"] += f";{time_range_args.get(params['time_range'], 'pub_date:da_sempre')}"
|
||||
params["url"] = search_api + urlencode(query_params)
|
||||
return params
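# Illustration: for time_range == "month" the ``filters`` value sent to the API
# becomes "ctype:articoli;pub_date:ultimi_30_giorni"; "day" and "week" are not
# in ``time_range_args``, so request() returns None and no request is made.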
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
res = EngineResults()
|
||||
json_data = resp.json()
|
||||
|
||||
for result in json_data["docs"]:
|
||||
res.add(
|
||||
res.types.MainResult(
|
||||
url=result["link"],
|
||||
title=result["title"],
|
||||
content=result.get("summary", ""),
|
||||
thumbnail=result.get("image"),
|
||||
)
|
||||
)
|
||||
|
||||
return res
|
||||
97
searx/engines/imdb.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""IMDB - Internet Movie Database
|
||||
|
||||
Retrieves results from a basic search. Advanced search options are not
|
||||
supported. IMDB's API is undocumented, here are some posts about:
|
||||
|
||||
- https://stackoverflow.com/questions/1966503/does-imdb-provide-an-api
|
||||
- https://rapidapi.com/blog/how-to-use-imdb-api/
|
||||
|
||||
An alternative that needs IMDPro_ is `IMDb and Box Office Mojo
|
||||
<https://developer.imdb.com/documentation>`_
|
||||
|
||||
.. __IMDPro: https://pro.imdb.com/login
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
about = {
|
||||
"website": 'https://imdb.com/',
|
||||
"wikidata_id": 'Q37312',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ["movies"]
|
||||
paging = False
|
||||
|
||||
# suggestion_url = "https://sg.media-imdb.com/suggestion/{letter}/{query}.json"
|
||||
suggestion_url = "https://v2.sg.media-imdb.com/suggestion/{letter}/{query}.json"
|
||||
|
||||
href_base = 'https://imdb.com/{category}/{entry_id}'
|
||||
|
||||
search_categories = {"nm": "name", "tt": "title", "kw": "keyword", "co": "company", "ep": "episode"}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
query = query.replace(" ", "_").lower()
|
||||
params['url'] = suggestion_url.format(letter=query[0], query=query)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
suggestions = json.loads(resp.text)
|
||||
results = []
|
||||
|
||||
for entry in suggestions.get('d', []):
|
||||
|
||||
# https://developer.imdb.com/documentation/key-concepts#imdb-ids
|
||||
entry_id = entry['id']
|
||||
categ = search_categories.get(entry_id[:2])
|
||||
if categ is None:
|
||||
logger.error('skip unknown category tag %s in %s', entry_id[:2], entry_id)
|
||||
continue
|
||||
|
||||
title = entry['l']
|
||||
if 'q' in entry:
|
||||
title += " (%s)" % entry['q']
|
||||
|
||||
content = ''
|
||||
if 'rank' in entry:
|
||||
content += "(%s) " % entry['rank']
|
||||
if 'y' in entry:
|
||||
content += str(entry['y']) + " - "
|
||||
if 's' in entry:
|
||||
content += entry['s']
|
||||
|
||||
# imageUrl is the image itself, it is not a thumb!
|
||||
image_url = entry.get('i', {}).get('imageUrl')
|
||||
if image_url:
|
||||
# get thumbnail
|
||||
image_url_name, image_url_prefix = image_url.rsplit('.', 1)
|
||||
# recipe to get the magic value:
|
||||
# * search on imdb.com, look at the URL of the thumbnail on the right side of the screen
|
||||
# * search using the imdb engine, compare the imageUrl and thumbnail URL
|
||||
# QL75 : JPEG quality (?)
|
||||
# UX280 : resize to width 280
|
||||
# 280,414 : size of the image (add white border)
|
||||
magic = 'QL75_UX280_CR0,0,280,414_'
|
||||
if not image_url_name.endswith('_V1_'):
|
||||
magic = '_V1_' + magic
|
||||
image_url = image_url_name + magic + '.' + image_url_prefix
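# Illustration (made-up image id): an imageUrl such as
#   https://m.media-amazon.com/images/M/MV5Babc._V1_.jpg
# is rewritten by the steps above to the thumbnail
#   https://m.media-amazon.com/images/M/MV5Babc._V1_QL75_UX280_CR0,0,280,414_.jpg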
|
||||
results.append(
|
||||
{
|
||||
"title": title,
|
||||
"url": href_base.format(category=categ, entry_id=entry_id),
|
||||
"content": content,
|
||||
"thumbnail": image_url,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
65
searx/engines/imgur.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Imgur (images)
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||
|
||||
about = {
|
||||
"website": 'https://imgur.com/',
|
||||
"wikidata_id": 'Q355022',
|
||||
"official_api_documentation": 'https://api.imgur.com/',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['images']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
|
||||
base_url = "https://imgur.com"
|
||||
|
||||
results_xpath = "//div[contains(@class, 'cards')]/div[contains(@class, 'post')]"
|
||||
url_xpath = "./a/@href"
|
||||
title_xpath = "./a/img/@alt"
|
||||
thumbnail_xpath = "./a/img/@src"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
time_range = params['time_range'] or 'all'
|
||||
args = {
|
||||
'q': query,
|
||||
'qs': 'thumbs',
|
||||
'p': params['pageno'] - 1,
|
||||
}
|
||||
params['url'] = f"{base_url}/search/score/{time_range}?{urlencode(args)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in eval_xpath_list(dom, results_xpath):
|
||||
thumbnail_src = extract_text(eval_xpath(result, thumbnail_xpath))
|
||||
img_src = thumbnail_src.replace("b.", ".")
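# Illustration: imgur thumbnail URLs carry a size-suffix letter before the
# extension, e.g. https://i.imgur.com/abc123b.jpg; dropping the "b." above
# yields the full-size image https://i.imgur.com/abc123.jpg (an assumption
# about imgur's URL scheme, not something stated in this code).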
|
||||
|
||||
# that's a bug at imgur's side:
|
||||
# sometimes there's just no preview image, hence we skip the image
|
||||
if len(thumbnail_src) < 25:
|
||||
continue
|
||||
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': base_url + extract_text(eval_xpath(result, url_xpath)),
|
||||
'title': extract_text(eval_xpath(result, title_xpath)),
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
75
searx/engines/ina.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
INA (Videos)
|
||||
"""
|
||||
|
||||
from html import unescape
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://www.ina.fr/',
|
||||
"wikidata_id": 'Q1665109',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
"language": 'fr',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos']
|
||||
paging = True
|
||||
page_size = 12
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.ina.fr'
|
||||
search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = '//div[@id="searchHits"]/div'
|
||||
url_xpath = './/a/@href'
|
||||
title_xpath = './/div[contains(@class,"title-bloc-small")]'
|
||||
content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
|
||||
thumbnail_xpath = './/img/@data-src'
|
||||
publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
# the ajax endpoint answers with an HTML fragment; parse it directly
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, results_xpath):
|
||||
url_relative = eval_xpath_getindex(result, url_xpath, 0)
|
||||
url = base_url + url_relative
|
||||
title = unescape(extract_text(eval_xpath(result, title_xpath)))
|
||||
thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
|
||||
content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
|
||||
eval_xpath(result, content_xpath)
|
||||
)
|
||||
|
||||
# append result
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'template': 'videos.html',
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
)
|
||||
|
||||
# return results
|
||||
return results
|
||||
118
searx/engines/invidious.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Invidious (Videos)
|
||||
|
||||
If you want to use invidious with SearXNG you should setup one locally.
|
||||
No public instance offer a public API now
|
||||
|
||||
- https://github.com/searxng/searxng/issues/2722#issuecomment-2884993248
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import random
|
||||
from urllib.parse import quote_plus, urlparse
|
||||
from dateutil import parser
|
||||
|
||||
from searx.utils import humanize_number
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://api.invidious.io/',
|
||||
"wikidata_id": 'Q79343316',
|
||||
"official_api_documentation": 'https://docs.invidious.io/api/',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ["videos", "music"]
|
||||
paging = True
|
||||
time_range_support = True
|
||||
|
||||
# base_url can be overwritten by a list of URLs in the settings.yml
|
||||
base_url: list | str = []
|
||||
|
||||
|
||||
def init(_):
|
||||
if not base_url:
|
||||
raise ValueError("missing invidious base_url")
|
||||
|
||||
|
||||
def request(query, params):
|
||||
time_range_dict = {
|
||||
"day": "today",
|
||||
"week": "week",
|
||||
"month": "month",
|
||||
"year": "year",
|
||||
}
|
||||
|
||||
if isinstance(base_url, list):
|
||||
params["base_url"] = random.choice(base_url)
|
||||
else:
|
||||
params["base_url"] = base_url
|
||||
|
||||
search_url = params["base_url"] + "/api/v1/search?q={query}"
|
||||
params["url"] = search_url.format(query=quote_plus(query)) + "&page={pageno}".format(pageno=params["pageno"])
|
||||
|
||||
if params["time_range"] in time_range_dict:
|
||||
params["url"] += "&date={timerange}".format(timerange=time_range_dict[params["time_range"]])
|
||||
|
||||
if params["language"] != "all":
|
||||
lang = params["language"].split("-")
|
||||
if len(lang) == 2:
|
||||
params["url"] += "&range={lrange}".format(lrange=lang[1])
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_results = resp.json()
|
||||
base_invidious_url = resp.search_params['base_url'] + "/watch?v="
|
||||
|
||||
for result in search_results:
|
||||
rtype = result.get("type", None)
|
||||
if rtype == "video":
|
||||
videoid = result.get("videoId", None)
|
||||
if not videoid:
|
||||
continue
|
||||
|
||||
url = base_invidious_url + videoid
|
||||
thumbs = result.get("videoThumbnails", [])
|
||||
thumb = next((th for th in thumbs if th["quality"] == "sddefault"), None)
|
||||
if thumb:
|
||||
thumbnail = thumb.get("url", "")
|
||||
else:
|
||||
thumbnail = ""
|
||||
|
||||
# some instances return a partial thumbnail url
|
||||
# we check if the url is partial, and prepend the base_url if it is
|
||||
if thumbnail and not urlparse(thumbnail).netloc:
|
||||
thumbnail = resp.search_params['base_url'] + thumbnail
|
||||
|
||||
publishedDate = parser.parse(time.ctime(result.get("published", 0)))
|
||||
length = time.gmtime(result.get("lengthSeconds"))
|
||||
if length.tm_hour:
|
||||
length = time.strftime("%H:%M:%S", length)
|
||||
else:
|
||||
length = time.strftime("%M:%S", length)
|
||||
|
||||
results.append(
|
||||
{
|
||||
"url": url,
|
||||
"title": result.get("title", ""),
|
||||
"content": result.get("description", ""),
|
||||
"length": length,
|
||||
"views": humanize_number(result['viewCount']),
|
||||
"template": "videos.html",
|
||||
"author": result.get("author"),
|
||||
"publishedDate": publishedDate,
|
||||
"iframe_src": resp.search_params['base_url'] + '/embed/' + videoid,
|
||||
"thumbnail": thumbnail,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
76
searx/engines/ipernity.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Ipernity (images)"""
|
||||
|
||||
from datetime import datetime
|
||||
from json import loads, JSONDecodeError
|
||||
|
||||
from urllib.parse import quote_plus
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import extr, extract_text, eval_xpath, eval_xpath_list
|
||||
|
||||
about = {
|
||||
'website': 'https://www.ipernity.com',
|
||||
'official_api_documentation': 'https://www.ipernity.com/help/api',
|
||||
'use_official_api': False,
|
||||
'require_api_key': False,
|
||||
'results': 'HTML',
|
||||
}
|
||||
|
||||
paging = True
|
||||
categories = ['images']
|
||||
|
||||
|
||||
base_url = 'https://www.ipernity.com'
|
||||
page_size = 10
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = f"{base_url}/search/photo/@/page:{params['pageno']}:{page_size}?q={quote_plus(query)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
doc = html.fromstring(resp.text)
|
||||
|
||||
images = eval_xpath_list(doc, '//a[starts-with(@href, "/doc")]//img')
|
||||
|
||||
result_index = 0
|
||||
for result in eval_xpath_list(doc, '//script[@type="text/javascript"]'):
|
||||
info_js = extr(extract_text(result), '] = ', '};') + '}'
|
||||
|
||||
if not info_js:
|
||||
continue
|
||||
|
||||
try:
|
||||
info_item = loads(info_js)
|
||||
|
||||
if not info_item.get('mediakey'):
|
||||
continue
|
||||
|
||||
thumbnail_src = extract_text(eval_xpath(images[result_index], './@src'))
|
||||
img_src = thumbnail_src.replace('240.jpg', '640.jpg')
|
||||
|
||||
resolution = None
|
||||
if info_item.get("width") and info_item.get("height"):
|
||||
resolution = f'{info_item["width"]}x{info_item["height"]}'
|
||||
|
||||
item = {
|
||||
'template': 'images.html',
|
||||
'url': f"{base_url}/doc/{info_item['user_id']}/{info_item['doc_id']}",
|
||||
'title': info_item.get('title'),
|
||||
'content': info_item.get('content', ''),
|
||||
'resolution': resolution,
|
||||
'publishedDate': datetime.fromtimestamp(int(info_item['posted_at'])),
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'img_src': img_src,
|
||||
}
|
||||
results.append(item)
|
||||
|
||||
result_index += 1
|
||||
except JSONDecodeError:
|
||||
continue
|
||||
|
||||
return results
|
||||
72
searx/engines/iqiyi.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""iQiyi: A search engine for retrieving videos from iQiyi."""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.utils import parse_duration_string
|
||||
|
||||
about = {
|
||||
"website": "https://www.iqiyi.com/",
|
||||
"wikidata_id": "Q15913890",
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "JSON",
|
||||
"language": "zh",
|
||||
}
|
||||
|
||||
paging = True
|
||||
time_range_support = True
|
||||
categories = ["videos"]
|
||||
|
||||
time_range_dict = {'day': '1', 'week': '2', 'month': '3'}
|
||||
|
||||
base_url = "https://mesh.if.iqiyi.com"
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query_params = {"key": query, "pageNum": params["pageno"], "pageSize": 25}
|
||||
|
||||
if time_range_dict.get(params['time_range']):
|
||||
query_params["sitePublishDate"] = time_range_dict[params['time_range']]
|
||||
|
||||
params["url"] = f"{base_url}/portal/lw/search/homePageV3?{urlencode(query_params)}"
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
try:
|
||||
data = resp.json()
|
||||
except Exception as e:
|
||||
raise SearxEngineAPIException(f"Invalid response: {e}") from e
|
||||
results = []
|
||||
|
||||
if "data" not in data or "templates" not in data["data"]:
|
||||
raise SearxEngineAPIException("Invalid response")
|
||||
|
||||
for entry in data["data"]["templates"]:
|
||||
album_info = entry.get("albumInfo", {})
|
||||
|
||||
published_date = None
|
||||
release_time = album_info.get("releaseTime", {}).get("value")
|
||||
if release_time:
|
||||
try:
|
||||
published_date = datetime.strptime(release_time, "%Y-%m-%d")
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
length = parse_duration_string(album_info.get("subscriptionContent"))
|
||||
results.append(
|
||||
{
|
||||
'url': album_info.get("pageUrl", "").replace("http://", "https://"),
|
||||
'title': album_info.get("title", ""),
|
||||
'content': album_info.get("brief", {}).get("value", ""),
|
||||
'template': 'videos.html',
|
||||
'length': length,
|
||||
'publishedDate': published_date,
|
||||
'thumbnail': album_info.get("img", ""),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
137
searx/engines/jisho.py
Normal file
@@ -0,0 +1,137 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Jisho (the Japanese-English dictionary)
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode, urljoin
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://jisho.org',
|
||||
"wikidata_id": 'Q24568389',
|
||||
"official_api_documentation": "https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
"language": 'ja',
|
||||
}
|
||||
|
||||
categories = ['dictionaries']
|
||||
paging = False
|
||||
|
||||
URL = 'https://jisho.org'
|
||||
BASE_URL = 'https://jisho.org/word/'
|
||||
SEARCH_URL = URL + '/api/v1/search/words?{query}'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
query = urlencode({'keyword': query})
|
||||
params['url'] = SEARCH_URL.format(query=query)
|
||||
logger.debug(f"query_url --> {params['url']}")
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
first_result = True
|
||||
|
||||
search_results = resp.json()
|
||||
|
||||
for page in search_results.get('data', []):
|
||||
# Entries that are purely from Wikipedia are excluded.
|
||||
parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
|
||||
if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
|
||||
continue
|
||||
|
||||
# Process alternative forms
|
||||
alt_forms = []
|
||||
for title_raw in page['japanese']:
|
||||
if 'word' not in title_raw:
|
||||
alt_forms.append(title_raw['reading'])
|
||||
else:
|
||||
title = title_raw['word']
|
||||
if 'reading' in title_raw:
|
||||
title += ' (' + title_raw['reading'] + ')'
|
||||
alt_forms.append(title)
|
||||
|
||||
result_url = urljoin(BASE_URL, page['slug'])
|
||||
definitions = get_definitions(page)
|
||||
|
||||
# For results, we'll return the URL, all alternative forms (as title),
|
||||
# and all definitions (as description) truncated to 300 characters.
|
||||
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
|
||||
results.append(
|
||||
{'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')}
|
||||
)
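# Note: ``content[:300] + (content[300:] and '...')`` above truncates the
# joined definitions to 300 characters and appends an ellipsis only when
# something was actually cut off (an empty remainder is falsy).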
|
||||
|
||||
# Like Wordnik, we'll return the first result in an infobox too.
|
||||
if first_result:
|
||||
first_result = False
|
||||
results.append(get_infobox(alt_forms, result_url, definitions))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def get_definitions(page):
|
||||
# Process definitions
|
||||
definitions = []
|
||||
for defn_raw in page['senses']:
|
||||
extra = []
|
||||
# Extra data. Since they're not documented, this implementation is based solely by the author's assumptions.
|
||||
if defn_raw.get('tags'):
|
||||
if defn_raw.get('info'):
|
||||
# "usually written as kana: <kana>"
|
||||
extra.append(defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ')
|
||||
else:
|
||||
# abbreviation, archaism, etc.
|
||||
extra.append(', '.join(defn_raw['tags']) + '. ')
|
||||
elif defn_raw.get('info'):
|
||||
# inconsistent
|
||||
extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
|
||||
if defn_raw.get('restrictions'):
|
||||
extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
|
||||
definitions.append(
|
||||
(
|
||||
', '.join(defn_raw['parts_of_speech']),
|
||||
'; '.join(defn_raw['english_definitions']),
|
||||
''.join(extra)[:-1],
|
||||
)
|
||||
)
|
||||
return definitions
|
||||
|
||||
|
||||
def get_infobox(alt_forms, result_url, definitions):
|
||||
infobox_content = []
|
||||
# title & alt_forms
|
||||
infobox_title = alt_forms[0]
|
||||
if len(alt_forms) > 1:
|
||||
infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
|
||||
|
||||
# definitions
|
||||
infobox_content.append(
|
||||
'''
|
||||
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
|
||||
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
|
||||
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
|
||||
<ul>
|
||||
'''
|
||||
)
|
||||
for pos, engdef, extra in definitions:
|
||||
if pos == 'Wikipedia definition':
|
||||
infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
|
||||
pos = f'<i>{pos}</i>: ' if pos else ''
|
||||
extra = f' ({extra})' if extra else ''
|
||||
infobox_content.append(f'<li>{pos}{engdef}{extra}</li>')
|
||||
infobox_content.append('</ul>')
|
||||
|
||||
#
|
||||
return {
|
||||
'infobox': infobox_title,
|
||||
'content': ''.join(infobox_content),
|
||||
'urls': [
|
||||
{
|
||||
'title': 'Jisho.org',
|
||||
'url': result_url,
|
||||
}
|
||||
],
|
||||
}
|
||||
423
searx/engines/json_engine.py
Normal file
@@ -0,0 +1,423 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""The JSON engine is a *generic* engine with which it is possible to configure
|
||||
engines in the settings.
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
Request:
|
||||
|
||||
- :py:obj:`search_url`
|
||||
- :py:obj:`lang_all`
|
||||
- :py:obj:`soft_max_redirects`
|
||||
- :py:obj:`method`
|
||||
- :py:obj:`request_body`
|
||||
- :py:obj:`cookies`
|
||||
- :py:obj:`headers`
|
||||
|
||||
Paging:
|
||||
|
||||
- :py:obj:`paging`
|
||||
- :py:obj:`page_size`
|
||||
- :py:obj:`first_page_num`
|
||||
|
||||
Time Range:
|
||||
|
||||
- :py:obj:`time_range_support`
|
||||
- :py:obj:`time_range_url`
|
||||
- :py:obj:`time_range_map`
|
||||
|
||||
Safe-Search:
|
||||
|
||||
- :py:obj:`safe_search_support`
|
||||
- :py:obj:`safe_search_map`
|
||||
|
||||
Response:
|
||||
|
||||
- :py:obj:`title_html_to_text`
|
||||
- :py:obj:`content_html_to_text`
|
||||
- :py:obj:`no_result_for_http_status`
|
||||
|
||||
JSON query:
|
||||
|
||||
- :py:obj:`results_query`
|
||||
- :py:obj:`url_query`
|
||||
- :py:obj:`url_prefix`
|
||||
- :py:obj:`title_query`
|
||||
- :py:obj:`content_query`
|
||||
- :py:obj:`thumbnail_query`
|
||||
- :py:obj:`thumbnail_prefix`
|
||||
- :py:obj:`suggestion_query`
|
||||
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
Here is a simple example of a JSON engine configured in the :ref:`settings
|
||||
engines` section, further read :ref:`engines-dev`.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name : mdn
|
||||
engine : json_engine
|
||||
paging : True
|
||||
search_url : https://developer.mozilla.org/api/v1/search?q={query}&page={pageno}
|
||||
results_query : documents
|
||||
url_query : mdn_url
|
||||
url_prefix : https://developer.mozilla.org
|
||||
title_query : title
|
||||
content_query : summary
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import to_string, html_to_text
|
||||
from searx.network import raise_for_httperror
|
||||
|
||||
search_url = None
|
||||
"""
|
||||
Search URL of the engine. Example::
|
||||
|
||||
https://example.org/?search={query}&page={pageno}{time_range}{safe_search}
|
||||
|
||||
Replacements are:
|
||||
|
||||
``{query}``:
|
||||
Search terms from user.
|
||||
|
||||
``{pageno}``:
|
||||
Page number if engine supports paging :py:obj:`paging`
|
||||
|
||||
``{lang}``:
|
||||
ISO 639-1 language code (en, de, fr ..)
|
||||
|
||||
``{time_range}``:
|
||||
:py:obj:`URL parameter <time_range_url>` if engine :py:obj:`supports time
|
||||
range <time_range_support>`. The value for the parameter is taken from
|
||||
:py:obj:`time_range_map`.
|
||||
|
||||
``{safe_search}``:
|
||||
Safe-search :py:obj:`URL parameter <safe_search_map>` if engine
|
||||
:py:obj:`supports safe-search <safe_search_support>`. The ``{safe_search}``
|
||||
replacement is taken from the :py:obj:`safe_search_map`. Filter results::
|
||||
|
||||
0: none, 1: moderate, 2:strict
|
||||
|
||||
If not supported, the URL parameter is an empty string.
|
||||
|
||||
"""
|
||||
|
||||
lang_all = 'en'
|
||||
'''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is
|
||||
selected.
|
||||
'''
|
||||
|
||||
no_result_for_http_status = []
|
||||
'''Return empty result for these HTTP status codes instead of throwing an error.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
no_result_for_http_status: []
|
||||
'''
|
||||
|
||||
soft_max_redirects = 0
|
||||
'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
|
||||
|
||||
method = 'GET'
|
||||
'''Some engines might require to do POST requests for search.'''
|
||||
|
||||
request_body = ''
|
||||
'''The body of the request. This can only be used if different :py:obj:`method`
|
||||
is set, e.g. ``POST``. For formatting see the documentation of :py:obj:`search_url`.
|
||||
|
||||
Note: Curly brackets which aren't encapsulating a replacement placeholder
|
||||
must be escaped by doubling each ``{`` and ``}``.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
request_body: >-
|
||||
{{
|
||||
"search": "{query}",
|
||||
"page": {pageno},
|
||||
"extra": {{
|
||||
"time_range": {time_range},
|
||||
"rating": "{safe_search}"
|
||||
}}
|
||||
}}
|
||||
'''
|
||||
|
||||
cookies = {}
|
||||
'''Some engines might offer different result based on cookies.
|
||||
Possible use-case: To set safesearch cookie.'''
|
||||
|
||||
headers = {}
|
||||
'''Some engines might offer different result based on cookies or headers.
|
||||
Possible use-case: To set safesearch cookie or header to moderate.'''
|
||||
|
||||
paging = False
|
||||
'''Engine supports paging [True or False].'''
|
||||
|
||||
page_size = 1
|
||||
'''Number of results on each page. Only needed if the site requires not a page
|
||||
number, but an offset.'''
|
||||
|
||||
first_page_num = 1
|
||||
'''Number of the first page (usually 0 or 1).'''
|
||||
|
||||
results_query = ''
|
||||
'''JSON query for the list of result items.

The query string is a slash `/` separated path of JSON key names.
Array entries can be specified using the index or can be omitted entirely,
in which case every matching entry is considered.
'''
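# Illustration of the slash-separated JSON queries used below:
#   data = {"hits": {"items": [{"doc": {"url": "https://a"}},
#                              {"doc": {"url": "https://b"}}]}}
#   results_query = "hits/items"  -> the list of result items
#   url_query     = "doc/url"     -> evaluated per item: "https://a", "https://b"
# Omitting an array index (e.g. "hits/items" instead of "hits/items/0") makes
# query()/do_query() below collect every matching entry.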
|
||||
|
||||
url_query = None
|
||||
'''JSON query of result's ``url``. For the query string documentation see :py:obj:`results_query`'''
|
||||
|
||||
url_prefix = ""
|
||||
'''String to prepend to the result's ``url``.'''
|
||||
|
||||
title_query = None
|
||||
'''JSON query of result's ``title``. For the query string documentation see :py:obj:`results_query`'''
|
||||
|
||||
content_query = None
|
||||
'''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`'''
|
||||
|
||||
thumbnail_query = False
|
||||
'''JSON query of result's ``thumbnail``. For the query string documentation see :py:obj:`results_query`'''
|
||||
|
||||
thumbnail_prefix = ''
|
||||
'''String to prepend to the result's ``thumbnail``.'''
|
||||
|
||||
suggestion_query = ''
|
||||
'''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`'''
|
||||
|
||||
title_html_to_text = False
|
||||
'''Extract text from a HTML title string'''
|
||||
|
||||
content_html_to_text = False
|
||||
'''Extract text from a HTML content string'''
|
||||
|
||||
time_range_support = False
|
||||
'''Engine supports search time range.'''
|
||||
|
||||
time_range_url = '&hours={time_range_val}'
|
||||
'''Time range URL parameter in the in :py:obj:`search_url`. If no time range is
|
||||
requested by the user, the URL parameter is an empty string. The
|
||||
``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
time_range_url : '&days={time_range_val}'
|
||||
'''
|
||||
|
||||
time_range_map = {
|
||||
'day': 24,
|
||||
'week': 24 * 7,
|
||||
'month': 24 * 30,
|
||||
'year': 24 * 365,
|
||||
}
|
||||
'''Maps time range value from user to ``{time_range_val}`` in
|
||||
:py:obj:`time_range_url`.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
time_range_map:
|
||||
day: 1
|
||||
week: 7
|
||||
month: 30
|
||||
year: 365
|
||||
'''
|
||||
|
||||
safe_search_support = False
|
||||
'''Engine supports safe-search.'''
|
||||
|
||||
safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'}
|
||||
'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
safesearch: true
|
||||
safe_search_map:
|
||||
0: '&filter=none'
|
||||
1: '&filter=moderate'
|
||||
2: '&filter=strict'
|
||||
|
||||
'''
|
||||
|
||||
|
||||
def iterate(iterable):
|
||||
if isinstance(iterable, dict):
|
||||
items = iterable.items()
|
||||
|
||||
else:
|
||||
items = enumerate(iterable)
|
||||
for index, value in items:
|
||||
yield str(index), value
|
||||
|
||||
|
||||
def is_iterable(obj):
|
||||
if isinstance(obj, str):
|
||||
return False
|
||||
return isinstance(obj, Iterable)
|
||||
|
||||
|
||||
def parse(query): # pylint: disable=redefined-outer-name
|
||||
q = [] # pylint: disable=invalid-name
|
||||
for part in query.split('/'):
|
||||
if part == '':
|
||||
continue
|
||||
q.append(part)
|
||||
return q
|
||||
|
||||
|
||||
def do_query(data, q): # pylint: disable=invalid-name
|
||||
ret = []
|
||||
if not q:
|
||||
return ret
|
||||
|
||||
qkey = q[0]
|
||||
|
||||
for key, value in iterate(data):
|
||||
|
||||
if len(q) == 1:
|
||||
if key == qkey:
|
||||
ret.append(value)
|
||||
elif is_iterable(value):
|
||||
ret.extend(do_query(value, q))
|
||||
else:
|
||||
if not is_iterable(value):
|
||||
continue
|
||||
if key == qkey:
|
||||
ret.extend(do_query(value, q[1:]))
|
||||
else:
|
||||
ret.extend(do_query(value, q))
|
||||
return ret
|
||||
|
||||
|
||||
def query(data, query_string):
|
||||
q = parse(query_string)
|
||||
|
||||
return do_query(data, q)
|
||||
|
||||
|
||||
def request(query, params): # pylint: disable=redefined-outer-name
|
||||
'''Build request parameters (see :ref:`engine request`).'''
|
||||
lang = lang_all
|
||||
if params['language'] != 'all':
|
||||
lang = params['language'][:2]
|
||||
|
||||
time_range = ''
|
||||
if params.get('time_range'):
|
||||
time_range_val = time_range_map.get(params.get('time_range'))
|
||||
time_range = time_range_url.format(time_range_val=time_range_val)
|
||||
|
||||
safe_search = ''
|
||||
if params['safesearch']:
|
||||
safe_search = safe_search_map[params['safesearch']]
|
||||
|
||||
fp = { # pylint: disable=invalid-name
|
||||
'query': urlencode({'q': query})[2:],
|
||||
'lang': lang,
|
||||
'pageno': (params['pageno'] - 1) * page_size + first_page_num,
|
||||
'time_range': time_range,
|
||||
'safe_search': safe_search,
|
||||
}
|
||||
|
||||
params['cookies'].update(cookies)
|
||||
params['headers'].update(headers)
|
||||
|
||||
params['url'] = search_url.format(**fp)
|
||||
params['method'] = method
|
||||
|
||||
if request_body:
|
||||
# don't url-encode the query if it's in the request body
|
||||
fp['query'] = query
|
||||
params['data'] = request_body.format(**fp)
|
||||
|
||||
params['soft_max_redirects'] = soft_max_redirects
|
||||
params['raise_for_httperror'] = False
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def identity(arg):
|
||||
return arg
|
||||
|
||||
|
||||
def extract_response_info(result):
|
||||
title_filter = html_to_text if title_html_to_text else identity
|
||||
content_filter = html_to_text if content_html_to_text else identity
|
||||
|
||||
tmp_result = {}
|
||||
|
||||
try:
|
||||
url = query(result, url_query)[0]
|
||||
tmp_result['url'] = url_prefix + to_string(url)
|
||||
|
||||
title = query(result, title_query)[0]
|
||||
tmp_result['title'] = title_filter(to_string(title))
|
||||
except: # pylint: disable=bare-except
|
||||
return None
|
||||
|
||||
try:
|
||||
content = query(result, content_query)[0]
|
||||
tmp_result['content'] = content_filter(to_string(content))
|
||||
except: # pylint: disable=bare-except
|
||||
tmp_result['content'] = ""
|
||||
|
||||
try:
|
||||
if thumbnail_query:
|
||||
thumbnail_query_result = query(result, thumbnail_query)[0]
|
||||
tmp_result['thumbnail'] = thumbnail_prefix + to_string(thumbnail_query_result)
|
||||
except: # pylint: disable=bare-except
|
||||
pass
|
||||
|
||||
return tmp_result
|
||||
|
||||
|
||||
def response(resp):
|
||||
'''Scrap *results* from the response (see :ref:`result types`).'''
|
||||
results = []
|
||||
|
||||
if no_result_for_http_status and resp.status_code in no_result_for_http_status:
|
||||
return results
|
||||
|
||||
raise_for_httperror(resp)
|
||||
|
||||
if not resp.text:
|
||||
return results
|
||||
|
||||
json = loads(resp.text)
|
||||
is_onion = 'onions' in categories
|
||||
|
||||
if results_query:
|
||||
rs = query(json, results_query) # pylint: disable=invalid-name
|
||||
if not rs:
|
||||
return results
|
||||
rs = rs[0] # pylint: disable=invalid-name
|
||||
else:
|
||||
rs = json # pylint: disable=invalid-name
|
||||
|
||||
for result in rs:
|
||||
tmp_result = extract_response_info(result)
|
||||
if not tmp_result:
|
||||
continue
|
||||
|
||||
if is_onion:
|
||||
tmp_result['is_onion'] = True
|
||||
|
||||
results.append(tmp_result)
|
||||
|
||||
if not suggestion_query:
|
||||
return results
|
||||
for suggestion in query(json, suggestion_query):
|
||||
results.append({'suggestion': suggestion})
|
||||
return results
|
||||
61
searx/engines/kickass.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Kickass Torrent (Videos, Music, Files)"""
|
||||
|
||||
import random
|
||||
from operator import itemgetter
|
||||
from urllib.parse import quote
|
||||
|
||||
from lxml import html
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_getindex,
|
||||
eval_xpath_list,
|
||||
extract_text,
|
||||
int_or_zero,
|
||||
)
|
||||
|
||||
about = {
|
||||
"website": 'https://kickasstorrents.to',
|
||||
"wikidata_id": 'Q17062285',
|
||||
"official_api_documentation": None,
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
categories = ['files']
|
||||
paging = True
|
||||
|
||||
# base_url can be overwritten by a list of URLs in the settings.yml
|
||||
base_url = 'https://kickasstorrents.to'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['base_url'] = random.choice(base_url) if isinstance(base_url, list) else base_url
|
||||
params['url'] = params['base_url'] + f'/usearch/{quote(query)}/{params["pageno"]}/'
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
search_res = eval_xpath_list(dom, '//table[contains(@class, "data")]//tr[descendant::a]', None)
|
||||
if search_res is None:
|
||||
return []
|
||||
|
||||
for tag in search_res[1:]:
|
||||
result = {'template': 'torrent.html'}
|
||||
url = eval_xpath_getindex(tag, './/a[contains(@class, "cellMainLink")]/@href', 0, None)
|
||||
result['url'] = resp.search_params['base_url'] + url
|
||||
result['title'] = extract_text(eval_xpath(tag, './/a[contains(@class, "cellMainLink")]'))
|
||||
result['content'] = extract_text(eval_xpath(tag, './/span[@class="font11px lightgrey block"]'))
|
||||
result['seed'] = int_or_zero(extract_text(eval_xpath(tag, './/td[contains(@class, "green")]')))
|
||||
result['leech'] = int_or_zero(extract_text(eval_xpath(tag, './/td[contains(@class, "red")]')))
|
||||
result['filesize'] = extract_text(eval_xpath(tag, './/td[contains(@class, "nobr")]'))
|
||||
|
||||
results.append(result)
|
||||
|
||||
# results sorted by seeder count
|
||||
return sorted(results, key=itemgetter('seed'), reverse=True)
|
||||
196
searx/engines/lemmy.py
Normal file
@@ -0,0 +1,196 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This engine uses the Lemmy API (https://lemmy.ml/api/v3/search), which is
|
||||
documented at `lemmy-js-client`_ / `Interface Search`_. Since Lemmy is
|
||||
federated, results are from many different, independent lemmy instances, and not
|
||||
only the official one.
|
||||
|
||||
.. _lemmy-js-client: https://join-lemmy.org/api/modules.html
|
||||
.. _Interface Search: https://join-lemmy.org/api/interfaces/Search.html
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
The engine has the following additional settings:
|
||||
|
||||
- :py:obj:`base_url`
|
||||
- :py:obj:`lemmy_type`
|
||||
|
||||
This implementation is used by different lemmy engines in the :ref:`settings.yml
|
||||
<settings engines>`:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: lemmy communities
|
||||
lemmy_type: Communities
|
||||
...
|
||||
- name: lemmy users
|
||||
lemmy_type: Users
|
||||
...
|
||||
- name: lemmy posts
|
||||
lemmy_type: Posts
|
||||
...
|
||||
- name: lemmy comments
|
||||
lemmy_type: Comments
|
||||
...
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.utils import markdown_to_text
|
||||
|
||||
about = {
|
||||
"website": 'https://lemmy.ml/',
|
||||
"wikidata_id": 'Q84777032',
|
||||
"official_api_documentation": "https://join-lemmy.org/api/",
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
paging = True
|
||||
categories = ['social media']
|
||||
|
||||
base_url = "https://lemmy.ml/"
|
||||
"""By default, https://lemmy.ml is used for providing the results. If you want
|
||||
to use a different lemmy instance, you can specify ``base_url``.
|
||||
"""
|
||||
|
||||
lemmy_type = "Communities"
|
||||
"""Any of ``Communities``, ``Users``, ``Posts``, ``Comments``"""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
args = {
|
||||
'q': query,
|
||||
'page': params['pageno'],
|
||||
'type_': lemmy_type,
|
||||
}
|
||||
|
||||
params['url'] = f"{base_url}api/v3/search?{urlencode(args)}"
|
||||
return params
|
||||
|
||||
|
||||
def _get_communities(json):
|
||||
results = []
|
||||
|
||||
for result in json["communities"]:
|
||||
counts = result['counts']
|
||||
metadata = (
|
||||
f"{gettext('subscribers')}: {counts.get('subscribers', 0)}"
|
||||
f" | {gettext('posts')}: {counts.get('posts', 0)}"
|
||||
f" | {gettext('active users')}: {counts.get('users_active_half_year', 0)}"
|
||||
)
|
||||
results.append(
|
||||
{
|
||||
'url': result['community']['actor_id'],
|
||||
'title': result['community']['title'],
|
||||
'content': markdown_to_text(result['community'].get('description', '')),
|
||||
'thumbnail': result['community'].get('icon', result['community'].get('banner')),
|
||||
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def _get_users(json):
|
||||
results = []
|
||||
|
||||
for result in json["users"]:
|
||||
results.append(
|
||||
{
|
||||
'url': result['person']['actor_id'],
|
||||
'title': result['person']['name'],
|
||||
'content': markdown_to_text(result['person'].get('bio', '')),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _get_posts(json):
|
||||
results = []
|
||||
|
||||
for result in json["posts"]:
|
||||
user = result['creator'].get('display_name', result['creator']['name'])
|
||||
|
||||
thumbnail = None
|
||||
if result['post'].get('thumbnail_url'):
|
||||
thumbnail = result['post']['thumbnail_url'] + '?format=webp&thumbnail=208'
|
||||
|
||||
metadata = (
|
||||
f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
|
||||
f" | {gettext('user')}: {user}"
|
||||
f" | {gettext('comments')}: {result['counts']['comments']}"
|
||||
f" | {gettext('community')}: {result['community']['title']}"
|
||||
)
|
||||
|
||||
content = result['post'].get('body', '').strip()
|
||||
if content:
|
||||
content = markdown_to_text(content)
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': result['post']['ap_id'],
|
||||
'title': result['post']['name'],
|
||||
'content': content,
|
||||
'thumbnail': thumbnail,
|
||||
'publishedDate': datetime.strptime(result['post']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _get_comments(json):
|
||||
results = []
|
||||
|
||||
for result in json["comments"]:
|
||||
user = result['creator'].get('display_name', result['creator']['name'])
|
||||
|
||||
content = result['comment'].get('content', '').strip()
|
||||
if content:
|
||||
content = markdown_to_text(content)
|
||||
|
||||
metadata = (
|
||||
f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
|
||||
f" | {gettext('user')}: {user}"
|
||||
f" | {gettext('community')}: {result['community']['title']}"
|
||||
)
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': result['comment']['ap_id'],
|
||||
'title': result['post']['name'],
|
||||
'content': markdown_to_text(result['comment']['content']),
|
||||
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def response(resp):
|
||||
json = resp.json()
|
||||
|
||||
if lemmy_type == "Communities":
|
||||
return _get_communities(json)
|
||||
|
||||
if lemmy_type == "Users":
|
||||
return _get_users(json)
|
||||
|
||||
if lemmy_type == "Posts":
|
||||
return _get_posts(json)
|
||||
|
||||
if lemmy_type == "Comments":
|
||||
return _get_comments(json)
|
||||
|
||||
raise ValueError(f"Unsupported lemmy type: {lemmy_type}")
|
||||
55
searx/engines/lib_rs.py
Normal file
@@ -0,0 +1,55 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""lib.rs (packages)"""

from urllib.parse import quote_plus
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text

about = {
    'website': 'https://lib.rs',
    'wikidata_id': 'Q113486010',
    'use_official_api': False,
    'require_api_key': False,
    'results': "HTML",
}

categories = ["it", "packages"]

base_url = 'https://lib.rs'

results_xpath = '/html/body/main/div/ol/li/a'
url_xpath = './@href'
title_xpath = './div[@class="h"]/h4'
content_xpath = './div[@class="h"]/p'
version_xpath = './div[@class="meta"]/span[contains(@class, "version")]'
download_count_xpath = './div[@class="meta"]/span[@class="downloads"]'
tags_xpath = './div[@class="meta"]/span[contains(@class, "k")]/text()'


def request(query, params):
    params['url'] = f"{base_url}/search?q={quote_plus(query)}"

    return params


def response(resp):
    results = []

    doc = html.fromstring(resp.text)

    for result in eval_xpath_list(doc, results_xpath):
        package_name = extract_text(eval_xpath(result, title_xpath))
        results.append(
            {
                'template': 'packages.html',
                'title': package_name,
                'url': base_url + extract_text(eval_xpath(result, url_xpath)),  # type: ignore
                'content': extract_text(eval_xpath(result, content_xpath)),
                'package_name': package_name,
                'version': extract_text(eval_xpath(result, version_xpath)),
                'popularity': extract_text(eval_xpath(result, download_count_xpath)),
                'tags': eval_xpath_list(result, tags_xpath),
            }
        )

    return results
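A rough, self-contained sketch of how the XPath selectors above are meant to line up with the lib.rs result markup; the HTML snippet is a guess at the page structure for illustration, not a capture of the real site:

from lxml import html

# Hypothetical markup mirroring the structure the selectors assume;
# the real lib.rs search page may differ.
SAMPLE = """
<html><body><main><div><ol>
  <li><a href="/crates/serde">
    <div class="h"><h4>serde</h4><p>A serialization framework</p></div>
    <div class="meta">
      <span class="version stable">1.0.0</span>
      <span class="downloads">1M</span>
      <span class="k">serialization</span>
    </div>
  </a></li>
</ol></div></main></body></html>
"""

doc = html.fromstring(SAMPLE)
for result in doc.xpath('/html/body/main/div/ol/li/a'):                          # results_xpath
    print(result.xpath('./@href'))                                               # url_xpath
    print(result.xpath('./div[@class="h"]/h4/text()'))                           # title_xpath
    print(result.xpath('./div[@class="meta"]/span[contains(@class, "version")]/text()'))
    print(result.xpath('./div[@class="meta"]/span[contains(@class, "k")]/text()'))  # tags_xpath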
59
searx/engines/libretranslate.py
Normal file
59
searx/engines/libretranslate.py
Normal file
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""LibreTranslate (Free and Open Source Machine Translation API)"""

import random
import json

from searx.result_types import EngineResults

about = {
    "website": 'https://libretranslate.com',
    "wikidata_id": None,
    "official_api_documentation": 'https://libretranslate.com/docs/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

engine_type = 'online_dictionary'
categories = ['general', 'translate']

base_url = "https://libretranslate.com/translate"
api_key = ""


def request(_query, params):
    request_url = random.choice(base_url) if isinstance(base_url, list) else base_url

    if request_url.startswith("https://libretranslate.com") and not api_key:
        return None
params['url'] = f"{request_url}/translate"
|
||||

    args = {
        'q': params['query'],
        'source': params['from_lang'][1],
        'target': params['to_lang'][1],
        'alternatives': 3,
    }
    if api_key:
        args['api_key'] = api_key

    params['data'] = json.dumps(args)
    params['method'] = 'POST'
    params['headers'] = {'Content-Type': 'application/json'}
    params['req_url'] = request_url

    return params


def response(resp) -> EngineResults:
    results = EngineResults()

    json_resp = resp.json()
    text = json_resp.get('translatedText')
    if not text:
        return results

    item = results.types.Translations.Item(text=text, examples=json_resp.get('alternatives', []))
    results.add(results.types.Translations(translations=[item]))

    return results
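For reference, a minimal sketch of the request body built above and of the response shape the parser expects; the JSON values are made-up examples and may not match the live API exactly:

import json

# Request body as assembled by request() above (example values only).
args = {'q': 'hello world', 'source': 'en', 'target': 'de', 'alternatives': 3}
print(json.dumps(args))

# Assumed response shape: response() only reads 'translatedText' and the
# optional 'alternatives' list; everything else is ignored.
json_resp = {'translatedText': 'Hallo Welt', 'alternatives': ['Hallo, Welt']}
text = json_resp.get('translatedText')
if text:
    print(text, json_resp.get('alternatives', []))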
74
searx/engines/lingva.py
Normal file
74
searx/engines/lingva.py
Normal file
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Lingva (alternative Google Translate frontend)"""

from searx.result_types import EngineResults

about = {
    "website": 'https://lingva.ml',
    "wikidata_id": None,
    "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

engine_type = 'online_dictionary'
categories = ['general', 'translate']

url = "https://lingva.thedaviddelta.com"


def request(_query, params):
    params['url'] = f"{url}/api/v1/{params['from_lang'][1]}/{params['to_lang'][1]}/{params['query']}"
    return params


def response(resp) -> EngineResults:
    results = EngineResults()

    result = resp.json()
    info = result["info"]
    from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])

    if "typo" in info:
        results.append({"suggestion": from_to_prefix + info["typo"]})

    if 'definitions' in info:  # pylint: disable=too-many-nested-blocks
        for definition in info['definitions']:
            for item in definition.get('list', []):
                for synonym in item.get('synonyms', []):
                    results.append({"suggestion": from_to_prefix + synonym})

    data = []

    for definition in info.get('definitions', []):
        for translation in definition['list']:
            data.append(
                results.types.Translations.Item(
                    text=result['translation'],
                    definitions=[translation['definition']] if translation['definition'] else [],
                    examples=[translation['example']] if translation['example'] else [],
                    synonyms=translation['synonyms'],
                )
            )

    for translation in info["extraTranslations"]:
        for word in translation["list"]:
            data.append(
                results.types.Translations.Item(
                    text=word['word'],
                    definitions=word['meanings'],
                )
            )

    if not data and result['translation']:
        data.append(results.types.Translations.Item(text=result['translation']))

    params = resp.search_params
    results.add(
        results.types.Translations(
            translations=data,
            url=f"{url}/{params['from_lang'][1]}/{params['to_lang'][1]}/{params['query']}",
        )
    )
    return results
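A minimal sketch of the Lingva call and of how response() above walks the result; the JSON fragment is an assumed example and the real API returns more fields:

base = "https://lingva.thedaviddelta.com"
# URL shape built by request() above (language codes and query are made-up examples).
print(f"{base}/api/v1/en/es/hello")

# Assumed response fragment; the loops mirror how response() collects the
# extra translations into Translations.Item entries.
result = {
    "translation": "hola",
    "info": {
        "definitions": [],
        "extraTranslations": [
            {"type": "interjection", "list": [{"word": "hola", "meanings": ["hello", "hi"]}]}
        ],
    },
}

data = []
for translation in result["info"]["extraTranslations"]:
    for word in translation["list"]:
        data.append((word["word"], word["meanings"]))
print(data)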
Some files were not shown because too many files have changed in this diff.