first commit
searxng_extra/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
searxng_extra/docs_prebuild (new executable file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Script that implements some prebuild tasks needed by target docs.prebuild
"""

import sys
import os.path
import time
from contextlib import contextmanager

from searx import settings, get_setting, locales
from searx.infopage import InfoPageSet, InfoPage

_doc_user = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'docs', 'user'))


def main():
    locales.locales_initialize()
    base_url = get_setting('server.base_url', None)
    if base_url:
        infopageset_ctx = _instance_infosetset_ctx(base_url)
    else:
        infopageset_ctx = _offline_infosetset_ctx()

    with infopageset_ctx as infopageset:
        for _, _, page in infopageset.iter_pages('en'):
            fname = os.path.join(_doc_user, os.path.basename(page.fname))
            with open(fname, 'w', encoding='utf-8') as f:
                f.write(page.content)


class OfflinePage(InfoPage):  # pylint: disable=missing-class-docstring

    def get_ctx(self):
        """Jinja context to render :py:obj:`DocPage.content` for offline purpose (no
        links to SearXNG instance)"""

        ctx = super().get_ctx()
        ctx['link'] = lambda name, url: '`%s`' % name
        ctx['search'] = lambda query: '`%s`' % query

        return ctx


@contextmanager
def _offline_infosetset_ctx():
    yield InfoPageSet(OfflinePage)


@contextmanager
def _instance_infosetset_ctx(base_url):
    # The url_for functions in the jinja templates need all routes to be
    # registered in the Flask app.

    settings['server']['secret_key'] = ''
    from searx.webapp import app  # pylint: disable=import-outside-toplevel

    # Specify base_url so that url_for() works for base_urls.  If base_url is
    # specified, these values are given preference over Flask's generic values
    # (see flaskfix.py).

    with app.test_request_context(base_url=base_url):
        yield InfoPageSet()

    # The searx.webapp import from above fires some HTTP requests, that's
    # why we get a RuntimeError::
    #
    #     RuntimeError: The connection pool was closed while 1 HTTP \
    #     requests/responses were still in-flight.
    #
    # Closing the network won't help ..
    #   from searx.network import network
    #   network.done()

    # Waiting some seconds before ending the command line was the only solution
    # found so far ..

    time.sleep(3)


if __name__ == '__main__':
    sys.exit(main())
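A small usage illustration of the offline Jinja helpers defined above (the page
name and URL are made up for the example; the lambdas are copied from
``OfflinePage.get_ctx``):

.. code:: python

    link = lambda name, url: '`%s`' % name
    search = lambda query: '`%s`' % query

    print(link('SearXNG', 'https://example.org'))  # prints: `SearXNG`
    print(search('!ddg privacy'))                  # prints: `!ddg privacy`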
searxng_extra/standalone_searx.py (new executable file, 177 lines)
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Script to run SearXNG from terminal.
|
||||
|
||||
DON'T USE THIS SCRIPT!!
|
||||
|
||||
.. danger::
|
||||
|
||||
Be warned, using the ``standalone_searx.py`` won't give you privacy!
|
||||
|
||||
On the contrary, this script behaves like a SearXNG server: your IP is
|
||||
exposed and tracked by all active engines (google, bing, qwant, ... ), with
|
||||
every query!
|
||||
|
||||
.. note::
|
||||
|
||||
This is an old and grumpy hack / SearXNG is a Flask application with
|
||||
client/server structure, which can't be turned into a command line tool the
|
||||
way it was done here.
|
||||
|
||||
Getting categories without initializing the engines will only return `['general']`
|
||||
|
||||
>>> import searx.engines
|
||||
... list(searx.engines.categories.keys())
|
||||
['general']
|
||||
>>> import searx.search
|
||||
... searx.search.initialize()
|
||||
... list(searx.engines.categories.keys())
|
||||
['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map']
|
||||
|
||||
Example to use this script:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ python3 searxng_extra/standalone_searx.py rain
|
||||
|
||||
""" # pylint: disable=line-too-long
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from json import dumps
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import searx
|
||||
import searx.preferences
|
||||
import searx.query
|
||||
import searx.search
|
||||
import searx.webadapter
|
||||
|
||||
EngineCategoriesVar = Optional[List[str]]
|
||||
|
||||
|
||||
def get_search_query(
|
||||
args: argparse.Namespace, engine_categories: EngineCategoriesVar = None
|
||||
) -> searx.search.SearchQuery:
|
||||
"""Get search results for the query"""
|
||||
if engine_categories is None:
|
||||
engine_categories = list(searx.engines.categories.keys())
|
||||
try:
|
||||
category = args.category.decode('utf-8')
|
||||
except AttributeError:
|
||||
category = args.category
|
||||
form = {
|
||||
"q": args.query,
|
||||
"categories": category,
|
||||
"pageno": str(args.pageno),
|
||||
"language": args.lang,
|
||||
"time_range": args.timerange,
|
||||
}
|
||||
preferences = searx.preferences.Preferences(['simple'], engine_categories, searx.engines.engines, [])
|
||||
preferences.key_value_settings['safesearch'].parse(args.safesearch)
|
||||
|
||||
search_query = searx.webadapter.get_search_query_from_webapp(preferences, form)[0]
|
||||
return search_query
|
||||
|
||||
|
||||
def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Remove parsed url from dict."""
|
||||
for result in results:
|
||||
del result['parsed_url']
|
||||
return results
|
||||
|
||||
|
||||
def json_serial(obj: Any) -> Any:
|
||||
"""JSON serializer for objects not serializable by default json code.
|
||||
|
||||
:raise TypeError: raised when **obj** is not serializable
|
||||
"""
|
||||
if isinstance(obj, datetime):
|
||||
serial = obj.isoformat()
|
||||
return serial
|
||||
if isinstance(obj, bytes):
|
||||
return obj.decode('utf8')
|
||||
if isinstance(obj, set):
|
||||
return list(obj)
|
||||
raise TypeError("Type ({}) not serializable".format(type(obj)))
|
||||
|
||||
|
||||
def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
|
||||
"""Get result from parsed arguments."""
|
||||
result_container = searx.search.Search(search_query).search()
|
||||
result_container_json = {
|
||||
"search": {
|
||||
"q": search_query.query,
|
||||
"pageno": search_query.pageno,
|
||||
"lang": search_query.lang,
|
||||
"safesearch": search_query.safesearch,
|
||||
"timerange": search_query.time_range,
|
||||
},
|
||||
"results": no_parsed_url(result_container.get_ordered_results()),
|
||||
"infoboxes": result_container.infoboxes,
|
||||
"suggestions": list(result_container.suggestions),
|
||||
"answers": list(result_container.answers),
|
||||
"paging": result_container.paging,
|
||||
"number_of_results": result_container.number_of_results,
|
||||
}
|
||||
return result_container_json
|
||||
|
||||
|
||||
def parse_argument(
|
||||
args: Optional[List[str]] = None, category_choices: EngineCategoriesVar = None
|
||||
) -> argparse.Namespace:
|
||||
"""Parse command line.
|
||||
|
||||
:raise SystemExit: Query argument required on `args`
|
||||
|
||||
Examples:
|
||||
|
||||
>>> import importlib
|
||||
... # load module
|
||||
... spec = importlib.util.spec_from_file_location(
|
||||
... 'utils.standalone_searx', 'utils/standalone_searx.py')
|
||||
... sas = importlib.util.module_from_spec(spec)
|
||||
... spec.loader.exec_module(sas)
|
||||
... sas.parse_argument()
|
||||
usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]]
|
||||
query
|
||||
SystemExit: 2
|
||||
>>> sas.parse_argument(['rain'])
|
||||
Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None)
|
||||
""" # noqa: E501
|
||||
if not category_choices:
|
||||
category_choices = list(searx.engines.categories.keys())
|
||||
parser = argparse.ArgumentParser(description='Standalone searx.')
|
||||
parser.add_argument('query', type=str, help='Text query')
|
||||
parser.add_argument(
|
||||
'--category', type=str, nargs='?', choices=category_choices, default='general', help='Search category'
|
||||
)
|
||||
parser.add_argument('--lang', type=str, nargs='?', default='all', help='Search language')
|
||||
parser.add_argument('--pageno', type=int, nargs='?', default=1, help='Page number starting from 1')
|
||||
parser.add_argument(
|
||||
'--safesearch',
|
||||
type=str,
|
||||
nargs='?',
|
||||
choices=['0', '1', '2'],
|
||||
default='0',
|
||||
help='Safe content filter from none to strict',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'], help='Filter by time range'
|
||||
)
|
||||
return parser.parse_args(args)
|
||||
|
||||
|
||||
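# Stand-alone initialization: load the engines, initialize the outgoing
# network, check its configuration, set up the metrics counters and the
# engine processors; only then can the search query be built and executed.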
if __name__ == '__main__':
|
||||
settings_engines = searx.settings['engines']
|
||||
searx.search.load_engines(settings_engines)
|
||||
engine_cs = list(searx.engines.categories.keys())
|
||||
prog_args = parse_argument(category_choices=engine_cs)
|
||||
searx.search.initialize_network(settings_engines, searx.settings['outgoing'])
|
||||
searx.search.check_network_configuration()
|
||||
searx.search.initialize_metrics([engine['name'] for engine in settings_engines])
|
||||
searx.search.initialize_processors(settings_engines)
|
||||
search_q = get_search_query(prog_args, engine_categories=engine_cs)
|
||||
res_dict = to_dict(search_q)
|
||||
sys.stdout.write(dumps(res_dict, sort_keys=True, indent=4, ensure_ascii=False, default=json_serial))
|
||||
searxng_extra/update/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
searxng_extra/update/update_ahmia_blacklist.py (new executable file, 32 lines)
@@ -0,0 +1,32 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script saves `Ahmia's blacklist`_ for onion sites.

Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).

.. _Ahmia's blacklist: https://ahmia.fi/blacklist/

"""
# pylint: disable=use-dict-literal

import requests
from searx.data import data_dir

DATA_FILE = data_dir / 'ahmia_blacklist.txt'
URL = 'https://ahmia.fi/blacklist/banned/'


def fetch_ahmia_blacklist():
    resp = requests.get(URL, timeout=3.0)
    if resp.status_code != 200:
        # pylint: disable=broad-exception-raised
        raise Exception("Error fetching Ahmia blacklist, HTTP code " + str(resp.status_code))
    return resp.text.split()


if __name__ == '__main__':
    blacklist = fetch_ahmia_blacklist()
    blacklist.sort()
    with DATA_FILE.open("w", encoding='utf-8') as f:
        f.write('\n'.join(blacklist))
searxng_extra/update/update_currencies.py (new executable file, 155 lines)
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine.
|
||||
|
||||
Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
|
||||
<.github/workflows/data-update.yml>`).
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import json
|
||||
|
||||
from searx.locales import LOCALE_NAMES, locales_initialize
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.data.currencies import CurrenciesDB
|
||||
|
||||
set_loggers(wikidata, 'wikidata')
|
||||
locales_initialize()
|
||||
|
||||
# ORDER BY (with all the query fields) is important to keep a deterministic result order
|
||||
# so multiple invocations of this script don't change currencies.json
|
||||
SARQL_REQUEST = """
|
||||
SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE {
|
||||
?item wdt:P498 ?iso4217; rdfs:label ?label.
|
||||
OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). }
|
||||
OPTIONAL { ?item wdt:P5061 ?unit. }
|
||||
OPTIONAL { ?item wdt:P489 ?symbol.
|
||||
?symbol wdt:P487 ?unicode. }
|
||||
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
|
||||
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
|
||||
FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)).
|
||||
}
|
||||
ORDER BY ?iso4217 ?unit ?unicode ?label ?alias
|
||||
"""
|
||||
|
||||
# ORDER BY (with all the query fields) is important to keep a deterministic result order
|
||||
# so multiple invocations of this script don't change currencies.json
|
||||
SPARQL_WIKIPEDIA_NAMES_REQUEST = """
|
||||
SELECT DISTINCT ?iso4217 ?article_name WHERE {
|
||||
?item wdt:P498 ?iso4217 .
|
||||
?article schema:about ?item ;
|
||||
schema:name ?article_name ;
|
||||
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ]
|
||||
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
|
||||
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
|
||||
FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)).
|
||||
}
|
||||
ORDER BY ?iso4217 ?article_name
|
||||
"""
|
||||
|
||||
|
||||
LANGUAGES = LOCALE_NAMES.keys()
|
||||
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
|
||||
|
||||
|
||||
def remove_accents(name):
|
||||
return unicodedata.normalize('NFKD', name).lower()
|
||||
|
||||
|
||||
def remove_extra(name):
|
||||
for c in ('(', ':'):
|
||||
if c in name:
|
||||
name = name.split(c)[0].strip()
|
||||
return name
|
||||
|
||||
|
||||
def _normalize_name(name):
|
||||
name = re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' '))
|
||||
name = remove_extra(name)
|
||||
return name
|
||||
|
||||
|
||||
def add_currency_name(db, name, iso4217, normalize_name=True):
|
||||
db_names = db['names']
|
||||
|
||||
if normalize_name:
|
||||
name = _normalize_name(name)
|
||||
|
||||
iso4217_set = db_names.setdefault(name, [])
|
||||
if iso4217 not in iso4217_set:
|
||||
iso4217_set.insert(0, iso4217)
|
||||
|
||||
|
||||
def add_currency_label(db, label, iso4217, language):
|
||||
labels = db['iso4217'].setdefault(iso4217, {})
|
||||
labels[language] = label
|
||||
|
||||
|
||||
def wikidata_request_result_iterator(request):
|
||||
result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL), timeout=20)
|
||||
if result is not None:
|
||||
yield from result['results']['bindings']
|
||||
|
||||
|
||||
def fetch_db():
|
||||
db = {
|
||||
'names': {},
|
||||
'iso4217': {},
|
||||
}
|
||||
|
||||
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
|
||||
iso4217 = r['iso4217']['value']
|
||||
article_name = r['article_name']['value']
|
||||
article_lang = r['article_name']['xml:lang']
|
||||
add_currency_name(db, article_name, iso4217)
|
||||
add_currency_label(db, article_name, iso4217, article_lang)
|
||||
|
||||
for r in wikidata_request_result_iterator(SARQL_REQUEST):
|
||||
iso4217 = r['iso4217']['value']
|
||||
if 'label' in r:
|
||||
label = r['label']['value']
|
||||
label_lang = r['label']['xml:lang']
|
||||
add_currency_name(db, label, iso4217)
|
||||
add_currency_label(db, label, iso4217, label_lang)
|
||||
|
||||
if 'alias' in r:
|
||||
add_currency_name(db, r['alias']['value'], iso4217)
|
||||
|
||||
if 'unicode' in r:
|
||||
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
|
||||
|
||||
if 'unit' in r:
|
||||
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
|
||||
|
||||
return db
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
db = fetch_db()
|
||||
|
||||
# static
|
||||
add_currency_name(db, "euro", 'EUR')
|
||||
add_currency_name(db, "euros", 'EUR')
|
||||
add_currency_name(db, "dollar", 'USD')
|
||||
add_currency_name(db, "dollars", 'USD')
|
||||
add_currency_name(db, "peso", 'MXN')
|
||||
add_currency_name(db, "pesos", 'MXN')
|
||||
|
||||
# reduce memory usage:
|
||||
# replace lists with one item by the item. see
|
||||
# searx.search.processors.online_currency.name_to_iso4217
|
||||
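# For example, an entry like  "euro": ["EUR"]  is stored as  "euro": "EUR";
# names that map to several ISO 4217 codes keep their list.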
for name in db['names']:
|
||||
if len(db['names'][name]) == 1:
|
||||
db['names'][name] = db['names'][name][0]
|
||||
|
||||
with CurrenciesDB.json_file.open('w', encoding='utf8') as f:
|
||||
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
searxng_extra/update/update_engine_descriptions.py (new executable file, 371 lines)
@@ -0,0 +1,371 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fetch website description from websites and from
|
||||
:origin:`searx/engines/wikidata.py` engine.
|
||||
|
||||
Output file: :origin:`searx/data/engine_descriptions.json`.
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name, global-statement
|
||||
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
from os.path import join
|
||||
|
||||
from lxml.html import fromstring
|
||||
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.utils import extract_text, searx_useragent
|
||||
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
|
||||
from searx import searx_dir
|
||||
from searx.utils import gen_useragent, detect_language
|
||||
import searx.search
|
||||
import searx.network
|
||||
from searx.data import data_dir
|
||||
|
||||
DATA_FILE = data_dir / 'engine_descriptions.json'
|
||||
|
||||
set_loggers(wikidata, 'wikidata')
|
||||
locales_initialize()
|
||||
|
||||
# you can run the query in https://query.wikidata.org
|
||||
# replace %IDS% by Wikidata entities separated by spaces with the prefix wd:
|
||||
# for example wd:Q182496 wd:Q1540899
|
||||
# replace %LANGUAGES_SPARQL% by languages
|
||||
SPARQL_WIKIPEDIA_ARTICLE = """
|
||||
SELECT DISTINCT ?item ?name ?article ?lang
|
||||
WHERE {
|
||||
hint:Query hint:optimizer "None".
|
||||
VALUES ?item { %IDS% }
|
||||
?article schema:about ?item ;
|
||||
schema:inLanguage ?lang ;
|
||||
schema:name ?name ;
|
||||
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
|
||||
FILTER(?lang in (%LANGUAGES_SPARQL%)) .
|
||||
FILTER (!CONTAINS(?name, ':')) .
|
||||
}
|
||||
ORDER BY ?item ?lang
|
||||
"""
|
||||
|
||||
SPARQL_DESCRIPTION = """
|
||||
SELECT DISTINCT ?item ?itemDescription
|
||||
WHERE {
|
||||
VALUES ?item { %IDS% }
|
||||
?item schema:description ?itemDescription .
|
||||
FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
|
||||
}
|
||||
ORDER BY ?itemLang
|
||||
"""
|
||||
|
||||
NOT_A_DESCRIPTION = [
|
||||
'web site',
|
||||
'site web',
|
||||
'komputa serĉilo',
|
||||
'interreta serĉilo',
|
||||
'bilaketa motor',
|
||||
'web search engine',
|
||||
'wikimedia täpsustuslehekülg',
|
||||
]
|
||||
|
||||
SKIP_ENGINE_SOURCE = [
|
||||
# fmt: off
|
||||
('gitlab', 'wikidata')
|
||||
# descriptions are about wikipedia disambiguation pages
|
||||
# fmt: on
|
||||
]
|
||||
|
||||
WIKIPEDIA_LANGUAGES = {}
|
||||
LANGUAGES_SPARQL = ''
|
||||
IDS = None
|
||||
WIKIPEDIA_LANGUAGE_VARIANTS = {'zh_Hant': 'zh-tw'}
|
||||
|
||||
|
||||
descriptions = {}
|
||||
wd_to_engine_name = {}
|
||||
|
||||
|
||||
def normalize_description(description):
|
||||
for c in [chr(c) for c in range(0, 31)]:
|
||||
description = description.replace(c, ' ')
|
||||
description = ' '.join(description.strip().split())
|
||||
return description
|
||||
|
||||
|
||||
def update_description(engine_name, lang, description, source, replace=True):
|
||||
if not isinstance(description, str):
|
||||
return
|
||||
description = normalize_description(description)
|
||||
if description.lower() == engine_name.lower():
|
||||
return
|
||||
if description.lower() in NOT_A_DESCRIPTION:
|
||||
return
|
||||
if (engine_name, source) in SKIP_ENGINE_SOURCE:
|
||||
return
|
||||
if ' ' not in description:
|
||||
# skip unique word description (like "website")
|
||||
return
|
||||
if replace or lang not in descriptions[engine_name]:
|
||||
descriptions[engine_name][lang] = [description, source]
|
||||
|
||||
|
||||
def get_wikipedia_summary(wikipedia_url, searxng_locale):
|
||||
# get the REST API URL from the HTML URL
|
||||
|
||||
# Headers
|
||||
headers = {'User-Agent': searx_useragent()}
|
||||
|
||||
if searxng_locale in WIKIPEDIA_LANGUAGE_VARIANTS:
|
||||
headers['Accept-Language'] = WIKIPEDIA_LANGUAGE_VARIANTS.get(searxng_locale)
|
||||
|
||||
# URL path : from HTML URL to REST API URL
|
||||
parsed_url = urlparse(wikipedia_url)
|
||||
# remove the /wiki/ prefix
|
||||
article_name = parsed_url.path.split('/wiki/')[1]
|
||||
# article_name is already encoded but not the / which is required for the REST API call
|
||||
encoded_article_name = article_name.replace('/', '%2F')
|
||||
path = '/api/rest_v1/page/summary/' + encoded_article_name
|
||||
wikipedia_rest_url = parsed_url._replace(path=path).geturl()
|
||||
try:
|
||||
response = searx.network.get(wikipedia_rest_url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
print(" ", wikipedia_url, e)
|
||||
return None
|
||||
api_result = json.loads(response.text)
|
||||
return api_result.get('extract')
|
||||
|
||||
|
||||
def get_website_description(url, lang1, lang2=None):
|
||||
headers = {
|
||||
'User-Agent': gen_useragent(),
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'DNT': '1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Sec-GPC': '1',
|
||||
'Cache-Control': 'max-age=0',
|
||||
}
|
||||
if lang1 is not None:
|
||||
lang_list = [lang1]
|
||||
if lang2 is not None:
|
||||
lang_list.append(lang2)
|
||||
headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
|
||||
try:
|
||||
response = searx.network.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
except Exception: # pylint: disable=broad-except
|
||||
return (None, None)
|
||||
|
||||
try:
|
||||
html = fromstring(response.text)
|
||||
except ValueError:
|
||||
html = fromstring(response.content)
|
||||
|
||||
description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
|
||||
if not description:
|
||||
description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
|
||||
if not description:
|
||||
description = extract_text(html.xpath('/html/head/title'))
|
||||
lang = extract_text(html.xpath('/html/@lang'))
|
||||
if lang is None and len(lang1) > 0:
|
||||
lang = lang1
|
||||
lang = detect_language(description) or lang or 'en'
|
||||
lang = lang.split('_')[0]
|
||||
lang = lang.split('-')[0]
|
||||
return (lang, description)
|
||||
|
||||
|
||||
def initialize():
|
||||
global IDS, LANGUAGES_SPARQL
|
||||
searx.search.initialize()
|
||||
wikipedia_engine = searx.engines.engines['wikipedia']
|
||||
|
||||
locale2lang = {'nl-BE': 'nl'}
|
||||
for sxng_ui_lang in LOCALE_NAMES:
|
||||
|
||||
sxng_ui_alias = locale2lang.get(sxng_ui_lang, sxng_ui_lang)
|
||||
wiki_lang = None
|
||||
|
||||
if sxng_ui_alias in wikipedia_engine.traits.custom['WIKIPEDIA_LANGUAGES']:
|
||||
wiki_lang = sxng_ui_alias
|
||||
if not wiki_lang:
|
||||
wiki_lang = wikipedia_engine.traits.get_language(sxng_ui_alias)
|
||||
if not wiki_lang:
|
||||
print(f"WIKIPEDIA_LANGUAGES missing {sxng_ui_lang}")
|
||||
continue
|
||||
WIKIPEDIA_LANGUAGES[sxng_ui_lang] = wiki_lang
|
||||
|
||||
LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
|
||||
for engine_name, engine in searx.engines.engines.items():
|
||||
descriptions[engine_name] = {}
|
||||
wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
|
||||
if wikidata_id is not None:
|
||||
wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)
|
||||
|
||||
IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys())))
|
||||
|
||||
|
||||
def fetch_wikidata_descriptions():
|
||||
print('Fetching wikidata descriptions')
|
||||
searx.network.set_timeout_for_thread(60)
|
||||
result = wikidata.send_wikidata_query(
|
||||
SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
|
||||
)
|
||||
if result is not None:
|
||||
for binding in result['results']['bindings']:
|
||||
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
|
||||
wikidata_lang = binding['itemDescription']['xml:lang']
|
||||
desc = binding['itemDescription']['value']
|
||||
for engine_name in wd_to_engine_name[wikidata_id]:
|
||||
for searxng_locale in LOCALE_NAMES:
|
||||
if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
|
||||
continue
|
||||
print(
|
||||
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
|
||||
f"/ len(wikidata_desc): {len(desc)}",
|
||||
)
|
||||
update_description(engine_name, searxng_locale, desc, 'wikidata')
|
||||
|
||||
|
||||
def fetch_wikipedia_descriptions():
|
||||
print('Fetching wikipedia descriptions')
|
||||
result = wikidata.send_wikidata_query(
|
||||
SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
|
||||
)
|
||||
if result is not None:
|
||||
for binding in result['results']['bindings']:
|
||||
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
|
||||
wikidata_lang = binding['name']['xml:lang']
|
||||
wikipedia_url = binding['article']['value'] # for example the URL https://de.wikipedia.org/wiki/PubMed
|
||||
for engine_name in wd_to_engine_name[wikidata_id]:
|
||||
for searxng_locale in LOCALE_NAMES:
|
||||
if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
|
||||
continue
|
||||
desc = get_wikipedia_summary(wikipedia_url, searxng_locale)
|
||||
if not desc:
|
||||
continue
|
||||
print(
|
||||
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
|
||||
f"/ len(wikipedia_desc): {len(desc)}",
|
||||
)
|
||||
update_description(engine_name, searxng_locale, desc, 'wikipedia')
|
||||
|
||||
|
||||
def normalize_url(url):
|
||||
url = url.replace('{language}', 'en')
|
||||
url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
|
||||
url = url.replace('https://api.', 'https://')
|
||||
return url
|
||||
|
||||
|
||||
def fetch_website_description(engine_name, website):
|
||||
print(f"- fetch website descr: {engine_name} / {website}")
|
||||
default_lang, default_description = get_website_description(website, None, None)
|
||||
|
||||
if default_lang is None or default_description is None:
|
||||
# the front page can't be fetched: skip this engine
|
||||
return
|
||||
|
||||
# specify an order in which the most common languages are at the front of the
# language list
|
||||
languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
|
||||
languages = languages + [l for l in LOCALE_NAMES if l not in languages]
|
||||
|
||||
previous_matched_lang = None
|
||||
previous_count = 0
|
||||
|
||||
for lang in languages:
|
||||
|
||||
if lang in descriptions[engine_name]:
|
||||
continue
|
||||
|
||||
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
|
||||
if fetched_lang is None or desc is None:
|
||||
continue
|
||||
|
||||
# check if desc changed with the different lang values
|
||||
|
||||
if fetched_lang == previous_matched_lang:
|
||||
previous_count += 1
|
||||
if previous_count == 6:
|
||||
# the website has returned the same description for 6 different languages in Accept-Language header
|
||||
# stop now
|
||||
break
|
||||
else:
|
||||
previous_matched_lang = fetched_lang
|
||||
previous_count = 0
|
||||
|
||||
# Don't trust the value of fetched_lang; some websites return
# inappropriate values, for example bing-images::
#
#   requested lang: zh-Hans-CN / fetched lang: ceb / desc: 查看根据您的兴趣量身定制的提要
#
# The lang "ceb" is Cebuano, but the description is given in zh-Hans-CN
|
||||
|
||||
print(
|
||||
f" engine: {engine_name:20} / requested lang:{lang:7}"
|
||||
f" / fetched lang: {fetched_lang:7} / len(desc): {len(desc)}"
|
||||
)
|
||||
|
||||
matched_lang = match_locale(fetched_lang, LOCALE_NAMES.keys(), fallback=lang)
|
||||
update_description(engine_name, matched_lang, desc, website, replace=False)
|
||||
|
||||
|
||||
def fetch_website_descriptions():
|
||||
print('Fetching website descriptions')
|
||||
for engine_name, engine in searx.engines.engines.items():
|
||||
website = getattr(engine, "about", {}).get('website')
|
||||
if website is None and hasattr(engine, "search_url"):
|
||||
website = normalize_url(getattr(engine, "search_url"))
|
||||
if website is None and hasattr(engine, "base_url"):
|
||||
website = normalize_url(getattr(engine, "base_url"))
|
||||
if website is not None:
|
||||
fetch_website_description(engine_name, website)
|
||||
|
||||
|
||||
def get_engine_descriptions_filename():
|
||||
return join(join(searx_dir, "data"), "engine_descriptions.json")
|
||||
|
||||
|
||||
def get_output():
|
||||
"""
|
||||
From descriptions[engine][language] = [description, source]
|
||||
To
|
||||
|
||||
* output[language][engine] = description_and_source
|
||||
* description_and_source can be:
|
||||
* [description, source]
|
||||
* description (if source = "wikipedia")
|
||||
* [f"engine:lang", "ref"] (reference to another existing description)
|
||||
"""
|
||||
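# Illustrative shape of the output (engine names and texts are made up):
#
#   {
#     "en": {
#       "example": "a short engine description",          # source was "wikipedia"
#       "sample":  ["description text", "https://example.org"],
#       "mirror":  ["example:en", "ref"]                   # reuses "example"'s text
#     }
#   }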
output = {locale: {} for locale in LOCALE_NAMES}
|
||||
|
||||
seen_descriptions = {}
|
||||
|
||||
for engine_name, lang_descriptions in descriptions.items():
|
||||
for language, description in lang_descriptions.items():
|
||||
if description[0] in seen_descriptions:
|
||||
ref = seen_descriptions[description[0]]
|
||||
description = [f'{ref[0]}:{ref[1]}', 'ref']
|
||||
else:
|
||||
seen_descriptions[description[0]] = (engine_name, language)
|
||||
if description[1] == 'wikipedia':
|
||||
description = description[0]
|
||||
output.setdefault(language, {}).setdefault(engine_name, description)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def main():
|
||||
initialize()
|
||||
fetch_wikidata_descriptions()
|
||||
fetch_wikipedia_descriptions()
|
||||
fetch_website_descriptions()
|
||||
|
||||
output = get_output()
|
||||
with DATA_FILE.open('w', encoding='utf8') as f:
|
||||
f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
searxng_extra/update/update_engine_traits.py (new executable file, 199 lines)
@@ -0,0 +1,199 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py`
|
||||
|
||||
:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
|
||||
Persistence of engines traits, fetched from the engines.
|
||||
|
||||
:origin:`searx/languages.py`
|
||||
Is generated from intersecting each engine's supported traits.
|
||||
|
||||
The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
|
||||
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from unicodedata import lookup
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
import babel
|
||||
|
||||
from searx import settings, searx_dir
|
||||
from searx import network
|
||||
from searx.engines import load_engines
|
||||
from searx.enginelib.traits import EngineTraitsMap
|
||||
|
||||
# Output files.
|
||||
languages_file = Path(searx_dir) / 'sxng_locales.py'
|
||||
languages_file_header = """\
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
'''List of SearXNG's locale codes used for the search language/region.
|
||||
|
||||
.. hint::
|
||||
|
||||
Don't modify this file, this file is generated by::
|
||||
|
||||
./manage data.traits
|
||||
'''
|
||||
|
||||
sxng_locales = (
|
||||
"""
|
||||
languages_file_footer = """,
|
||||
)
|
||||
'''
|
||||
A list of five-element tuples:
|
||||
|
||||
0. SearXNG's internal locale tag (a language or region tag)
|
||||
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
|
||||
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
|
||||
Empty string for language tags.
|
||||
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
|
||||
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
|
||||
are represented by a globe (\U0001F310)
|
||||
|
||||
.. code:: python
|
||||
|
||||
('en', 'English', '', 'English', '\U0001f310'),
|
||||
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
|
||||
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
|
||||
..
|
||||
('fr', 'Français', '', 'French', '\U0001f310'),
|
||||
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
||||
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
||||
|
||||
:meta hide-value:
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
lang2emoji = {
|
||||
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
|
||||
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
|
||||
'jp': '\U0001F1EF\U0001F1F5', # Japanese
|
||||
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
|
||||
'he': '\U0001F1EE\U0001F1F1', # Hebrew
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
load_engines(settings['engines'])
|
||||
# traits_map = EngineTraitsMap.from_data()
|
||||
traits_map = fetch_traits_map()
|
||||
sxng_tag_list = filter_locales(traits_map)
|
||||
write_languages_file(sxng_tag_list)
|
||||
|
||||
|
||||
def fetch_traits_map():
|
||||
"""Fetches supported languages for each engine and writes json file with those."""
|
||||
network.set_timeout_for_thread(10.0)
|
||||
|
||||
def log(msg):
|
||||
print(msg)
|
||||
|
||||
traits_map = EngineTraitsMap.fetch_traits(log=log)
|
||||
print("fetched properties from %s engines" % len(traits_map))
|
||||
print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE)
|
||||
traits_map.save_data()
|
||||
return traits_map
|
||||
|
||||
|
||||
def filter_locales(traits_map: EngineTraitsMap):
|
||||
"""Filter language & region tags by a threshold."""
|
||||
|
||||
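# A region tag is kept when at least min_eng_per_region engines support it, a
# language tag when at least min_eng_per_lang engines do; the languages of all
# kept regions are added as well.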
min_eng_per_region = 18
|
||||
min_eng_per_lang = 22
|
||||
|
||||
_ = {}
|
||||
for eng in traits_map.values():
|
||||
for reg in eng.regions.keys():
|
||||
_[reg] = _.get(reg, 0) + 1
|
||||
|
||||
regions = set(k for k, v in _.items() if v >= min_eng_per_region)
|
||||
lang_from_region = set(k.split('-')[0] for k in regions)
|
||||
|
||||
_ = {}
|
||||
for eng in traits_map.values():
|
||||
for lang in eng.languages.keys():
|
||||
# ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
|
||||
# already counted by existence of 'zh' or 'sr', 'pa')
|
||||
if '_' in lang:
|
||||
# print("ignore %s" % lang)
|
||||
continue
|
||||
_[lang] = _.get(lang, 0) + 1
|
||||
|
||||
languages = set(k for k, v in _.items() if v >= min_eng_per_lang)
|
||||
|
||||
sxng_tag_list = set()
|
||||
sxng_tag_list.update(regions)
|
||||
sxng_tag_list.update(lang_from_region)
|
||||
sxng_tag_list.update(languages)
|
||||
|
||||
return sxng_tag_list
|
||||
|
||||
|
||||
def write_languages_file(sxng_tag_list):
|
||||
|
||||
language_codes = []
|
||||
|
||||
for sxng_tag in sorted(sxng_tag_list):
|
||||
sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
|
||||
|
||||
flag = get_unicode_flag(sxng_locale) or ''
|
||||
|
||||
item = (
|
||||
sxng_tag,
|
||||
sxng_locale.get_language_name().title(), # type: ignore
|
||||
sxng_locale.get_territory_name() or '',
|
||||
sxng_locale.english_name.split(' (')[0] if sxng_locale.english_name else '',
|
||||
UnicodeEscape(flag),
|
||||
)
|
||||
|
||||
language_codes.append(item)
|
||||
|
||||
language_codes = tuple(language_codes)
|
||||
|
||||
with languages_file.open('w', encoding='utf-8') as new_file:
|
||||
file_content = "{header} {language_codes}{footer}".format(
|
||||
header=languages_file_header,
|
||||
language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
|
||||
footer=languages_file_footer,
|
||||
)
|
||||
new_file.write(file_content)
|
||||
new_file.close()
|
||||
|
||||
|
||||
class UnicodeEscape(str):
|
||||
"""Escape unicode string in :py:obj:`pprint.pformat`"""
|
||||
|
||||
def __repr__(self):
|
||||
return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
|
||||
|
||||
|
||||
def get_unicode_flag(locale: babel.Locale):
|
||||
"""Determine a unicode flag (emoji) that fits to the ``locale``"""
|
||||
|
||||
emoji = lang2emoji.get(locale.language)
|
||||
if emoji:
|
||||
return emoji
|
||||
|
||||
if not locale.territory:
|
||||
return '\U0001F310'
|
||||
|
||||
emoji = lang2emoji.get(locale.territory.lower())
|
||||
if emoji:
|
||||
return emoji
|
||||
|
||||
try:
|
||||
c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
|
||||
c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
|
||||
# print("OK : %s --> %s%s" % (locale, c1, c2))
|
||||
except KeyError as exc:
|
||||
print("ERROR: %s --> %s" % (locale, exc))
|
||||
return None
|
||||
|
||||
return c1 + c2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
searxng_extra/update/update_external_bangs.py (new executable file, 143 lines)
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
|
||||
from :py:obj:`BANGS_URL`.
|
||||
|
||||
- :origin:`CI Update data ... <.github/workflows/data-update.yml>`
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import httpx
|
||||
|
||||
from searx.external_bang import LEAF_KEY
|
||||
from searx.data import data_dir
|
||||
|
||||
DATA_FILE = data_dir / 'external_bangs.json'
|
||||
|
||||
BANGS_URL = 'https://duckduckgo.com/bang.js'
|
||||
"""JSON file which contains the bangs."""
|
||||
|
||||
HTTPS_COLON = 'https:'
|
||||
HTTP_COLON = 'http:'
|
||||
|
||||
|
||||
def main():
|
||||
print(f'fetch bangs from {BANGS_URL}')
|
||||
response = httpx.get(BANGS_URL)
|
||||
response.raise_for_status()
|
||||
ddg_bangs = json.loads(response.content.decode())
|
||||
trie = parse_ddg_bangs(ddg_bangs)
|
||||
output = {
|
||||
'version': 0,
|
||||
'trie': trie,
|
||||
}
|
||||
with DATA_FILE.open('w', encoding="utf8") as f:
|
||||
json.dump(output, f, indent=4, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
def merge_when_no_leaf(node):
|
||||
"""Minimize the number of nodes
|
||||
|
||||
``A -> B -> C``
|
||||
|
||||
- ``B`` is child of ``A``
|
||||
- ``C`` is child of ``B``
|
||||
|
||||
If no ``C`` equals ``<LEAF_KEY>``, then each ``C`` is merged
into ``A``. For example (5 nodes)::
|
||||
|
||||
d -> d -> g -> <LEAF_KEY> (ddg)
|
||||
-> i -> g -> <LEAF_KEY> (dig)
|
||||
|
||||
becomes (3 nodes)::
|
||||
|
||||
d -> dg -> <LEAF_KEY>
|
||||
-> ig -> <LEAF_KEY>
|
||||
|
||||
"""
|
||||
restart = False
|
||||
if not isinstance(node, dict):
|
||||
return
|
||||
|
||||
# create a copy of the keys so node can be modified
|
||||
keys = list(node.keys())
|
||||
|
||||
for key in keys:
|
||||
if key == LEAF_KEY:
|
||||
continue
|
||||
|
||||
value = node[key]
|
||||
value_keys = list(value.keys())
|
||||
if LEAF_KEY not in value_keys:
|
||||
for value_key in value_keys:
|
||||
node[key + value_key] = value[value_key]
|
||||
merge_when_no_leaf(node[key + value_key])
|
||||
del node[key]
|
||||
restart = True
|
||||
else:
|
||||
merge_when_no_leaf(value)
|
||||
|
||||
if restart:
|
||||
merge_when_no_leaf(node)
|
||||
|
||||
|
||||
def optimize_leaf(parent, parent_key, node):
|
||||
if not isinstance(node, dict):
|
||||
return
|
||||
|
||||
if len(node) == 1 and LEAF_KEY in node and parent is not None:
|
||||
parent[parent_key] = node[LEAF_KEY]
|
||||
else:
|
||||
for key, value in node.items():
|
||||
optimize_leaf(node, key, value)
|
||||
|
||||
|
||||
def parse_ddg_bangs(ddg_bangs):
|
||||
bang_trie = {}
|
||||
bang_urls = {}
|
||||
|
||||
for bang_definition in ddg_bangs:
|
||||
# bang_list
|
||||
bang_url = bang_definition['u']
|
||||
if '{{{s}}}' not in bang_url:
|
||||
# ignore invalid bang
|
||||
continue
|
||||
|
||||
bang_url = bang_url.replace('{{{s}}}', chr(2))
|
||||
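# chr(2) marks the spot where the search terms are inserted later; chr(1)
# (see bang_def_output below) separates the URL from the bang's rank.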
|
||||
# only for the https protocol: "https://example.com" becomes "//example.com"
|
||||
if bang_url.startswith(HTTPS_COLON + '//'):
|
||||
bang_url = bang_url[len(HTTPS_COLON) :]
|
||||
|
||||
#
|
||||
if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON) :] in bang_urls:
|
||||
# if the bang_url uses the http:// protocol, and the same URL exists in https://
|
||||
# then reuse the https:// bang definition. (written //example.com)
|
||||
bang_def_output = bang_urls[bang_url[len(HTTP_COLON) :]]
|
||||
else:
|
||||
# normal use case : new http:// URL or https:// URL (without "https:", see above)
|
||||
bang_rank = str(bang_definition['r'])
|
||||
bang_def_output = bang_url + chr(1) + bang_rank
|
||||
bang_def_output = bang_urls.setdefault(bang_url, bang_def_output)
|
||||
|
||||
bang_urls[bang_url] = bang_def_output
|
||||
|
||||
# bang name
|
||||
bang = bang_definition['t']
|
||||
|
||||
# bang_trie
|
||||
t = bang_trie
|
||||
for bang_letter in bang:
|
||||
t = t.setdefault(bang_letter, {})
|
||||
t = t.setdefault(LEAF_KEY, bang_def_output)
|
||||
|
||||
# optimize the trie
|
||||
merge_when_no_leaf(bang_trie)
|
||||
optimize_leaf(None, None, bang_trie)
|
||||
|
||||
return bang_trie
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
searxng_extra/update/update_firefox_version.py (new executable file, 80 lines)
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fetch firefox useragent signatures
|
||||
|
||||
Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ...
|
||||
<.github/workflows/data-update.yml>`).
|
||||
|
||||
"""
|
||||
# pylint: disable=use-dict-literal
|
||||
|
||||
import json
|
||||
import re
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from packaging.version import parse
|
||||
|
||||
import requests
|
||||
from lxml import html
|
||||
from searx.data import data_dir
|
||||
|
||||
DATA_FILE = data_dir / 'useragents.json'
|
||||
|
||||
URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
|
||||
RELEASE_PATH = '/pub/firefox/releases/'
|
||||
|
||||
NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
|
||||
# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$')
|
||||
# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$')
|
||||
|
||||
#
|
||||
useragents = {
|
||||
# fmt: off
|
||||
"versions": (),
|
||||
"os": ('Windows NT 10.0; Win64; x64',
|
||||
'X11; Linux x86_64'),
|
||||
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}",
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
|
||||
def fetch_firefox_versions():
|
||||
resp = requests.get(URL, timeout=2.0)
|
||||
if resp.status_code != 200:
|
||||
# pylint: disable=broad-exception-raised
|
||||
raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) # type: ignore
|
||||
dom = html.fromstring(resp.text)
|
||||
versions = []
|
||||
|
||||
for link in dom.xpath('//a/@href'):
|
||||
url = urlparse(urljoin(URL, link))
|
||||
path = url.path
|
||||
if path.startswith(RELEASE_PATH):
|
||||
version = path[len(RELEASE_PATH) : -1]
|
||||
if NORMAL_REGEX.match(version):
|
||||
versions.append(parse(version))
|
||||
|
||||
list.sort(versions, reverse=True)
|
||||
return versions
|
||||
|
||||
|
||||
def fetch_firefox_last_versions():
|
||||
versions = fetch_firefox_versions()
|
||||
|
||||
result = []
|
||||
major_last = versions[0].major
|
||||
major_list = (major_last, major_last - 1)
|
||||
for version in versions:
|
||||
major_current = version.major
|
||||
minor_current = version.minor
|
||||
if major_current in major_list:
|
||||
user_agent_version = f'{major_current}.{minor_current}'
|
||||
if user_agent_version not in result:
|
||||
result.append(user_agent_version)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
useragents["versions"] = fetch_firefox_last_versions()
|
||||
with DATA_FILE.open('w', encoding='utf-8') as f:
|
||||
json.dump(useragents, f, indent=4, sort_keys=True, ensure_ascii=False)
|
||||
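As an illustration of the ``ua`` template above (the version number is
hypothetical; the real version list is fetched from ftp.mozilla.org at runtime):

.. code:: python

    ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"
    print(ua.format(os='Windows NT 10.0; Win64; x64', version='121.0'))
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0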
searxng_extra/update/update_locales.py (new executable file, 102 lines)
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Update locale names in :origin:`searx/data/locales.json` used by
|
||||
:ref:`searx.locales`
|
||||
|
||||
- :py:obj:`searx.locales.RTL_LOCALES`
|
||||
- :py:obj:`searx.locales.LOCALE_NAMES`
|
||||
"""
|
||||
# pylint: disable=invalid-name
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Set
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import babel
|
||||
import babel.languages
|
||||
import babel.core
|
||||
|
||||
from searx import searx_dir
|
||||
from searx.locales import (
|
||||
ADDITIONAL_TRANSLATIONS,
|
||||
LOCALE_BEST_MATCH,
|
||||
get_translation_locales,
|
||||
)
|
||||
|
||||
LOCALE_DATA_FILE = Path(searx_dir) / 'data' / 'locales.json'
|
||||
TRANSLATIONS_FOLDER = Path(searx_dir) / 'translations'
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
LOCALE_NAMES = {}
|
||||
RTL_LOCALES: Set[str] = set()
|
||||
|
||||
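# Locale names are collected from three sources, in this order: the manually
# maintained ADDITIONAL_TRANSLATIONS, the tags listed in LOCALE_BEST_MATCH and
# the translation catalogs returned by get_translation_locales().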
for tag, descr in ADDITIONAL_TRANSLATIONS.items():
|
||||
locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
|
||||
LOCALE_NAMES[tag] = descr
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(tag)
|
||||
|
||||
for tag in LOCALE_BEST_MATCH:
|
||||
descr = LOCALE_NAMES.get(tag)
|
||||
if not descr:
|
||||
locale = babel.Locale.parse(tag, sep='-')
|
||||
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(tag)
|
||||
|
||||
for tr_locale in get_translation_locales():
|
||||
sxng_tag = tr_locale.replace('_', '-')
|
||||
descr = LOCALE_NAMES.get(sxng_tag)
|
||||
if not descr:
|
||||
locale = babel.Locale.parse(tr_locale)
|
||||
LOCALE_NAMES[sxng_tag] = get_locale_descr(locale, tr_locale)
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(sxng_tag)
|
||||
|
||||
content = {
|
||||
"LOCALE_NAMES": LOCALE_NAMES,
|
||||
"RTL_LOCALES": sorted(RTL_LOCALES),
|
||||
}
|
||||
|
||||
with LOCALE_DATA_FILE.open('w', encoding='utf-8') as f:
|
||||
json.dump(content, f, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
def get_locale_descr(locale: babel.Locale, tr_locale):
|
||||
"""Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR'
|
||||
|
||||
:param locale: instance of :py:class:`Locale`
|
||||
:param tr_locale: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*)
|
||||
"""
|
||||
|
||||
native_language, native_territory = _get_locale_descr(locale, tr_locale)
|
||||
english_language, english_territory = _get_locale_descr(locale, 'en')
|
||||
|
||||
if native_territory == english_territory:
|
||||
english_territory = None
|
||||
|
||||
if not native_territory and not english_territory:
|
||||
# no territory name
|
||||
if native_language == english_language:
|
||||
return native_language
|
||||
return native_language + ' (' + english_language + ')'
|
||||
|
||||
result = native_language + ', ' + native_territory + ' (' + english_language
|
||||
if english_territory:
|
||||
return result + ', ' + english_territory + ')'
|
||||
return result + ')'
|
||||
|
||||
|
||||
def _get_locale_descr(locale: babel.Locale, tr_locale: str) -> tuple[str, str]:
|
||||
language_name = locale.get_language_name(tr_locale).capitalize() # type: ignore
|
||||
if language_name and ('a' <= language_name[0] <= 'z'):
|
||||
language_name = language_name.capitalize()
|
||||
territory_name: str = locale.get_territory_name(tr_locale) # type: ignore
|
||||
return language_name, territory_name
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
searxng_extra/update/update_osm_keys_tags.py (new executable file, 214 lines)
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fetch OSM keys and tags.
|
||||
|
||||
To get the i18n names, the script uses the `Wikidata Query Service`_ instead of,
for example, the `OSM tags API`_ (side note: the change log from
map.atownsend.org.uk_ might be useful to normalize OSM tags).
|
||||
|
||||
Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
|
||||
<.github/workflows/data-update.yml>`).
|
||||
|
||||
.. _Wikidata Query Service: https://query.wikidata.org/
|
||||
.. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc
|
||||
.. _map.atownsend.org.uk: https://map.atownsend.org.uk/maps/map/changelog.html
|
||||
|
||||
:py:obj:`SPARQL_TAGS_REQUEST` :
|
||||
Wikidata SPARQL query that returns *type-categories* and *types*. The
|
||||
returned tag is ``Tag:{category}={type}`` (see :py:func:`get_tags`).
|
||||
Example:
|
||||
|
||||
- https://taginfo.openstreetmap.org/tags/building=house#overview
|
||||
- https://wiki.openstreetmap.org/wiki/Tag:building%3Dhouse
|
||||
at the bottom of the infobox (right side), there is a link to wikidata:
|
||||
https://www.wikidata.org/wiki/Q3947
|
||||
see property "OpenStreetMap tag or key" (P1282)
|
||||
- https://wiki.openstreetmap.org/wiki/Tag%3Abuilding%3Dbungalow
|
||||
https://www.wikidata.org/wiki/Q850107
|
||||
|
||||
:py:obj:`SPARQL_KEYS_REQUEST` :
|
||||
Wikidata SPARQL query that returns *keys*. Example with "payment":
|
||||
|
||||
- https://wiki.openstreetmap.org/wiki/Key%3Apayment
|
||||
at the bottom of infobox (right side), there is a link to wikidata:
|
||||
https://www.wikidata.org/wiki/Q1148747
|
||||
link made using the "OpenStreetMap tag or key" property (P1282)
|
||||
to be confirmed: is there exactly one wiki page per key?
|
||||
- https://taginfo.openstreetmap.org/keys/payment#values
|
||||
- https://taginfo.openstreetmap.org/keys/payment:cash#values
|
||||
|
||||
``rdfs:label`` get all the labels without language selection
|
||||
(as opposed to SERVICE ``wikibase:label``).
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import collections
|
||||
|
||||
from searx.network import set_timeout_for_thread
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
|
||||
from searx.data import data_dir
|
||||
|
||||
DATA_FILE = data_dir / 'osm_keys_tags.json'
|
||||
|
||||
set_loggers(wikidata, 'wikidata')
|
||||
|
||||
|
||||
SPARQL_TAGS_REQUEST = """
|
||||
SELECT ?tag ?item ?itemLabel WHERE {
|
||||
?item wdt:P1282 ?tag .
|
||||
?item rdfs:label ?itemLabel .
|
||||
FILTER(STRSTARTS(?tag, 'Tag'))
|
||||
}
|
||||
GROUP BY ?tag ?item ?itemLabel
|
||||
ORDER BY ?tag ?item ?itemLabel
|
||||
"""
|
||||
|
||||
SPARQL_KEYS_REQUEST = """
|
||||
SELECT ?key ?item ?itemLabel WHERE {
|
||||
?item wdt:P1282 ?key .
|
||||
?item rdfs:label ?itemLabel .
|
||||
FILTER(STRSTARTS(?key, 'Key'))
|
||||
}
|
||||
GROUP BY ?key ?item ?itemLabel
|
||||
ORDER BY ?key ?item ?itemLabel
|
||||
"""
|
||||
|
||||
LANGUAGES = [l[0].lower() for l in sxng_locales]
|
||||
|
||||
PRESET_KEYS = {
|
||||
('wikidata',): {'en': 'Wikidata'},
|
||||
('wikipedia',): {'en': 'Wikipedia'},
|
||||
('email',): {'en': 'Email'},
|
||||
('facebook',): {'en': 'Facebook'},
|
||||
('fax',): {'en': 'Fax'},
|
||||
('internet_access', 'ssid'): {'en': 'Wi-Fi'},
|
||||
}
|
||||
|
||||
INCLUDED_KEYS = {('addr',)}
|
||||
|
||||
|
||||
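# Keys and tags are stored as nested dicts; the translations of a key sit
# under the special '*' entry.  For example, the PRESET_KEYS entry
# ('internet_access', 'ssid') becomes:
#
#   {'internet_access': {'ssid': {'*': {'en': 'Wi-Fi'}}}}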
def get_preset_keys():
|
||||
results = collections.OrderedDict()
|
||||
for keys, value in PRESET_KEYS.items():
|
||||
r = results
|
||||
for k in keys:
|
||||
r = r.setdefault(k, {})
|
||||
r.setdefault('*', value)
|
||||
return results
|
||||
|
||||
|
||||
def get_keys():
|
||||
results = get_preset_keys()
|
||||
response = wikidata.send_wikidata_query(SPARQL_KEYS_REQUEST)
|
||||
|
||||
for key in response['results']['bindings']:
|
||||
keys = key['key']['value'].split(':')[1:]
|
||||
if keys[0] == 'currency' and len(keys) > 1:
|
||||
# special case in openstreetmap.py
|
||||
continue
|
||||
if keys[0] == 'contact' and len(keys) > 1:
|
||||
# label for the key "contact.email" is "Email"
|
||||
# whatever the language
|
||||
r = results.setdefault('contact', {})
|
||||
r[keys[1]] = {'*': {'en': keys[1]}}
|
||||
continue
|
||||
if tuple(keys) in PRESET_KEYS:
|
||||
# skip presets (already set above)
|
||||
continue
|
||||
if (
|
||||
get_key_rank(':'.join(keys)) is None
|
||||
and ':'.join(keys) not in VALUE_TO_LINK
|
||||
and tuple(keys) not in INCLUDED_KEYS
|
||||
):
|
||||
# keep only keys that will be displayed by openstreetmap.py
|
||||
continue
|
||||
label = key['itemLabel']['value'].lower()
|
||||
lang = key['itemLabel']['xml:lang']
|
||||
r = results
|
||||
for k in keys:
|
||||
r = r.setdefault(k, {})
|
||||
r = r.setdefault('*', {})
|
||||
if lang in LANGUAGES:
|
||||
r.setdefault(lang, label)
|
||||
|
||||
# special cases
|
||||
results['delivery']['covid19']['*'].clear()
|
||||
for k, v in results['delivery']['*'].items():
|
||||
results['delivery']['covid19']['*'][k] = v + ' (COVID19)'
|
||||
|
||||
results['opening_hours']['covid19']['*'].clear()
|
||||
for k, v in results['opening_hours']['*'].items():
|
||||
results['opening_hours']['covid19']['*'][k] = v + ' (COVID19)'
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def get_tags():
|
||||
results = collections.OrderedDict()
|
||||
response = wikidata.send_wikidata_query(SPARQL_TAGS_REQUEST)
|
||||
for tag in response['results']['bindings']:
|
||||
tag_names = tag['tag']['value'].split(':')[1].split('=')
|
||||
if len(tag_names) == 2:
|
||||
tag_category, tag_type = tag_names
|
||||
else:
|
||||
tag_category, tag_type = tag_names[0], ''
|
||||
label = tag['itemLabel']['value'].lower()
|
||||
lang = tag['itemLabel']['xml:lang']
|
||||
if lang in LANGUAGES:
|
||||
results.setdefault(tag_category, {}).setdefault(tag_type, {}).setdefault(lang, label)
|
||||
return results
|
||||
|
||||
|
||||
def optimize_data_lang(translations):
|
||||
language_to_delete = []
|
||||
# remove "zh-hk" entry if the value is the same as "zh"
|
||||
# same for "en-ca" / "en" etc...
|
||||
for language in translations:
|
||||
if '-' in language:
|
||||
base_language = language.split('-')[0]
|
||||
if translations.get(base_language) == translations.get(language):
|
||||
language_to_delete.append(language)
|
||||
|
||||
for language in language_to_delete:
|
||||
del translations[language]
|
||||
language_to_delete = []
|
||||
|
||||
# remove entries that have the same value than the "en" entry
|
||||
value_en = translations.get('en')
|
||||
if value_en:
|
||||
for language, value in translations.items():
|
||||
if language != 'en' and value == value_en:
|
||||
language_to_delete.append(language)
|
||||
|
||||
for language in language_to_delete:
|
||||
del translations[language]
|
||||
|
||||
|
||||
def optimize_tags(data):
|
||||
for v in data.values():
|
||||
for translations in v.values():
|
||||
optimize_data_lang(translations)
|
||||
return data
|
||||
|
||||
|
||||
def optimize_keys(data):
|
||||
for k, v in data.items():
|
||||
if k == '*':
|
||||
optimize_data_lang(v)
|
||||
elif isinstance(v, dict):
|
||||
optimize_keys(v)
|
||||
return data
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
set_timeout_for_thread(60)
|
||||
result = {
|
||||
'keys': optimize_keys(get_keys()),
|
||||
'tags': optimize_tags(get_tags()),
|
||||
}
|
||||
with DATA_FILE.open('w', encoding="utf8") as f:
|
||||
json.dump(result, f, indent=4, sort_keys=True, ensure_ascii=False)
|
||||
searxng_extra/update/update_pygments.py (new executable file, 72 lines)
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Update pygments style
|
||||
|
||||
Call this script after each upgrade of pygments
|
||||
|
||||
"""
|
||||
# pylint: disable=too-few-public-methods
|
||||
|
||||
from pathlib import Path
|
||||
import pygments
|
||||
from pygments.formatters.html import HtmlFormatter
|
||||
|
||||
from searx import searx_dir
|
||||
|
||||
LESS_FILE = Path(searx_dir).parent / 'client/simple/generated/pygments.less'
|
||||
|
||||
HEADER = f"""\
|
||||
/*
|
||||
this file is generated automatically by searxng_extra/update/update_pygments.py
|
||||
using pygments version {pygments.__version__}
|
||||
*/
|
||||
|
||||
"""
|
||||
|
||||
START_LIGHT_THEME = """
|
||||
.code-highlight {
|
||||
"""
|
||||
|
||||
END_LIGHT_THEME = """
|
||||
}
|
||||
"""
|
||||
|
||||
START_DARK_THEME = """
|
||||
.code-highlight-dark(){
|
||||
.code-highlight {
|
||||
"""
|
||||
|
||||
END_DARK_THEME = """
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
class Formatter(HtmlFormatter): # pylint: disable=missing-class-docstring
|
||||
@property
|
||||
def _pre_style(self):
|
||||
return 'line-height: 100%;'
|
||||
|
||||
def get_style_lines(self, arg=None):
|
||||
style_lines = []
|
||||
style_lines.extend(self.get_linenos_style_defs())
|
||||
style_lines.extend(self.get_background_style_defs(arg))
|
||||
style_lines.extend(self.get_token_style_defs(arg))
|
||||
return style_lines
|
||||
|
||||
|
||||
def generate_css(light_style, dark_style) -> str:
|
||||
css = HEADER + START_LIGHT_THEME
|
||||
for line in Formatter(style=light_style).get_style_lines():
|
||||
css += '\n ' + line
|
||||
css += END_LIGHT_THEME + START_DARK_THEME
|
||||
for line in Formatter(style=dark_style).get_style_lines():
|
||||
css += '\n ' + line
|
||||
css += END_DARK_THEME
|
||||
return css
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("update: %s" % LESS_FILE)
|
||||
with LESS_FILE.open('w', encoding='utf8') as f:
|
||||
f.write(generate_css('default', 'lightbulb'))
|
||||
searxng_extra/update/update_wikidata_units.py (new executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch units from :origin:`searx/engines/wikidata.py` engine.

Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).

"""

import json

from searx.engines import wikidata, set_loggers
from searx.data import data_dir
from searx.wikidata_units import fetch_units

DATA_FILE = data_dir / 'wikidata_units.json'
set_loggers(wikidata, 'wikidata')


if __name__ == '__main__':
    with DATA_FILE.open('w', encoding="utf8") as f:
        json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)