first commit

Iyas Altawil
2025-06-26 15:38:10 +03:30
commit e928faf6d2
899 changed files with 403713 additions and 0 deletions


@@ -0,0 +1,2 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

searxng_extra/docs_prebuild Executable file

@@ -0,0 +1,82 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Script that implements some prebuild tasks needed by target docs.prebuild
"""
import sys
import os.path
import time
from contextlib import contextmanager
from searx import settings, get_setting, locales
from searx.infopage import InfoPageSet, InfoPage
_doc_user = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'docs', 'user'))
def main():
locales.locales_initialize()
base_url = get_setting('server.base_url', None)
if base_url:
infopageset_ctx = _instance_infosetset_ctx(base_url)
else:
infopageset_ctx = _offline_infosetset_ctx()
with infopageset_ctx as infopageset:
for _, _, page in infopageset.iter_pages('en'):
fname = os.path.join(_doc_user, os.path.basename(page.fname))
with open(fname, 'w', encoding='utf-8') as f:
f.write(page.content)
class OfflinePage(InfoPage): # pylint: disable=missing-class-docstring
def get_ctx(self):
"""Jinja context to render :py:obj:`DocPage.content` for offline purpose (no
links to SearXNG instance)"""
ctx = super().get_ctx()
ctx['link'] = lambda name, url: '`%s`' % name
ctx['search'] = lambda query: '`%s`' % query
return ctx
@contextmanager
def _offline_infosetset_ctx():
yield InfoPageSet(OfflinePage)
@contextmanager
def _instance_infosetset_ctx(base_url):
# The url_for functions in the jinja templates need all routes to be
# registered in the Flask app.
settings['server']['secret_key'] = ''
from searx.webapp import app # pylint: disable=import-outside-toplevel
# Specify base_url so that url_for() works for base_urls. If base_url is
# specified, these values are given preference over Flask's defaults
# (see flaskfix.py).
with app.test_request_context(base_url=base_url):
yield InfoPageSet()
# The searx.webapp import above fires some HTTP requests, which is why we
# get a RuntimeError::
#
#     RuntimeError: The connection pool was closed while 1 HTTP \
#         requests/responses were still in-flight.
#
# Closing the network doesn't help ..
#     from searx.network import network
#     network.done()
# Waiting a few seconds before ending the command line was the only
# workaround found so far ..
time.sleep(3)
if __name__ == '__main__':
sys.exit(main())
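The offline context above swaps the ``link`` and ``search`` Jinja helpers for plain literals so the generated user docs don't point at a live instance. A minimal sketch (not part of the commit) of what those two lambdas produce when rendered through Jinja:

# sketch: the same lambdas as OfflinePage.get_ctx(), rendered through plain Jinja
from jinja2 import Template

ctx = {
    'link': lambda name, url: '`%s`' % name,    # drop the URL, keep a literal name
    'search': lambda query: '`%s`' % query,     # render the query as a literal
}
tpl = Template("See {{ link('About SearXNG', '/info/en/about') }} or try {{ search('time zone') }}.")
print(tpl.render(**ctx))    # See `About SearXNG` or try `time zone`.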

searxng_extra/standalone_searx.py Executable file

@@ -0,0 +1,177 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Script to run SearXNG from terminal.
DON'T USE THIS SCRIPT!!
.. danger::
Be warned, using the ``standalone_searx.py`` won't give you privacy!
On the contrary, this script behaves like a SearXNG server: your IP is
exposed and tracked by all active engines (google, bing, qwant, ... ), with
every query!
.. note::
This is an old and grumpy hack: SearXNG is a Flask application with a
client/server structure, which can't properly be turned into a command line
tool the way it was done here.
Getting categories without initializing the engines will only return `['general']`
>>> import searx.engines
... list(searx.engines.categories.keys())
['general']
>>> import searx.search
... searx.search.initialize()
... list(searx.engines.categories.keys())
['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map']
Example of how to use this script:
.. code:: bash
$ python3 searxng_extra/standalone_searx.py rain
""" # pylint: disable=line-too-long
import argparse
import sys
from datetime import datetime
from json import dumps
from typing import Any, Dict, List, Optional
import searx
import searx.preferences
import searx.query
import searx.search
import searx.webadapter
EngineCategoriesVar = Optional[List[str]]
def get_search_query(
args: argparse.Namespace, engine_categories: EngineCategoriesVar = None
) -> searx.search.SearchQuery:
"""Get search results for the query"""
if engine_categories is None:
engine_categories = list(searx.engines.categories.keys())
try:
category = args.category.decode('utf-8')
except AttributeError:
category = args.category
form = {
"q": args.query,
"categories": category,
"pageno": str(args.pageno),
"language": args.lang,
"time_range": args.timerange,
}
preferences = searx.preferences.Preferences(['simple'], engine_categories, searx.engines.engines, [])
preferences.key_value_settings['safesearch'].parse(args.safesearch)
search_query = searx.webadapter.get_search_query_from_webapp(preferences, form)[0]
return search_query
def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Remove parsed url from dict."""
for result in results:
del result['parsed_url']
return results
def json_serial(obj: Any) -> Any:
"""JSON serializer for objects not serializable by default json code.
:raise TypeError: raised when **obj** is not serializable
"""
if isinstance(obj, datetime):
serial = obj.isoformat()
return serial
if isinstance(obj, bytes):
return obj.decode('utf8')
if isinstance(obj, set):
return list(obj)
raise TypeError("Type ({}) not serializable".format(type(obj)))
def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
"""Get result from parsed arguments."""
result_container = searx.search.Search(search_query).search()
result_container_json = {
"search": {
"q": search_query.query,
"pageno": search_query.pageno,
"lang": search_query.lang,
"safesearch": search_query.safesearch,
"timerange": search_query.time_range,
},
"results": no_parsed_url(result_container.get_ordered_results()),
"infoboxes": result_container.infoboxes,
"suggestions": list(result_container.suggestions),
"answers": list(result_container.answers),
"paging": result_container.paging,
"number_of_results": result_container.number_of_results,
}
return result_container_json
def parse_argument(
args: Optional[List[str]] = None, category_choices: EngineCategoriesVar = None
) -> argparse.Namespace:
"""Parse command line.
:raise SystemExit: Query argument required on `args`
Examples:
>>> import importlib
... # load module
... spec = importlib.util.spec_from_file_location(
... 'utils.standalone_searx', 'utils/standalone_searx.py')
... sas = importlib.util.module_from_spec(spec)
... spec.loader.exec_module(sas)
... sas.parse_argument()
usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]]
query
SystemExit: 2
>>> sas.parse_argument(['rain'])
Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None)
""" # noqa: E501
if not category_choices:
category_choices = list(searx.engines.categories.keys())
parser = argparse.ArgumentParser(description='Standalone searx.')
parser.add_argument('query', type=str, help='Text query')
parser.add_argument(
'--category', type=str, nargs='?', choices=category_choices, default='general', help='Search category'
)
parser.add_argument('--lang', type=str, nargs='?', default='all', help='Search language')
parser.add_argument('--pageno', type=int, nargs='?', default=1, help='Page number starting from 1')
parser.add_argument(
'--safesearch',
type=str,
nargs='?',
choices=['0', '1', '2'],
default='0',
help='Safe content filter from none to strict',
)
parser.add_argument(
'--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'], help='Filter by time range'
)
return parser.parse_args(args)
if __name__ == '__main__':
settings_engines = searx.settings['engines']
searx.search.load_engines(settings_engines)
engine_cs = list(searx.engines.categories.keys())
prog_args = parse_argument(category_choices=engine_cs)
searx.search.initialize_network(settings_engines, searx.settings['outgoing'])
searx.search.check_network_configuration()
searx.search.initialize_metrics([engine['name'] for engine in settings_engines])
searx.search.initialize_processors(settings_engines)
search_q = get_search_query(prog_args, engine_categories=engine_cs)
res_dict = to_dict(search_q)
sys.stdout.write(dumps(res_dict, sort_keys=True, indent=4, ensure_ascii=False, default=json_serial))
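The final ``dumps()`` call relies on ``json_serial`` as its ``default=`` hook because the result container holds datetimes, bytes and sets. A small illustration (assuming ``json_serial`` from above is in scope):

from datetime import datetime
from json import dumps

sample = {"fetched": datetime(2025, 6, 26, 15, 38), "engines": {"bing"}, "raw": b"rain"}
print(dumps(sample, default=json_serial, sort_keys=True))
# {"engines": ["bing"], "fetched": "2025-06-26T15:38:00", "raw": "rain"}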


@@ -0,0 +1,2 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring


@@ -0,0 +1,32 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script saves `Ahmia's blacklist`_ for onion sites.
Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).
.. _Ahmia's blacklist: https://ahmia.fi/blacklist/
"""
# pylint: disable=use-dict-literal
import requests
from searx.data import data_dir
DATA_FILE = data_dir / 'ahmia_blacklist.txt'
URL = 'https://ahmia.fi/blacklist/banned/'
def fetch_ahmia_blacklist():
resp = requests.get(URL, timeout=3.0)
if resp.status_code != 200:
# pylint: disable=broad-exception-raised
raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code) # type: ignore
return resp.text.split()
if __name__ == '__main__':
blacklist = fetch_ahmia_blacklist()
blacklist.sort()
with DATA_FILE.open("w", encoding='utf-8') as f:
f.write('\n'.join(blacklist))
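The generated file is a plain newline-separated list, so reading it back is a one-liner. A hedged sketch (assuming the file was written by the script above):

from searx.data import data_dir

# one blacklist entry per line, as written above
ahmia_blacklist = set((data_dir / 'ahmia_blacklist.txt').read_text(encoding='utf-8').split())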


@@ -0,0 +1,155 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=invalid-name
import re
import unicodedata
import json
from searx.locales import LOCALE_NAMES, locales_initialize
from searx.engines import wikidata, set_loggers
from searx.data.currencies import CurrenciesDB
set_loggers(wikidata, 'wikidata')
locales_initialize()
# ORDER BY (with all the query fields) is important to keep a deterministic result order
# so multiple invocations of this script don't change currencies.json
SARQL_REQUEST = """
SELECT DISTINCT ?iso4217 ?unit ?unicode ?label ?alias WHERE {
?item wdt:P498 ?iso4217; rdfs:label ?label.
OPTIONAL { ?item skos:altLabel ?alias FILTER (LANG (?alias) = LANG(?label)). }
OPTIONAL { ?item wdt:P5061 ?unit. }
OPTIONAL { ?item wdt:P489 ?symbol.
?symbol wdt:P487 ?unicode. }
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
FILTER(LANG(?label) IN (%LANGUAGES_SPARQL%)).
}
ORDER BY ?iso4217 ?unit ?unicode ?label ?alias
"""
# ORDER BY (with all the query fields) is important to keep a deterministic result order
# so multiple invocations of this script don't change currencies.json
SPARQL_WIKIPEDIA_NAMES_REQUEST = """
SELECT DISTINCT ?iso4217 ?article_name WHERE {
?item wdt:P498 ?iso4217 .
?article schema:about ?item ;
schema:name ?article_name ;
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ]
MINUS { ?item wdt:P582 ?end_data . } # Ignore money with an end date
MINUS { ?item wdt:P31/wdt:P279* wd:Q15893266 . } # Ignore "former entity" (obsolete currency)
FILTER(LANG(?article_name) IN (%LANGUAGES_SPARQL%)).
}
ORDER BY ?iso4217 ?article_name
"""
LANGUAGES = LOCALE_NAMES.keys()
LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
def remove_accents(name):
return unicodedata.normalize('NFKD', name).lower()
def remove_extra(name):
for c in ('(', ':'):
if c in name:
name = name.split(c)[0].strip()
return name
def _normalize_name(name):
name = re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' '))
name = remove_extra(name)
return name
def add_currency_name(db, name, iso4217, normalize_name=True):
db_names = db['names']
if normalize_name:
name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, [])
if iso4217 not in iso4217_set:
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
def wikidata_request_result_iterator(request):
result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL), timeout=20)
if result is not None:
yield from result['results']['bindings']
def fetch_db():
db = {
'names': {},
'iso4217': {},
}
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value']
article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217)
add_currency_label(db, article_name, iso4217, article_lang)
for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value']
if 'label' in r:
label = r['label']['value']
label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217)
add_currency_label(db, label, iso4217, label_lang)
if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217)
if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
return db
def main():
db = fetch_db()
# static
add_currency_name(db, "euro", 'EUR')
add_currency_name(db, "euros", 'EUR')
add_currency_name(db, "dollar", 'USD')
add_currency_name(db, "dollars", 'USD')
add_currency_name(db, "peso", 'MXN')
add_currency_name(db, "pesos", 'MXN')
# reduce memory usage:
# replace lists with one item by the item. see
# searx.search.processors.online_currency.name_to_iso4217
for name in db['names']:
if len(db['names'][name]) == 1:
db['names'][name] = db['names'][name][0]
with CurrenciesDB.json_file.open('w', encoding='utf8') as f:
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False)
if __name__ == '__main__':
main()
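Because of the memory optimization above, a value in ``db['names']`` is either a single ISO 4217 code or a list of candidate codes. A sketch of the lookup this enables (my own illustration, assuming ``_normalize_name`` from above is in scope; the real lookup lives in searx.search.processors.online_currency):

def name_to_iso4217_sketch(db, name):
    # normalize the same way the names were stored, then unwrap single-item values
    entry = db['names'].get(_normalize_name(name))
    if entry is None:
        return None
    return entry[0] if isinstance(entry, list) else entry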


@@ -0,0 +1,371 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch website description from websites and from
:origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/engine_descriptions.json`.
"""
# pylint: disable=invalid-name, global-statement
import json
from urllib.parse import urlparse
from os.path import join
from lxml.html import fromstring
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, searx_useragent
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
import searx.network
from searx.data import data_dir
DATA_FILE = data_dir / 'engine_descriptions.json'
set_loggers(wikidata, 'wikidata')
locales_initialize()
# you can run the query in https://query.wikidata.org
# replace %IDS% by Wikidata entities separated by spaces with the prefix wd:
# for example wd:Q182496 wd:Q1540899
# replace %LANGUAGES_SPARQL% by languages
SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name ?article ?lang
WHERE {
hint:Query hint:optimizer "None".
VALUES ?item { %IDS% }
?article schema:about ?item ;
schema:inLanguage ?lang ;
schema:name ?name ;
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
FILTER(?lang in (%LANGUAGES_SPARQL%)) .
FILTER (!CONTAINS(?name, ':')) .
}
ORDER BY ?item ?lang
"""
SPARQL_DESCRIPTION = """
SELECT DISTINCT ?item ?itemDescription
WHERE {
VALUES ?item { %IDS% }
?item schema:description ?itemDescription .
FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
}
ORDER BY ?itemLang
"""
NOT_A_DESCRIPTION = [
'web site',
'site web',
'komputa serĉilo',
'interreta serĉilo',
'bilaketa motor',
'web search engine',
'wikimedia täpsustuslehekülg',
]
SKIP_ENGINE_SOURCE = [
# fmt: off
('gitlab', 'wikidata')
# descriptions are about wikipedia disambiguation pages
# fmt: on
]
WIKIPEDIA_LANGUAGES = {}
LANGUAGES_SPARQL = ''
IDS = None
WIKIPEDIA_LANGUAGE_VARIANTS = {'zh_Hant': 'zh-tw'}
descriptions = {}
wd_to_engine_name = {}
def normalize_description(description):
for c in [chr(c) for c in range(0, 31)]:
description = description.replace(c, ' ')
description = ' '.join(description.strip().split())
return description
def update_description(engine_name, lang, description, source, replace=True):
if not isinstance(description, str):
return
description = normalize_description(description)
if description.lower() == engine_name.lower():
return
if description.lower() in NOT_A_DESCRIPTION:
return
if (engine_name, source) in SKIP_ENGINE_SOURCE:
return
if ' ' not in description:
# skip single-word descriptions (like "website")
return
if replace or lang not in descriptions[engine_name]:
descriptions[engine_name][lang] = [description, source]
def get_wikipedia_summary(wikipedia_url, searxng_locale):
# get the REST API URL from the HTML URL
# Headers
headers = {'User-Agent': searx_useragent()}
if searxng_locale in WIKIPEDIA_LANGUAGE_VARIANTS:
headers['Accept-Language'] = WIKIPEDIA_LANGUAGE_VARIANTS.get(searxng_locale)
# URL path : from HTML URL to REST API URL
parsed_url = urlparse(wikipedia_url)
# remove the /wiki/ prefix
article_name = parsed_url.path.split('/wiki/')[1]
# article_name is already encoded but not the / which is required for the REST API call
encoded_article_name = article_name.replace('/', '%2F')
path = '/api/rest_v1/page/summary/' + encoded_article_name
wikipedia_rest_url = parsed_url._replace(path=path).geturl()
try:
response = searx.network.get(wikipedia_rest_url, headers=headers, timeout=10)
response.raise_for_status()
except Exception as e: # pylint: disable=broad-except
print(" ", wikipedia_url, e)
return None
api_result = json.loads(response.text)
return api_result.get('extract')
def get_website_description(url, lang1, lang2=None):
headers = {
'User-Agent': gen_useragent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Sec-GPC': '1',
'Cache-Control': 'max-age=0',
}
if lang1 is not None:
lang_list = [lang1]
if lang2 is not None:
lang_list.append(lang2)
headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
try:
response = searx.network.get(url, headers=headers, timeout=10)
response.raise_for_status()
except Exception: # pylint: disable=broad-except
return (None, None)
try:
html = fromstring(response.text)
except ValueError:
html = fromstring(response.content)
description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
if not description:
description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
if not description:
description = extract_text(html.xpath('/html/head/title'))
lang = extract_text(html.xpath('/html/@lang'))
if lang is None and len(lang1) > 0:
lang = lang1
lang = detect_language(description) or lang or 'en'
lang = lang.split('_')[0]
lang = lang.split('-')[0]
return (lang, description)
def initialize():
global IDS, LANGUAGES_SPARQL
searx.search.initialize()
wikipedia_engine = searx.engines.engines['wikipedia']
locale2lang = {'nl-BE': 'nl'}
for sxng_ui_lang in LOCALE_NAMES:
sxng_ui_alias = locale2lang.get(sxng_ui_lang, sxng_ui_lang)
wiki_lang = None
if sxng_ui_alias in wikipedia_engine.traits.custom['WIKIPEDIA_LANGUAGES']:
wiki_lang = sxng_ui_alias
if not wiki_lang:
wiki_lang = wikipedia_engine.traits.get_language(sxng_ui_alias)
if not wiki_lang:
print(f"WIKIPEDIA_LANGUAGES missing {sxng_ui_lang}")
continue
WIKIPEDIA_LANGUAGES[sxng_ui_lang] = wiki_lang
LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
for engine_name, engine in searx.engines.engines.items():
descriptions[engine_name] = {}
wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
if wikidata_id is not None:
wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)
IDS = ' '.join(list(map(lambda wd_id: 'wd:' + wd_id, wd_to_engine_name.keys())))
def fetch_wikidata_descriptions():
print('Fetching wikidata descriptions')
searx.network.set_timeout_for_thread(60)
result = wikidata.send_wikidata_query(
SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
)
if result is not None:
for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
wikidata_lang = binding['itemDescription']['xml:lang']
desc = binding['itemDescription']['value']
for engine_name in wd_to_engine_name[wikidata_id]:
for searxng_locale in LOCALE_NAMES:
if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
continue
print(
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
f"/ len(wikidata_desc): {len(desc)}",
)
update_description(engine_name, searxng_locale, desc, 'wikidata')
def fetch_wikipedia_descriptions():
print('Fetching wikipedia descriptions')
result = wikidata.send_wikidata_query(
SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
)
if result is not None:
for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
wikidata_lang = binding['name']['xml:lang']
wikipedia_url = binding['article']['value'] # for example the URL https://de.wikipedia.org/wiki/PubMed
for engine_name in wd_to_engine_name[wikidata_id]:
for searxng_locale in LOCALE_NAMES:
if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
continue
desc = get_wikipedia_summary(wikipedia_url, searxng_locale)
if not desc:
continue
print(
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
f"/ len(wikipedia_desc): {len(desc)}",
)
update_description(engine_name, searxng_locale, desc, 'wikipedia')
def normalize_url(url):
url = url.replace('{language}', 'en')
url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
url = url.replace('https://api.', 'https://')
return url
def fetch_website_description(engine_name, website):
print(f"- fetch website descr: {engine_name} / {website}")
default_lang, default_description = get_website_description(website, None, None)
if default_lang is None or default_description is None:
# the front page can't be fetched: skip this engine
return
# order the language list so that the most common languages come first
languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
languages = languages + [l for l in LOCALE_NAMES if l not in languages]
previous_matched_lang = None
previous_count = 0
for lang in languages:
if lang in descriptions[engine_name]:
continue
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None:
continue
# check if desc changed with the different lang values
if fetched_lang == previous_matched_lang:
previous_count += 1
if previous_count == 6:
# the website returned the same description for 6 different
# Accept-Language values: stop now
break
else:
previous_matched_lang = fetched_lang
previous_count = 0
# Don't trust the value of fetched_lang: some websites return
# inappropriate values, for example bing-images::
#
# requested lang: zh-Hans-CN / fetched lang: ceb / desc: 查看根据您的兴趣量身定制的提要
#
# The lang ceb is "Cebuano" but the description is given in zh-Hans-CN
print(
f" engine: {engine_name:20} / requested lang:{lang:7}"
f" / fetched lang: {fetched_lang:7} / len(desc): {len(desc)}"
)
matched_lang = match_locale(fetched_lang, LOCALE_NAMES.keys(), fallback=lang)
update_description(engine_name, matched_lang, desc, website, replace=False)
def fetch_website_descriptions():
print('Fetching website descriptions')
for engine_name, engine in searx.engines.engines.items():
website = getattr(engine, "about", {}).get('website')
if website is None and hasattr(engine, "search_url"):
website = normalize_url(getattr(engine, "search_url"))
if website is None and hasattr(engine, "base_url"):
website = normalize_url(getattr(engine, "base_url"))
if website is not None:
fetch_website_description(engine_name, website)
def get_engine_descriptions_filename():
return join(join(searx_dir, "data"), "engine_descriptions.json")
def get_output():
"""
From descriptions[engine][language] = [description, source]
To
* output[language][engine] = description_and_source
* description_and_source can be:
* [description, source]
* description (if source = "wikipedia")
* [f"engine:lang", "ref"] (reference to another existing description)
"""
output = {locale: {} for locale in LOCALE_NAMES}
seen_descriptions = {}
for engine_name, lang_descriptions in descriptions.items():
for language, description in lang_descriptions.items():
if description[0] in seen_descriptions:
ref = seen_descriptions[description[0]]
description = [f'{ref[0]}:{ref[1]}', 'ref']
else:
seen_descriptions[description[0]] = (engine_name, language)
if description[1] == 'wikipedia':
description = description[0]
output.setdefault(language, {}).setdefault(engine_name, description)
return output
def main():
initialize()
fetch_wikidata_descriptions()
fetch_wikipedia_descriptions()
fetch_website_descriptions()
output = get_output()
with DATA_FILE.open('w', encoding='utf8') as f:
f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
if __name__ == "__main__":
main()
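``get_output()`` deduplicates identical descriptions by storing ``[f'{engine}:{lang}', 'ref']`` entries, so a consumer has to chase those references back to a concrete value. A hedged sketch of that resolution (illustration only, not the commit's code):

def resolve_description(output, lang, engine_name):
    """Follow ['<engine>:<lang>', 'ref'] indirections to a concrete entry."""
    entry = output[lang][engine_name]
    if isinstance(entry, list) and entry[1] == 'ref':
        ref_engine, ref_lang = entry[0].rsplit(':', 1)
        return resolve_description(output, ref_lang, ref_engine)
    # either a plain string (wikipedia source) or [description, source]
    return entry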


@@ -0,0 +1,199 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py`
:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
Persistence of engines traits, fetched from the engines.
:origin:`searx/languages.py`
Is generated from intersecting each engine's supported traits.
The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`
"""
# pylint: disable=invalid-name
from unicodedata import lookup
from pathlib import Path
from pprint import pformat
import babel
from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap
# Output files.
languages_file = Path(searx_dir) / 'sxng_locales.py'
languages_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes used for the search language/region.
.. hint::
Don't modify this file, this file is generated by::
./manage data.traits
'''
sxng_locales = (
"""
languages_file_footer = """,
)
'''
A list of five-element tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (\U0001F310)
.. code:: python
('en', 'English', '', 'English', '\U0001f310'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
..
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
:meta hide-value:
'''
"""
lang2emoji = {
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
'jp': '\U0001F1EF\U0001F1F5', # Japanese
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
'he': '\U0001F1EE\U0001F1F1', # Hebrew
}
def main():
load_engines(settings['engines'])
# traits_map = EngineTraitsMap.from_data()
traits_map = fetch_traits_map()
sxng_tag_list = filter_locales(traits_map)
write_languages_file(sxng_tag_list)
def fetch_traits_map():
"""Fetches supported languages for each engine and writes json file with those."""
network.set_timeout_for_thread(10.0)
def log(msg):
print(msg)
traits_map = EngineTraitsMap.fetch_traits(log=log)
print("fetched properties from %s engines" % len(traits_map))
print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE)
traits_map.save_data()
return traits_map
def filter_locales(traits_map: EngineTraitsMap):
"""Filter language & region tags by a threshold."""
min_eng_per_region = 18
min_eng_per_lang = 22
_ = {}
for eng in traits_map.values():
for reg in eng.regions.keys():
_[reg] = _.get(reg, 0) + 1
regions = set(k for k, v in _.items() if v >= min_eng_per_region)
lang_from_region = set(k.split('-')[0] for k in regions)
_ = {}
for eng in traits_map.values():
for lang in eng.languages.keys():
# ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
# are already counted by the existence of 'zh', 'sr' or 'pa')
if '_' in lang:
# print("ignore %s" % lang)
continue
_[lang] = _.get(lang, 0) + 1
languages = set(k for k, v in _.items() if v >= min_eng_per_lang)
sxng_tag_list = set()
sxng_tag_list.update(regions)
sxng_tag_list.update(lang_from_region)
sxng_tag_list.update(languages)
return sxng_tag_list
def write_languages_file(sxng_tag_list):
language_codes = []
for sxng_tag in sorted(sxng_tag_list):
sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
flag = get_unicode_flag(sxng_locale) or ''
item = (
sxng_tag,
sxng_locale.get_language_name().title(), # type: ignore
sxng_locale.get_territory_name() or '',
sxng_locale.english_name.split(' (')[0] if sxng_locale.english_name else '',
UnicodeEscape(flag),
)
language_codes.append(item)
language_codes = tuple(language_codes)
with languages_file.open('w', encoding='utf-8') as new_file:
file_content = "{header} {language_codes}{footer}".format(
header=languages_file_header,
language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
footer=languages_file_footer,
)
new_file.write(file_content)
new_file.close()
class UnicodeEscape(str):
"""Escape unicode string in :py:obj:`pprint.pformat`"""
def __repr__(self):
return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
def get_unicode_flag(locale: babel.Locale):
"""Determine a unicode flag (emoji) that fits to the ``locale``"""
emoji = lang2emoji.get(locale.language)
if emoji:
return emoji
if not locale.territory:
return '\U0001F310'
emoji = lang2emoji.get(locale.territory.lower())
if emoji:
return emoji
try:
c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
# print("OK : %s --> %s%s" % (locale, c1, c2))
except KeyError as exc:
print("ERROR: %s --> %s" % (locale, exc))
return None
return c1 + c2
if __name__ == "__main__":
main()
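``get_unicode_flag()`` builds region flags from the two REGIONAL INDICATOR SYMBOL letters of the territory code. A quick standalone illustration of that mechanism (the territory code is an arbitrary example):

from unicodedata import lookup

territory = 'CA'  # arbitrary example territory
flag = ''.join(lookup('REGIONAL INDICATOR SYMBOL LETTER ' + ch) for ch in territory)
print(flag)  # 🇨🇦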


@@ -0,0 +1,143 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
from :py:obj:`BANGS_URL`.
- :origin:`CI Update data ... <.github/workflows/data-update.yml>`
"""
import json
import httpx
from searx.external_bang import LEAF_KEY
from searx.data import data_dir
DATA_FILE = data_dir / 'external_bangs.json'
BANGS_URL = 'https://duckduckgo.com/bang.js'
"""JSON file which contains the bangs."""
HTTPS_COLON = 'https:'
HTTP_COLON = 'http:'
def main():
print(f'fetch bangs from {BANGS_URL}')
response = httpx.get(BANGS_URL)
response.raise_for_status()
ddg_bangs = json.loads(response.content.decode())
trie = parse_ddg_bangs(ddg_bangs)
output = {
'version': 0,
'trie': trie,
}
with DATA_FILE.open('w', encoding="utf8") as f:
json.dump(output, f, indent=4, sort_keys=True, ensure_ascii=False)
def merge_when_no_leaf(node):
"""Minimize the number of nodes
``A -> B -> C``
- ``B`` is child of ``A``
- ``C`` is child of ``B``
If no ``C`` equals ``<LEAF_KEY>``, then each ``C`` is merged
into ``A``. For example (5 nodes)::
d -> d -> g -> <LEAF_KEY> (ddg)
-> i -> g -> <LEAF_KEY> (dig)
becomes (3 nodes)::
d -> dg -> <LEAF_KEY>
-> ig -> <LEAF_KEY>
"""
restart = False
if not isinstance(node, dict):
return
# create a copy of the keys so node can be modified
keys = list(node.keys())
for key in keys:
if key == LEAF_KEY:
continue
value = node[key]
value_keys = list(value.keys())
if LEAF_KEY not in value_keys:
for value_key in value_keys:
node[key + value_key] = value[value_key]
merge_when_no_leaf(node[key + value_key])
del node[key]
restart = True
else:
merge_when_no_leaf(value)
if restart:
merge_when_no_leaf(node)
def optimize_leaf(parent, parent_key, node):
if not isinstance(node, dict):
return
if len(node) == 1 and LEAF_KEY in node and parent is not None:
parent[parent_key] = node[LEAF_KEY]
else:
for key, value in node.items():
optimize_leaf(node, key, value)
def parse_ddg_bangs(ddg_bangs):
bang_trie = {}
bang_urls = {}
for bang_definition in ddg_bangs:
# bang_list
bang_url = bang_definition['u']
if '{{{s}}}' not in bang_url:
# ignore invalid bang
continue
bang_url = bang_url.replace('{{{s}}}', chr(2))
# only for the https protocol: "https://example.com" becomes "//example.com"
if bang_url.startswith(HTTPS_COLON + '//'):
bang_url = bang_url[len(HTTPS_COLON) :]
#
if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON) :] in bang_urls:
# if the bang_url uses the http:// protocol and the same URL exists with https://,
# then reuse the https:// bang definition (written //example.com)
bang_def_output = bang_urls[bang_url[len(HTTP_COLON) :]]
else:
# normal use case : new http:// URL or https:// URL (without "https:", see above)
bang_rank = str(bang_definition['r'])
bang_def_output = bang_url + chr(1) + bang_rank
bang_def_output = bang_urls.setdefault(bang_url, bang_def_output)
bang_urls[bang_url] = bang_def_output
# bang name
bang = bang_definition['t']
# bang_trie
t = bang_trie
for bang_letter in bang:
t = t.setdefault(bang_letter, {})
t = t.setdefault(LEAF_KEY, bang_def_output)
# optimize the trie
merge_when_no_leaf(bang_trie)
optimize_leaf(None, None, bang_trie)
return bang_trie
if __name__ == '__main__':
main()
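The two optimizations can be illustrated on a tiny hand-built trie (a sketch with made-up URLs and ranks, assuming ``merge_when_no_leaf`` and ``optimize_leaf`` from above are in scope): the nested single-character nodes collapse and the ``<LEAF_KEY>`` wrappers are flattened away, so a lookup for ``ddg`` lands directly on its value.

from searx.external_bang import LEAF_KEY

# made-up bang definitions: chr(2) marks the query placeholder, chr(1) separates the rank
trie = {'d': {'d': {'g': {LEAF_KEY: '//duckduckgo.com/?q=\x02\x011'}},
              'i': {'g': {LEAF_KEY: '//example.org/dig?q=\x02\x012'}}}}
merge_when_no_leaf(trie)
optimize_leaf(None, None, trie)
print(trie)  # {'ddg': '//duckduckgo.com/?q=\x02\x011', 'dig': '//example.org/dig?q=\x02\x012'}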


@@ -0,0 +1,80 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch firefox useragent signatures
Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=use-dict-literal
import json
import re
from urllib.parse import urlparse, urljoin
from packaging.version import parse
import requests
from lxml import html
from searx.data import data_dir
DATA_FILE = data_dir / 'useragents.json'
URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
RELEASE_PATH = '/pub/firefox/releases/'
NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$')
# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$')
#
useragents = {
# fmt: off
"versions": (),
"os": ('Windows NT 10.0; Win64; x64',
'X11; Linux x86_64'),
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}",
# fmt: on
}
def fetch_firefox_versions():
resp = requests.get(URL, timeout=2.0)
if resp.status_code != 200:
# pylint: disable=broad-exception-raised
raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) # type: ignore
dom = html.fromstring(resp.text)
versions = []
for link in dom.xpath('//a/@href'):
url = urlparse(urljoin(URL, link))
path = url.path
if path.startswith(RELEASE_PATH):
version = path[len(RELEASE_PATH) : -1]
if NORMAL_REGEX.match(version):
versions.append(parse(version))
versions.sort(reverse=True)
return versions
def fetch_firefox_last_versions():
versions = fetch_firefox_versions()
result = []
major_last = versions[0].major
major_list = (major_last, major_last - 1)
for version in versions:
major_current = version.major
minor_current = version.minor
if major_current in major_list:
user_agent_version = f'{major_current}.{minor_current}'
if user_agent_version not in result:
result.append(user_agent_version)
return result
if __name__ == '__main__':
useragents["versions"] = fetch_firefox_last_versions()
with DATA_FILE.open('w', encoding='utf-8') as f:
json.dump(useragents, f, indent=4, sort_keys=True, ensure_ascii=False)
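Data like the ``useragents`` dict written above feeds browser-like User-Agent strings elsewhere in SearXNG. A sketch of how its pieces combine (assuming the versions list has been filled as in ``__main__``; the version number in the comment is only illustrative):

import random

ua = useragents["ua"].format(
    os=random.choice(useragents["os"]),
    version=random.choice(useragents["versions"]),
)
# e.g. 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0'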


@@ -0,0 +1,102 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update locale names in :origin:`searx/data/locales.json` used by
:ref:`searx.locales`
- :py:obj:`searx.locales.RTL_LOCALES`
- :py:obj:`searx.locales.LOCALE_NAMES`
"""
# pylint: disable=invalid-name
from __future__ import annotations
from typing import Set
import json
from pathlib import Path
import babel
import babel.languages
import babel.core
from searx import searx_dir
from searx.locales import (
ADDITIONAL_TRANSLATIONS,
LOCALE_BEST_MATCH,
get_translation_locales,
)
LOCALE_DATA_FILE = Path(searx_dir) / 'data' / 'locales.json'
TRANSLATIONS_FOLDER = Path(searx_dir) / 'translations'
def main():
LOCALE_NAMES = {}
RTL_LOCALES: Set[str] = set()
for tag, descr in ADDITIONAL_TRANSLATIONS.items():
locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
LOCALE_NAMES[tag] = descr
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
for tag in LOCALE_BEST_MATCH:
descr = LOCALE_NAMES.get(tag)
if not descr:
locale = babel.Locale.parse(tag, sep='-')
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
for tr_locale in get_translation_locales():
sxng_tag = tr_locale.replace('_', '-')
descr = LOCALE_NAMES.get(sxng_tag)
if not descr:
locale = babel.Locale.parse(tr_locale)
LOCALE_NAMES[sxng_tag] = get_locale_descr(locale, tr_locale)
if locale.text_direction == 'rtl':
RTL_LOCALES.add(sxng_tag)
content = {
"LOCALE_NAMES": LOCALE_NAMES,
"RTL_LOCALES": sorted(RTL_LOCALES),
}
with LOCALE_DATA_FILE.open('w', encoding='utf-8') as f:
json.dump(content, f, indent=2, sort_keys=True, ensure_ascii=False)
def get_locale_descr(locale: babel.Locale, tr_locale):
"""Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR'
:param locale: instance of :py:class:`Locale`
:param tr_locale: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*)
"""
native_language, native_territory = _get_locale_descr(locale, tr_locale)
english_language, english_territory = _get_locale_descr(locale, 'en')
if native_territory == english_territory:
english_territory = None
if not native_territory and not english_territory:
# no territory name
if native_language == english_language:
return native_language
return native_language + ' (' + english_language + ')'
result = native_language + ', ' + native_territory + ' (' + english_language
if english_territory:
return result + ', ' + english_territory + ')'
return result + ')'
def _get_locale_descr(locale: babel.Locale, tr_locale: str) -> tuple[str, str]:
language_name = locale.get_language_name(tr_locale).capitalize() # type: ignore
if language_name and ('a' <= language_name[0] <= 'z'):
language_name = language_name.capitalize()
territory_name: str = locale.get_territory_name(tr_locale) # type: ignore
return language_name, territory_name
if __name__ == "__main__":
main()
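The RTL detection above leans on babel's ``text_direction`` attribute, which returns the CSS shorthand form. A tiny check of what it yields:

import babel

print(babel.Locale.parse('ar').text_direction)  # rtl
print(babel.Locale.parse('de').text_direction)  # ltr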


@@ -0,0 +1,214 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch OSM keys and tags.
To get the i18n names, the script uses the `Wikidata Query Service`_ instead of,
for example, the `OSM tags API`_ (side note: the actual change log from
map.atownsend.org.uk_ might be useful to normalize OSM tags).
Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
.. _Wikidata Query Service: https://query.wikidata.org/
.. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc
.. _map.atownsend.org.uk: https://map.atownsend.org.uk/maps/map/changelog.html
:py:obj:`SPARQL_TAGS_REQUEST` :
Wikidata SPARQL query that returns *type-categories* and *types*. The
returned tag is ``Tag:{category}={type}`` (see :py:func:`get_tags`).
Example:
- https://taginfo.openstreetmap.org/tags/building=house#overview
- https://wiki.openstreetmap.org/wiki/Tag:building%3Dhouse
at the bottom of the infobox (right side), there is a link to wikidata:
https://www.wikidata.org/wiki/Q3947
see property "OpenStreetMap tag or key" (P1282)
- https://wiki.openstreetmap.org/wiki/Tag%3Abuilding%3Dbungalow
https://www.wikidata.org/wiki/Q850107
:py:obj:`SPARQL_KEYS_REQUEST` :
Wikidata SPARQL query that returns *keys*. Example with "payment":
- https://wiki.openstreetmap.org/wiki/Key%3Apayment
at the bottom of the infobox (right side), there is a link to wikidata:
https://www.wikidata.org/wiki/Q1148747
the link is made using the "OpenStreetMap tag or key" property (P1282)
to be confirmed: is there exactly one wiki page per key?
- https://taginfo.openstreetmap.org/keys/payment#values
- https://taginfo.openstreetmap.org/keys/payment:cash#values
``rdfs:label`` gets all the labels without language selection
(as opposed to SERVICE ``wikibase:label``).
"""
import json
import collections
from searx.network import set_timeout_for_thread
from searx.engines import wikidata, set_loggers
from searx.sxng_locales import sxng_locales
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
from searx.data import data_dir
DATA_FILE = data_dir / 'osm_keys_tags.json'
set_loggers(wikidata, 'wikidata')
SPARQL_TAGS_REQUEST = """
SELECT ?tag ?item ?itemLabel WHERE {
?item wdt:P1282 ?tag .
?item rdfs:label ?itemLabel .
FILTER(STRSTARTS(?tag, 'Tag'))
}
GROUP BY ?tag ?item ?itemLabel
ORDER BY ?tag ?item ?itemLabel
"""
SPARQL_KEYS_REQUEST = """
SELECT ?key ?item ?itemLabel WHERE {
?item wdt:P1282 ?key .
?item rdfs:label ?itemLabel .
FILTER(STRSTARTS(?key, 'Key'))
}
GROUP BY ?key ?item ?itemLabel
ORDER BY ?key ?item ?itemLabel
"""
LANGUAGES = [l[0].lower() for l in sxng_locales]
PRESET_KEYS = {
('wikidata',): {'en': 'Wikidata'},
('wikipedia',): {'en': 'Wikipedia'},
('email',): {'en': 'Email'},
('facebook',): {'en': 'Facebook'},
('fax',): {'en': 'Fax'},
('internet_access', 'ssid'): {'en': 'Wi-Fi'},
}
INCLUDED_KEYS = {('addr',)}
def get_preset_keys():
results = collections.OrderedDict()
for keys, value in PRESET_KEYS.items():
r = results
for k in keys:
r = r.setdefault(k, {})
r.setdefault('*', value)
return results
def get_keys():
results = get_preset_keys()
response = wikidata.send_wikidata_query(SPARQL_KEYS_REQUEST)
for key in response['results']['bindings']:
keys = key['key']['value'].split(':')[1:]
if keys[0] == 'currency' and len(keys) > 1:
# special case in openstreetmap.py
continue
if keys[0] == 'contact' and len(keys) > 1:
# label for the key "contact.email" is "Email"
# whatever the language
r = results.setdefault('contact', {})
r[keys[1]] = {'*': {'en': keys[1]}}
continue
if tuple(keys) in PRESET_KEYS:
# skip presets (already set above)
continue
if (
get_key_rank(':'.join(keys)) is None
and ':'.join(keys) not in VALUE_TO_LINK
and tuple(keys) not in INCLUDED_KEYS
):
# keep only keys that will be displayed by openstreetmap.py
continue
label = key['itemLabel']['value'].lower()
lang = key['itemLabel']['xml:lang']
r = results
for k in keys:
r = r.setdefault(k, {})
r = r.setdefault('*', {})
if lang in LANGUAGES:
r.setdefault(lang, label)
# special cases
results['delivery']['covid19']['*'].clear()
for k, v in results['delivery']['*'].items():
results['delivery']['covid19']['*'][k] = v + ' (COVID19)'
results['opening_hours']['covid19']['*'].clear()
for k, v in results['opening_hours']['*'].items():
results['opening_hours']['covid19']['*'][k] = v + ' (COVID19)'
return results
def get_tags():
results = collections.OrderedDict()
response = wikidata.send_wikidata_query(SPARQL_TAGS_REQUEST)
for tag in response['results']['bindings']:
tag_names = tag['tag']['value'].split(':')[1].split('=')
if len(tag_names) == 2:
tag_category, tag_type = tag_names
else:
tag_category, tag_type = tag_names[0], ''
label = tag['itemLabel']['value'].lower()
lang = tag['itemLabel']['xml:lang']
if lang in LANGUAGES:
results.setdefault(tag_category, {}).setdefault(tag_type, {}).setdefault(lang, label)
return results
def optimize_data_lang(translations):
language_to_delete = []
# remove "zh-hk" entry if the value is the same as "zh"
# same for "en-ca" / "en" etc...
for language in translations:
if '-' in language:
base_language = language.split('-')[0]
if translations.get(base_language) == translations.get(language):
language_to_delete.append(language)
for language in language_to_delete:
del translations[language]
language_to_delete = []
# remove entries that have the same value as the "en" entry
value_en = translations.get('en')
if value_en:
for language, value in translations.items():
if language != 'en' and value == value_en:
language_to_delete.append(language)
for language in language_to_delete:
del translations[language]
def optimize_tags(data):
for v in data.values():
for translations in v.values():
optimize_data_lang(translations)
return data
def optimize_keys(data):
for k, v in data.items():
if k == '*':
optimize_data_lang(v)
elif isinstance(v, dict):
optimize_keys(v)
return data
if __name__ == '__main__':
set_timeout_for_thread(60)
result = {
'keys': optimize_keys(get_keys()),
'tags': optimize_tags(get_tags()),
}
with DATA_FILE.open('w', encoding="utf8") as f:
json.dump(result, f, indent=4, sort_keys=True, ensure_ascii=False)
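``optimize_data_lang()`` trims redundant translations in place. A worked toy example of both passes (dropping regional variants equal to their base language, then entries equal to the English value), assuming the function from above is in scope:

translations = {'en': 'house', 'en-ca': 'house', 'de': 'Haus', 'nl': 'house'}
optimize_data_lang(translations)
print(translations)  # {'en': 'house', 'de': 'Haus'}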


@@ -0,0 +1,72 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update pygments style
Call this script after each upgrade of pygments
"""
# pylint: disable=too-few-public-methods
from pathlib import Path
import pygments
from pygments.formatters.html import HtmlFormatter
from searx import searx_dir
LESS_FILE = Path(searx_dir).parent / 'client/simple/generated/pygments.less'
HEADER = f"""\
/*
this file is generated automatically by searxng_extra/update/update_pygments.py
using pygments version {pygments.__version__}
*/
"""
START_LIGHT_THEME = """
.code-highlight {
"""
END_LIGHT_THEME = """
}
"""
START_DARK_THEME = """
.code-highlight-dark(){
.code-highlight {
"""
END_DARK_THEME = """
}
}
"""
class Formatter(HtmlFormatter): # pylint: disable=missing-class-docstring
@property
def _pre_style(self):
return 'line-height: 100%;'
def get_style_lines(self, arg=None):
style_lines = []
style_lines.extend(self.get_linenos_style_defs())
style_lines.extend(self.get_background_style_defs(arg))
style_lines.extend(self.get_token_style_defs(arg))
return style_lines
def generat_css(light_style, dark_style) -> str:
css = HEADER + START_LIGHT_THEME
for line in Formatter(style=light_style).get_style_lines():
css += '\n ' + line
css += END_LIGHT_THEME + START_DARK_THEME
for line in Formatter(style=dark_style).get_style_lines():
css += '\n ' + line
css += END_DARK_THEME
return css
if __name__ == '__main__':
print("update: %s" % LESS_FILE)
with LESS_FILE.open('w', encoding='utf8') as f:
f.write(generat_css('default', 'lightbulb'))
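The generated LESS nests the Pygments rules under the selectors defined above. A short sketch (reusing this script's ``Formatter`` and theme markers) of previewing a few light-theme rules without writing the file:

fmt = Formatter(style='default')
preview = HEADER + START_LIGHT_THEME + '\n    '.join(fmt.get_style_lines()[:3]) + END_LIGHT_THEME
print(preview)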


@@ -0,0 +1,22 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch units from :origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).
"""
import json
from searx.engines import wikidata, set_loggers
from searx.data import data_dir
from searx.wikidata_units import fetch_units
DATA_FILE = data_dir / 'wikidata_units.json'
set_loggers(wikidata, 'wikidata')
if __name__ == '__main__':
with DATA_FILE.open('w', encoding="utf8") as f:
json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)