first commit

Iyas Altawil
2025-06-26 15:38:10 +03:30
commit e928faf6d2
899 changed files with 403713 additions and 0 deletions

81
searx/data/__init__.py Normal file

@@ -0,0 +1,81 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module holds the *data* created by::

    make data.all

"""

from __future__ import annotations

__all__ = ["ahmia_blacklist_loader"]

import json
import typing

from .core import log, data_dir
from .currencies import CurrenciesDB
from .tracker_patterns import TrackerPatternsDB

CURRENCIES: CurrenciesDB
USER_AGENTS: dict[str, typing.Any]
EXTERNAL_URLS: dict[str, typing.Any]
WIKIDATA_UNITS: dict[str, typing.Any]
EXTERNAL_BANGS: dict[str, typing.Any]
OSM_KEYS_TAGS: dict[str, typing.Any]
ENGINE_DESCRIPTIONS: dict[str, typing.Any]
ENGINE_TRAITS: dict[str, typing.Any]
LOCALES: dict[str, typing.Any]
TRACKER_PATTERNS: TrackerPatternsDB

lazy_globals = {
    "CURRENCIES": CurrenciesDB(),
    "USER_AGENTS": None,
    "EXTERNAL_URLS": None,
    "WIKIDATA_UNITS": None,
    "EXTERNAL_BANGS": None,
    "OSM_KEYS_TAGS": None,
    "ENGINE_DESCRIPTIONS": None,
    "ENGINE_TRAITS": None,
    "LOCALES": None,
    "TRACKER_PATTERNS": TrackerPatternsDB(),
}

data_json_files = {
    "USER_AGENTS": "useragents.json",
    "EXTERNAL_URLS": "external_urls.json",
    "WIKIDATA_UNITS": "wikidata_units.json",
    "EXTERNAL_BANGS": "external_bangs.json",
    "OSM_KEYS_TAGS": "osm_keys_tags.json",
    "ENGINE_DESCRIPTIONS": "engine_descriptions.json",
    "ENGINE_TRAITS": "engine_traits.json",
    "LOCALES": "locales.json",
}


def __getattr__(name):
    # lazy init of the global objects
    if name not in lazy_globals:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    data = lazy_globals[name]
    if data is not None:
        return data

    log.debug("init searx.data.%s", name)

    with open(data_dir / data_json_files[name], encoding='utf-8') as f:
        lazy_globals[name] = json.load(f)

    return lazy_globals[name]


def ahmia_blacklist_loader():
    """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
    names.  The MD5 values are fetched by::

        searxng_extra/update/update_ahmia_blacklist.py

    This function is used by :py:mod:`searx.plugins.ahmia_filter`.

    """
    with open(data_dir / 'ahmia_blacklist.txt', encoding='utf-8') as f:
        return f.read().split()
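
The module above relies on PEP 562's module-level __getattr__ to defer JSON parsing until a data set is first accessed. A minimal standalone sketch of that pattern follows; every name in it is invented for the demo, not part of searx:

# lazy_demo.py -- sketch of the PEP 562 pattern used by searx/data/__init__.py
_lazy = {"ANSWER": None}  # None marks "not loaded yet"

def _expensive_load():
    # stands in for json.load(open(...)) in the real module
    return {"value": 42}

def __getattr__(name):
    if name not in _lazy:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    if _lazy[name] is None:
        _lazy[name] = _expensive_load()   # load once, cache forever
    return _lazy[name]

With the real module, `import searx.data` stays cheap; the first read of e.g. `searx.data.USER_AGENTS` triggers the json.load, and later reads return the cached dict.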

47814
searx/data/ahmia_blacklist.txt Normal file

File diff suppressed because it is too large

29
searx/data/core.py Normal file

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

from __future__ import annotations

import pathlib

from searx import logger
from searx.cache import ExpireCacheCfg, ExpireCacheSQLite

log = logger.getChild("data")

data_dir = pathlib.Path(__file__).parent

_DATA_CACHE: ExpireCacheSQLite = None  # type: ignore


def get_cache():
    global _DATA_CACHE  # pylint: disable=global-statement

    if _DATA_CACHE is None:
        _DATA_CACHE = ExpireCacheSQLite.build_cache(
            ExpireCacheCfg(
                name="DATA_CACHE",
                # MAX_VALUE_LEN=1024 * 200,  # max. 200kB length for a *serialized* value
                # MAXHOLD_TIME=60 * 60 * 24 * 7 * 4,  # 4 weeks
            )
        )
    return _DATA_CACHE
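
get_cache() is a lazily built module-level singleton: every data class added in this commit shares one SQLite-backed cache. A usage sketch, based only on the cache calls visible in currencies.py below (ctx namespaces entries; "demo" is an illustrative context name):

from searx.data.core import get_cache

cache = get_cache()   # first call builds the cache, later calls reuse the same instance
cache.set(key="EUR", value="Euro", ctx="demo", expire=None)
print(cache.get(key="EUR", default=None, ctx="demo"))  # -> "Euro"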

15863
searx/data/currencies.json Normal file

File diff suppressed because it is too large

55
searx/data/currencies.py Normal file

@@ -0,0 +1,55 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Simple implementation to store currencies data in a SQL database."""

from __future__ import annotations

__all__ = ["CurrenciesDB"]

import json
import pathlib

from .core import get_cache, log


class CurrenciesDB:
    # pylint: disable=missing-class-docstring

    ctx_names = "data_currencies_names"
    ctx_iso4217 = "data_currencies_iso4217"

    json_file = pathlib.Path(__file__).parent / "currencies.json"

    def __init__(self):
        self.cache = get_cache()

    def init(self):
        if self.cache.properties("currencies loaded") != "OK":
            self.load()
        self.cache.properties.set("currencies loaded", "OK")
        # F I X M E:
        #     do we need maintenance here? .. remember: the database is stored
        #     in /tmp and will be rebuilt after a reboot anyway

    def load(self):
        log.debug("init searx.data.CURRENCIES")
        with open(self.json_file, encoding="utf-8") as f:
            data_dict = json.load(f)
        for key, value in data_dict["names"].items():
            self.cache.set(key=key, value=value, ctx=self.ctx_names, expire=None)
        for key, value in data_dict["iso4217"].items():
            self.cache.set(key=key, value=value, ctx=self.ctx_iso4217, expire=None)

    def name_to_iso4217(self, name):
        self.init()

        ret_val = self.cache.get(key=name, default=name, ctx=self.ctx_names)
        if isinstance(ret_val, list):
            # if there are several alternatives, use the last one in the list
            ret_val = ret_val[-1]
        return ret_val

    def iso4217_to_name(self, iso4217, language):
        self.init()

        iso4217_languages: dict = self.cache.get(key=iso4217, default={}, ctx=self.ctx_iso4217)
        return iso4217_languages.get(language, iso4217)
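
How CurrenciesDB is meant to be consumed, going through the lazy module attribute from __init__.py above; the printed values are examples and depend on what currencies.json actually ships:

from searx.data import CURRENCIES   # resolved lazily by searx.data.__getattr__

print(CURRENCIES.name_to_iso4217("euro"))       # e.g. "EUR"; unknown names are returned unchanged
print(CURRENCIES.iso4217_to_name("EUR", "de"))  # localized name, falls back to "EUR"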

searx/data/engine_descriptions.json Normal file

File diff suppressed because one or more lines are too long

searx/data/engine_traits.json Normal file

File diff suppressed because it is too large

19076
searx/data/external_bangs.json Normal file

File diff suppressed because it is too large

156
searx/data/external_urls.json Normal file

@@ -0,0 +1,156 @@
{
    "facebook_profile": {
        "category_name": "Facebook",
        "url_name": "Facebook profile",
        "urls": {
            "default": "https://facebook.com/$1"
        }
    },
    "youtube_channel": {
        "category_name": "YouTube",
        "url_name": "YouTube channel",
        "urls": {
            "default": "https://www.youtube.com/channel/$1"
        }
    },
    "youtube_video": {
        "category_name": "YouTube",
        "url_name": "YouTube video",
        "urls": {
            "default": "https://www.youtube.com/watch?v=$1"
        }
    },
    "twitter_profile": {
        "category_name": "Twitter",
        "url_name": "Twitter profile",
        "urls": {
            "default": "https://twitter.com/$1"
        }
    },
    "instagram_profile": {
        "category_name": "Instagram",
        "url_name": "Instagram profile",
        "urls": {
            "default": "https://www.instagram.com/$1"
        }
    },
    "imdb_title": {
        "category_name": "IMDB",
        "url_name": "IMDB title",
        "urls": {
            "default": "https://www.imdb.com/title/$1"
        }
    },
    "imdb_name": {
        "category_name": "IMDB",
        "url_name": "IMDB name",
        "urls": {
            "default": "https://www.imdb.com/name/$1"
        }
    },
    "imdb_character": {
        "category_name": "IMDB",
        "url_name": "IMDB character",
        "urls": {
            "default": "https://www.imdb.com/character/$1"
        }
    },
    "imdb_company": {
        "category_name": "IMDB",
        "url_name": "IMDB company",
        "urls": {
            "default": "https://www.imdb.com/company/$1"
        }
    },
    "imdb_event": {
        "category_name": "IMDB",
        "url_name": "IMDB event",
        "urls": {
            "default": "https://www.imdb.com/event/$1"
        }
    },
    "rotten_tomatoes": {
        "category_name": "Rotten tomatoes",
        "url_name": "Rotten tomatoes title",
        "urls": {
            "default": "https://www.rottentomatoes.com/$1"
        }
    },
    "spotify_artist_id": {
        "category_name": "Spotify",
        "url_name": "Spotify artist",
        "urls": {
            "default": "https://open.spotify.com/artist/$1"
        }
    },
    "itunes_artist_id": {
        "category_name": "iTunes",
        "url_name": "iTunes artist",
        "urls": {
            "default": "https://music.apple.com/us/artist/$1"
        }
    },
    "soundcloud_id": {
        "category_name": "Soundcloud",
        "url_name": "Soundcloud artist",
        "urls": {
            "default": "https://soundcloud.com/$1"
        }
    },
    "netflix_id": {
        "category_name": "Netflix",
        "url_name": "Netflix movie",
        "urls": {
            "default": "https://www.netflix.com/watch/$1"
        }
    },
    "github_profile": {
        "category_name": "Github",
        "url_name": "Github profile",
        "urls": {
            "default": "https://github.com/$1"
        }
    },
    "musicbrainz_artist": {
        "category_name": "Musicbrainz",
        "url_name": "Musicbrainz artist",
        "urls": {
            "default": "http://musicbrainz.org/artist/$1"
        }
    },
    "musicbrainz_work": {
        "category_name": "Musicbrainz",
        "url_name": "Musicbrainz work",
        "urls": {
            "default": "http://musicbrainz.org/work/$1"
        }
    },
    "musicbrainz_release_group": {
        "category_name": "Musicbrainz",
        "url_name": "Musicbrainz release group",
        "urls": {
            "default": "http://musicbrainz.org/release-group/$1"
        }
    },
    "musicbrainz_label": {
        "category_name": "Musicbrainz",
        "url_name": "Musicbrainz label",
        "urls": {
            "default": "http://musicbrainz.org/label/$1"
        }
    },
    "wikimedia_image": {
        "category_name": "Wikipedia",
        "url_name": "Wikipedia image",
        "urls": {
            "default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400"
        }
    },
    "map": {
        "category_name": "Map",
        "url_name": "geo map",
        "urls": {
            "default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M"
        }
    }
}
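
Each entry maps an external-reference kind to a URL template whose $1 placeholder receives the item's identifier (the map entry uses ${latitude}/${longitude}/${zoom} instead). A minimal sketch of the substitution; the helper name is invented for illustration, not the function searx itself uses:

def resolve_external_url(template: str, value: str) -> str:
    # replace the $1 placeholder from external_urls.json with a concrete id
    return template.replace("$1", value)

print(resolve_external_url("https://www.youtube.com/watch?v=$1", "dQw4w9WgXcQ"))
# -> https://www.youtube.com/watch?v=dQw4w9WgXcQ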

BIN
searx/data/lid.176.ftz Executable file

Binary file not shown.

71
searx/data/locales.json Normal file

@@ -0,0 +1,71 @@
{
    "LOCALE_NAMES": {
        "af": "Afrikaans",
        "ar": "العربية (Arabic)",
        "bg": "Български (Bulgarian)",
        "bn": "বাংলা (Bangla)",
        "bo": "བོད་སྐད་ (Tibetan)",
        "ca": "Català (Catalan)",
        "cs": "Čeština (Czech)",
        "cy": "Cymraeg (Welsh)",
        "da": "Dansk (Danish)",
        "de": "Deutsch (German)",
        "dv": "ދިވެހި (Dhivehi)",
        "el-GR": "Ελληνικά, Ελλάδα (Greek, Greece)",
        "en": "English",
        "eo": "Esperanto",
        "es": "Español (Spanish)",
        "et": "Eesti (Estonian)",
        "eu": "Euskara (Basque)",
        "fa-IR": "فارسی, ایران (Persian, Iran)",
        "fi": "Suomi (Finnish)",
        "fil": "Filipino",
        "fr": "Français (French)",
        "ga": "Gaeilge (Irish)",
        "gl": "Galego (Galician)",
        "he": "עברית (Hebrew)",
        "hr": "Hrvatski (Croatian)",
        "hu": "Magyar (Hungarian)",
        "ia": "Interlingua",
        "id": "Indonesia (Indonesian)",
        "it": "Italiano (Italian)",
        "ja": "日本語 (Japanese)",
        "ko": "한국어 (Korean)",
        "lt": "Lietuvių (Lithuanian)",
        "lv": "Latviešu (Latvian)",
        "ml": "മലയാളം (Malayalam)",
        "ms": "Melayu (Malay)",
        "nb-NO": "Norsk bokmål, Norge (Norwegian bokmål, Norway)",
        "nl": "Nederlands (Dutch)",
        "nl-BE": "Nederlands, België (Dutch, Belgium)",
        "oc": "Occitan",
        "pa": "ਪੰਜਾਬੀ (Punjabi)",
        "pap": "Papiamento",
        "pl": "Polski (Polish)",
        "pt": "Português (Portuguese)",
        "pt-BR": "Português, Brasil (Portuguese, Brazil)",
        "ro": "Română (Romanian)",
        "ru": "Русский (Russian)",
        "si": "සිංහල (Sinhala)",
        "sk": "Slovenčina (Slovak)",
        "sl": "Slovenščina (Slovenian)",
        "sr": "Српски (Serbian)",
        "sv": "Svenska (Swedish)",
        "szl": "Ślōnski (Silesian)",
        "ta": "தமிழ் (Tamil)",
        "te": "తెలుగు (Telugu)",
        "th": "ไทย (Thai)",
        "tr": "Türkçe (Turkish)",
        "tt": "Татар (Tatar)",
        "uk": "Українська (Ukrainian)",
        "vi": "Tiếng việt (Vietnamese)",
        "zh-HK": "中文, 中國香港特別行政區 (Chinese, Hong Kong SAR China)",
        "zh-Hans-CN": "中文, 中国 (Chinese, China)",
        "zh-Hant-TW": "中文, 台灣 (Chinese, Taiwan)"
    },
    "RTL_LOCALES": [
        "ar",
        "fa-IR",
        "he"
    ]
}
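
LOCALE_NAMES maps a locale tag to its native (and English) display name, and RTL_LOCALES lists the tags rendered right-to-left. A small sketch of reading the file directly; it assumes the script runs from the repository root:

import json, pathlib

locales = json.loads(pathlib.Path("searx/data/locales.json").read_text(encoding="utf-8"))
for tag in locales["RTL_LOCALES"]:
    print(tag, "->", locales["LOCALE_NAMES"][tag])  # ar, fa-IR and he render right-to-left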

63227
searx/data/osm_keys_tags.json Normal file

File diff suppressed because it is too large

142
searx/data/tracker_patterns.py Normal file

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Simple implementation to store TrackerPatterns data in a SQL database."""

from __future__ import annotations

import typing

__all__ = ["TrackerPatternsDB"]

import re
import pathlib
from collections.abc import Iterator
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

import httpx

from searx.data.core import get_cache, log

RuleType = tuple[str, list[str], list[str]]


class TrackerPatternsDB:
    # pylint: disable=missing-class-docstring

    ctx_name = "data_tracker_patterns"
    json_file = pathlib.Path(__file__).parent / "tracker_patterns.json"

    CLEAR_LIST_URL = [
        # ClearURL rule lists, the first one that responds with HTTP 200 is used
        "https://rules1.clearurls.xyz/data.minify.json",
        "https://rules2.clearurls.xyz/data.minify.json",
        "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json",
    ]

    class Fields:
        # pylint: disable=too-few-public-methods, invalid-name
        url_regexp: typing.Final = 0  # URL (regular expression) match condition of the link
        url_ignore: typing.Final = 1  # URL (regular expression) to ignore
        del_args: typing.Final = 2  # list of URL arguments (regular expression) to delete

    def __init__(self):
        self.cache = get_cache()

    def init(self):
        if self.cache.properties("tracker_patterns loaded") != "OK":
            self.load()
        self.cache.properties.set("tracker_patterns loaded", "OK")
        # F I X M E:
        #     do we need maintenance here? .. remember: the database is stored
        #     in /tmp and will be rebuilt after a reboot anyway

    def load(self):
        log.debug("init searx.data.TRACKER_PATTERNS")
        for rule in self.iter_clear_list():
            self.add(rule)

    def add(self, rule: RuleType):
        self.cache.set(
            key=rule[self.Fields.url_regexp],
            value=(
                rule[self.Fields.url_ignore],
                rule[self.Fields.del_args],
            ),
            ctx=self.ctx_name,
            expire=None,
        )

    def rules(self) -> Iterator[RuleType]:
        self.init()
        for key, value in self.cache.pairs(ctx=self.ctx_name):
            yield key, value[0], value[1]

    def iter_clear_list(self) -> Iterator[RuleType]:
        resp = None
        for url in self.CLEAR_LIST_URL:
            resp = httpx.get(url, timeout=3)
            if resp.status_code == 200:
                break
            log.warning(f"TRACKER_PATTERNS: ClearURL ignore HTTP {resp.status_code} {url}")

        if resp is None or resp.status_code != 200:
            # no server in the list responded with HTTP 200
            log.error("TRACKER_PATTERNS: failed fetching ClearURL rule lists")
            return

        for rule in resp.json()["providers"].values():
            yield (
                rule["urlPattern"].replace("\\\\", "\\"),  # fix JavaScript regex syntax
                [exc.replace("\\\\", "\\") for exc in rule.get("exceptions", [])],
                rule.get("rules", []),
            )

    def clean_url(self, url: str) -> bool | str:
        """The URL arguments are normalized and cleaned of tracker parameters.

        Returns bool ``True`` to use URL unchanged (``False`` to ignore URL).

        If URL should be modified, the returned string is the new URL to use.
        """

        new_url = url
        parsed_new_url = urlparse(url=new_url)

        for rule in self.rules():

            if not re.match(rule[self.Fields.url_regexp], new_url):
                # no match / ignore pattern
                continue

            do_ignore = False
            for pattern in rule[self.Fields.url_ignore]:
                if re.match(pattern, new_url):
                    do_ignore = True
                    break

            if do_ignore:
                # pattern is in the list of exceptions / ignore pattern
                # HINT:
                #    we can't break the outer pattern loop since we have
                #    overlapping urlPattern like ".*"
                continue

            # remove tracker arguments from the url-query part
            query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query))

            for name, val in query_args.copy():
                # remove URL arguments
                for pattern in rule[self.Fields.del_args]:
                    if re.match(pattern, name):
                        log.debug("TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val)
                        query_args.remove((name, val))

            parsed_new_url = parsed_new_url._replace(query=urlencode(query_args))
            new_url = urlunparse(parsed_new_url)

        if new_url != url:
            return new_url

        return True


if __name__ == "__main__":
    db = TrackerPatternsDB()
    for r in db.rules():
        print(r)
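
clean_url() is the entry point callers are expected to use: it returns True when the URL needs no change, otherwise the rewritten URL. A usage sketch; it needs network access, since the rules are fetched from the ClearURL lists on first use:

db = TrackerPatternsDB()
result = db.clean_url("https://example.com/article?id=7&utm_source=newsletter")
if result is not True:
    print(result)   # utm_source stripped, provided the fetched rules match it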

11
searx/data/useragents.json Normal file

@@ -0,0 +1,11 @@
{
    "os": [
        "Windows NT 10.0; Win64; x64",
        "X11; Linux x86_64"
    ],
    "ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}",
    "versions": [
        "139.0",
        "138.0"
    ]
}
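
The ua template combines one entry from os with one from versions; note that {version} appears twice and must expand to the same value. A sketch of the template mechanics, reading the file directly from the repository root (the real helper in searx may differ):

import json, pathlib, random

cfg = json.loads(pathlib.Path("searx/data/useragents.json").read_text(encoding="utf-8"))
ua = cfg["ua"].format(os=random.choice(cfg["os"]), version=random.choice(cfg["versions"]))
print(ua)  # e.g. Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0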

searx/data/wikidata_units.json Normal file

File diff suppressed because it is too large