first commit
This commit is contained in:
264
searx/engines/duckduckgo_definitions.py
Normal file
264
searx/engines/duckduckgo_definitions.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo Instant Answer API
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
|
||||
reverse engineering we can see that some services (e.g. instant answers) still
|
||||
in use from the DDG search engine.
|
||||
|
||||
As far we can say the *instant answers* API does not support languages, or at
|
||||
least we could not find out how language support should work. It seems that
|
||||
most of the features are based on English terms.
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode, urlparse, urljoin
|
||||
from lxml import html
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx.utils import extract_text, html_to_text, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"official_api_documentation": 'https://duckduckgo.com/api',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
||||
|
||||
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
|
||||
|
||||
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
|
||||
|
||||
|
||||
def is_broken_text(text):
|
||||
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
|
||||
|
||||
The href URL is broken, the "Related website" may contains some HTML.
|
||||
|
||||
The best solution seems to ignore these results.
|
||||
"""
|
||||
return text.startswith('http') and ' ' in text
|
||||
|
||||
|
||||
def result_to_text(text, htmlResult):
|
||||
# TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
|
||||
result = None
|
||||
dom = html.fromstring(htmlResult)
|
||||
a = dom.xpath('//a')
|
||||
if len(a) >= 1:
|
||||
result = extract_text(a[0])
|
||||
else:
|
||||
result = text
|
||||
if not is_broken_text(result):
|
||||
return result
|
||||
return None
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = URL.format(query=urlencode({'q': query}))
|
||||
return params
|
||||
|
||||
|
||||
def response(resp) -> EngineResults:
|
||||
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
|
||||
results = EngineResults()
|
||||
|
||||
search_res = resp.json()
|
||||
|
||||
# search_res.get('Entity') possible values (not exhaustive) :
|
||||
# * continent / country / department / location / waterfall
|
||||
# * actor / musician / artist
|
||||
# * book / performing art / film / television / media franchise / concert tour / playwright
|
||||
# * prepared food
|
||||
# * website / software / os / programming language / file format / software engineer
|
||||
# * company
|
||||
|
||||
content = ''
|
||||
heading = search_res.get('Heading', '')
|
||||
attributes = []
|
||||
urls = []
|
||||
infobox_id = None
|
||||
relatedTopics = []
|
||||
|
||||
# add answer if there is one
|
||||
answer = search_res.get('Answer', '')
|
||||
if answer:
|
||||
answer_type = search_res.get('AnswerType')
|
||||
logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
|
||||
if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
|
||||
results.add(
|
||||
results.types.Answer(
|
||||
answer=html_to_text(answer),
|
||||
url=search_res.get('AbstractURL', ''),
|
||||
)
|
||||
)
|
||||
|
||||
# add infobox
|
||||
if 'Definition' in search_res:
|
||||
content = content + search_res.get('Definition', '')
|
||||
|
||||
if 'Abstract' in search_res:
|
||||
content = content + search_res.get('Abstract', '')
|
||||
|
||||
# image
|
||||
image = search_res.get('Image')
|
||||
image = None if image == '' else image
|
||||
if image is not None and urlparse(image).netloc == '':
|
||||
image = urljoin('https://duckduckgo.com', image)
|
||||
|
||||
# urls
|
||||
# Official website, Wikipedia page
|
||||
for ddg_result in search_res.get('Results', []):
|
||||
firstURL = ddg_result.get('FirstURL')
|
||||
text = ddg_result.get('Text')
|
||||
if firstURL is not None and text is not None:
|
||||
urls.append({'title': text, 'url': firstURL})
|
||||
results.append({'title': heading, 'url': firstURL})
|
||||
|
||||
# related topics
|
||||
for ddg_result in search_res.get('RelatedTopics', []):
|
||||
if 'FirstURL' in ddg_result:
|
||||
firstURL = ddg_result.get('FirstURL')
|
||||
text = ddg_result.get('Text')
|
||||
if not is_broken_text(text):
|
||||
suggestion = result_to_text(text, ddg_result.get('Result'))
|
||||
if suggestion != heading and suggestion is not None:
|
||||
results.append({'suggestion': suggestion})
|
||||
elif 'Topics' in ddg_result:
|
||||
suggestions = []
|
||||
relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
|
||||
for topic_result in ddg_result.get('Topics', []):
|
||||
suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
|
||||
if suggestion != heading and suggestion is not None:
|
||||
suggestions.append(suggestion)
|
||||
|
||||
# abstract
|
||||
abstractURL = search_res.get('AbstractURL', '')
|
||||
if abstractURL != '':
|
||||
# add as result ? problem always in english
|
||||
infobox_id = abstractURL
|
||||
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
|
||||
results.append({'url': abstractURL, 'title': heading})
|
||||
|
||||
# definition
|
||||
definitionURL = search_res.get('DefinitionURL', '')
|
||||
if definitionURL != '':
|
||||
# add as result ? as answer ? problem always in english
|
||||
infobox_id = definitionURL
|
||||
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
|
||||
|
||||
# to merge with wikidata's infobox
|
||||
if infobox_id:
|
||||
infobox_id = replace_http_by_https(infobox_id)
|
||||
|
||||
# attributes
|
||||
# some will be converted to urls
|
||||
if 'Infobox' in search_res:
|
||||
infobox = search_res.get('Infobox')
|
||||
if 'content' in infobox:
|
||||
osm_zoom = 17
|
||||
coordinates = None
|
||||
for info in infobox.get('content'):
|
||||
data_type = info.get('data_type')
|
||||
data_label = info.get('label')
|
||||
data_value = info.get('value')
|
||||
|
||||
# Workaround: ddg may return a double quote
|
||||
if data_value == '""':
|
||||
continue
|
||||
|
||||
# Is it an external URL ?
|
||||
# * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
|
||||
# * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
|
||||
# * netflix_id
|
||||
external_url = get_external_url(data_type, data_value)
|
||||
if external_url is not None:
|
||||
urls.append({'title': data_label, 'url': external_url})
|
||||
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
|
||||
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
|
||||
# ignore wiki_maps_trigger: reference to a javascript
|
||||
# ignore google_play_artist_id: service shutdown
|
||||
pass
|
||||
elif data_type == 'string' and data_label == 'Website':
|
||||
# There is already an URL for the website
|
||||
pass
|
||||
elif data_type == 'area':
|
||||
attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
|
||||
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
|
||||
elif data_type == 'coordinates':
|
||||
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
|
||||
# coordinate on Earth
|
||||
# get the zoom information from the area
|
||||
coordinates = info
|
||||
else:
|
||||
# coordinate NOT on Earth
|
||||
attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
|
||||
elif data_type == 'string':
|
||||
attributes.append({'label': data_label, 'value': data_value})
|
||||
|
||||
if coordinates:
|
||||
data_label = coordinates.get('label')
|
||||
data_value = coordinates.get('value')
|
||||
latitude = data_value.get('latitude')
|
||||
longitude = data_value.get('longitude')
|
||||
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
|
||||
urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})
|
||||
|
||||
if len(heading) > 0:
|
||||
# TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
|
||||
if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
|
||||
results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
|
||||
else:
|
||||
results.append(
|
||||
{
|
||||
'infobox': heading,
|
||||
'id': infobox_id,
|
||||
'content': content,
|
||||
'img_src': image,
|
||||
'attributes': attributes,
|
||||
'urls': urls,
|
||||
'relatedTopics': relatedTopics,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def unit_to_str(unit):
|
||||
for prefix in WIKIDATA_PREFIX:
|
||||
if unit.startswith(prefix):
|
||||
wikidata_entity = unit[len(prefix) :]
|
||||
real_unit = WIKIDATA_UNITS.get(wikidata_entity)
|
||||
if real_unit is None:
|
||||
return unit
|
||||
return real_unit['symbol']
|
||||
return unit
|
||||
|
||||
|
||||
def area_to_str(area):
|
||||
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
|
||||
unit = unit_to_str(area.get('unit'))
|
||||
if unit is not None:
|
||||
try:
|
||||
amount = float(area.get('amount'))
|
||||
return '{} {}'.format(amount, unit)
|
||||
except ValueError:
|
||||
pass
|
||||
return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
|
||||
Reference in New Issue
Block a user