first commit
This commit is contained in:
180
searx/engines/mediawiki.py
Normal file
180
searx/engines/mediawiki.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
|
||||
the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
|
||||
endpoints that follow this pattern::
|
||||
|
||||
https://{base_url}/w/api.php?action=query&list=search&format=json
|
||||
|
||||
.. note::
|
||||
|
||||
In its actual state, this engine is implemented to parse JSON result
|
||||
(`format=json`_) from a search query (`list=search`_). If you need other
|
||||
``action`` and ``list`` types ask SearXNG developers to extend the
|
||||
implementation according to your needs.
|
||||
|
||||
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
|
||||
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
|
||||
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
|
||||
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
Request:
|
||||
|
||||
- :py:obj:`base_url`
|
||||
- :py:obj:`search_type`
|
||||
- :py:obj:`srenablerewrites`
|
||||
- :py:obj:`srsort`
|
||||
- :py:obj:`srprop`
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode, quote
|
||||
|
||||
from searx.utils import html_to_text
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": None,
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
paging = True
|
||||
number_of_results = 5
|
||||
|
||||
search_type: str = 'nearmatch'
|
||||
"""Which type of search to perform. One of the following values: ``nearmatch``,
|
||||
``text`` or ``title``.
|
||||
|
||||
See ``srwhat`` argument in `list=search`_ documentation.
|
||||
"""
|
||||
|
||||
srenablerewrites: bool = True
|
||||
"""Enable internal query rewriting (Type: boolean). Some search backends can
|
||||
rewrite the query into another which is thought to provide better results, for
|
||||
instance by correcting spelling errors.
|
||||
|
||||
See ``srenablerewrites`` argument in `list=search`_ documentation.
|
||||
"""
|
||||
|
||||
srsort: str = 'relevance'
|
||||
"""Set the sort order of returned results. One of the following values:
|
||||
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
|
||||
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
|
||||
``none``, ``random``, ``relevance``, ``user_random``.
|
||||
|
||||
See ``srenablerewrites`` argument in `list=search`_ documentation.
|
||||
"""
|
||||
|
||||
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
|
||||
"""Which properties to return.
|
||||
|
||||
See ``srprop`` argument in `list=search`_ documentation.
|
||||
"""
|
||||
|
||||
base_url: str = 'https://{language}.wikipedia.org/'
|
||||
"""Base URL of the Wikimedia wiki.
|
||||
|
||||
``{language}``:
|
||||
ISO 639-1 language code (en, de, fr ..) of the search language.
|
||||
"""
|
||||
|
||||
api_path: str = 'w/api.php'
|
||||
"""The path the PHP api is listening on.
|
||||
|
||||
The default path should work fine usually.
|
||||
"""
|
||||
|
||||
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
|
||||
"""The longhand version of MediaWiki time strings."""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
# write search-language back to params, required in response
|
||||
|
||||
if params['language'] == 'all':
|
||||
params['language'] = 'en'
|
||||
else:
|
||||
params['language'] = params['language'].split('-')[0]
|
||||
|
||||
api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language'])
|
||||
offset = (params['pageno'] - 1) * number_of_results
|
||||
|
||||
args = {
|
||||
'action': 'query',
|
||||
'list': 'search',
|
||||
'format': 'json',
|
||||
'srsearch': query,
|
||||
'sroffset': offset,
|
||||
'srlimit': number_of_results,
|
||||
'srwhat': search_type,
|
||||
'srprop': srprop,
|
||||
'srsort': srsort,
|
||||
}
|
||||
if srenablerewrites:
|
||||
args['srenablerewrites'] = '1'
|
||||
|
||||
params['url'] = api_url + urlencode(args)
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
search_results = resp.json()
|
||||
|
||||
# return empty array if there are no results
|
||||
if not search_results.get('query', {}).get('search'):
|
||||
return []
|
||||
|
||||
for result in search_results['query']['search']:
|
||||
|
||||
if result.get('snippet', '').startswith('#REDIRECT'):
|
||||
continue
|
||||
|
||||
title = result['title']
|
||||
sectiontitle = result.get('sectiontitle')
|
||||
content = html_to_text(result.get('snippet', ''))
|
||||
metadata = html_to_text(result.get('categorysnippet', ''))
|
||||
timestamp = result.get('timestamp')
|
||||
|
||||
url = (
|
||||
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
|
||||
)
|
||||
if sectiontitle:
|
||||
# in case of sectiontitle create a link to the section in the wiki page
|
||||
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
|
||||
title += ' / ' + sectiontitle
|
||||
|
||||
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
|
||||
|
||||
if timestamp:
|
||||
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
|
||||
|
||||
results.append(item)
|
||||
|
||||
# return results
|
||||
return results
|
||||
Reference in New Issue
Block a user