diff options
Diffstat (limited to 'searx/engines/qwant.py')
-rw-r--r-- | searx/engines/qwant.py | 296 |
1 files changed, 209 insertions, 87 deletions
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 54e9daf..cee2dfa 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -1,116 +1,239 @@ -""" - Qwant (Web, Images, News, Social) +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Qwant (Web, News, Images, Videos) + +This engine uses the Qwant API (https://api.qwant.com/v3). The API is +undocumented but can be reverse engineered by reading the network log of +https://www.qwant.com/ queries. + +This implementation is used by different qwant engines in the settings.yml:: - @website https://qwant.com/ - @provide-api not officially (https://api.qwant.com/api/search/) + - name: qwant + categories: general + ... + - name: qwant news + categories: news + ... + - name: qwant images + categories: images + ... + - name: qwant videos + categories: videos + ... - @using-api yes - @results JSON - @stable yes - @parse url, title, content """ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) from json import loads -from searx.utils import html_to_text -from searx.url_utils import urlencode +from urllib.parse import urlencode +from flask_babel import gettext + from searx.utils import match_language +from searx.exceptions import SearxEngineAPIException +from searx.raise_for_httperror import raise_for_httperror + + +# about +about = { + "website": 'https://www.qwant.com/', + "wikidata_id": 'Q14657870', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} # engine dependent config -categories = None +categories = [] paging = True -language_support = True -supported_languages_url = 'https://qwant.com/region' +supported_languages_url = about['website'] -category_to_keyword = {'general': 'web', - 'images': 'images', - 'news': 'news', - 'social media': 'social'} +category_to_keyword = { + 'general': 'web', + 'news': 'news', + 'images': 'images', + 'videos': 'videos', +} # search-url -url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' +url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}' -# do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + """Qwant search request""" + keyword = category_to_keyword[categories[0]] + count = 10 # web: count must be equal to 10 - if categories[0] and categories[0] in category_to_keyword: - - params['url'] = url.format(keyword=category_to_keyword[categories[0]], - query=urlencode({'q': query}), - offset=offset) + if keyword == 'images': + count = 50 + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 250 + offset = min(offset, 199) else: - params['url'] = url.format(keyword='web', - query=urlencode({'q': query}), - offset=offset) + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 50 + offset = min(offset, 40) + + params['url'] = url.format( + keyword=keyword, + query=urlencode({'q': query}), + offset=offset, + count=count, + ) # add language tag - if params['language'] != 'all': - language = match_language(params['language'], supported_languages, language_aliases) - params['url'] += '&locale=' + language.replace('-', '_').lower() + if params['language'] == 'all': + params['url'] += '&locale=en_US' + else: + language = match_language( + params['language'], + supported_languages, + language_aliases, + ) + params['url'] += '&locale=' + language.replace('-', '_') - params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' + params['raise_for_httperror'] = False return params -# get response from search-request def response(resp): + """Get response from Qwant's search request""" + # pylint: disable=too-many-locals, too-many-branches, too-many-statements + + keyword = category_to_keyword[categories[0]] results = [] + # load JSON result search_results = loads(resp.text) + data = search_results.get('data', {}) + + # check for an API error + if search_results.get('status') != 'success': + msg = ",".join( + data.get( + 'message', + [ + 'unknown', + ], + ) + ) + raise SearxEngineAPIException('API error::' + msg) + + # raise for other errors + raise_for_httperror(resp) + + if keyword == 'web': + # The WEB query contains a list named 'mainline'. This list can contain + # different result types (e.g. mainline[0]['type'] returns type of the + # result items in mainline[0]['items'] + mainline = data.get('result', {}).get('items', {}).get('mainline', {}) + else: + # Queries on News, Images and Videos do not have a list named 'mainline' + # in the response. The result items are directly in the list + # result['items']. + mainline = data.get('result', {}).get('items', []) + mainline = [ + {'type': keyword, 'items': mainline}, + ] # return empty array if there are no results - if 'data' not in search_results: + if not mainline: return [] - data = search_results.get('data', {}) + for row in mainline: + + mainline_type = row.get('type', 'web') + if mainline_type != keyword: + continue + + if mainline_type == 'ads': + # ignore adds + continue - res = data.get('result', {}) - - # parse results - for result in res.get('items', {}): - - title = html_to_text(result['title']) - res_url = result['url'] - content = html_to_text(result['desc']) - - if category_to_keyword.get(categories[0], '') == 'web': - results.append({'title': title, - 'content': content, - 'url': res_url}) - - elif category_to_keyword.get(categories[0], '') == 'images': - thumbnail_src = result['thumbnail'] - img_src = result['media'] - results.append({'template': 'images.html', - 'url': res_url, - 'title': title, - 'content': '', - 'thumbnail_src': thumbnail_src, - 'img_src': img_src}) - - elif category_to_keyword.get(categories[0], '') == 'social': - published_date = datetime.fromtimestamp(result['date'], None) - img_src = result.get('img', None) - results.append({'url': res_url, - 'title': title, - 'publishedDate': published_date, - 'content': content, - 'img_src': img_src}) - - elif category_to_keyword.get(categories[0], '') == 'news': - published_date = datetime.fromtimestamp(result['date'], None) - media = result.get('media', []) - if len(media) > 0: - img_src = media[0].get('pict', {}).get('url', None) - else: + mainline_items = row.get('items', []) + for item in mainline_items: + + title = item.get('title', None) + res_url = item.get('url', None) + + if mainline_type == 'web': + content = item['desc'] + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + } + ) + + elif mainline_type == 'news': + + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) + news_media = item.get('media', []) img_src = None - results.append({'url': res_url, - 'title': title, - 'publishedDate': published_date, - 'content': content, - 'img_src': img_src}) + if news_media: + img_src = news_media[0].get('pict', {}).get('url', None) + results.append( + { + 'title': title, + 'url': res_url, + 'publishedDate': pub_date, + 'img_src': img_src, + } + ) + + elif mainline_type == 'images': + thumbnail = item['thumbnail'] + img_src = item['media'] + results.append( + { + 'title': title, + 'url': res_url, + 'template': 'images.html', + 'thumbnail_src': thumbnail, + 'img_src': img_src, + } + ) + + elif mainline_type == 'videos': + # some videos do not have a description: while qwant-video + # returns an empty string, such video from a qwant-web query + # miss the 'desc' key. + d, s, c = item.get('desc'), item.get('source'), item.get('channel') + content_parts = [] + if d: + content_parts.append(d) + if s: + content_parts.append("%s: %s " % (gettext("Source"), s)) + if c: + content_parts.append("%s: %s " % (gettext("Channel"), c)) + content = ' // '.join(content_parts) + length = item['duration'] + if length is not None: + length = timedelta(milliseconds=length) + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) + thumbnail = item['thumbnail'] + # from some locations (DE and others?) the s2 link do + # response a 'Please wait ..' but does not deliver the thumbnail + thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1) + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + 'publishedDate': pub_date, + 'thumbnail': thumbnail, + 'template': 'videos.html', + 'length': length, + } + ) return results @@ -119,16 +242,15 @@ def response(resp): def _fetch_supported_languages(resp): # list of regions is embedded in page as a js object response_text = resp.text - response_text = response_text[response_text.find('regionalisation'):] - response_text = response_text[response_text.find('{'):response_text.find(');')] + response_text = response_text[response_text.find('INITIAL_PROPS'):] + response_text = response_text[response_text.find('{'): response_text.find('</script>')] regions_json = loads(response_text) supported_languages = [] - for lang in regions_json['languages'].values(): - if lang['code'] == 'nb': - lang['code'] = 'no' - for country in lang['countries']: - supported_languages.append(lang['code'] + '-' + country) + for country, langs in regions_json['locales'].items(): + for lang in langs['langs']: + lang_code = "{lang}-{country}".format(lang=lang, country=country) + supported_languages.append(lang_code) return supported_languages |