Diffstat (limited to 'searx/engines/wikipedia.py')
-rw-r--r-- | searx/engines/wikipedia.py | 135
1 file changed, 135 insertions, 0 deletions
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
new file mode 100644
index 0000000..db2fdc0
--- /dev/null
+++ b/searx/engines/wikipedia.py
@@ -0,0 +1,135 @@
+"""
+ Wikipedia (Web)
+
+ @website     https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, infobox
+"""
+
+from json import loads
+from lxml.html import fromstring
+from searx.url_utils import quote, urlencode
+
+# search-url
+base_url = u'https://{language}.wikipedia.org/'
+search_url = base_url + u'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
+
+
+# set language in base_url
+def url_lang(lang):
+    lang = lang.split('-')[0]
+    if lang == 'all' or lang not in supported_languages:
+        language = 'en'
+    else:
+        language = lang
+
+    return language
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')
+
+    params['url'] = search_url.format(query=urlencode({'titles': query}),
+                                      language=url_lang(params['language']))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.text)
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+
+    # link to wikipedia article
+    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    supported_languages = {}
+    dom = fromstring(resp.text)
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
+    for table in tables:
+        # exclude header row
+        trs = table.xpath('.//tr')[1:]
+        for tr in trs:
+            td = tr.xpath('./td')
+            code = td[3].xpath('./a')[0].text
+            name = td[2].xpath('./a')[0].text
+            english_name = td[1].xpath('./a')[0].text
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+            # exclude languages with too few articles
+            if articles >= 100:
+                supported_languages[code] = {"name": name,
+                                             "english_name": english_name,
+                                             "articles": articles}

+    return supported_languages