Diffstat (limited to 'searx/engines/wikipedia.py')
-rw-r--r-- | searx/engines/wikipedia.py | 135
1 file changed, 135 insertions, 0 deletions
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
new file mode 100644
index 0000000..db2fdc0
--- /dev/null
+++ b/searx/engines/wikipedia.py
@@ -0,0 +1,135 @@
+"""
+ Wikipedia (Web)
+
+ @website     https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, infobox
+"""
+
+from json import loads
+from lxml.html import fromstring
+from searx.url_utils import quote, urlencode
+
+# search-url
+base_url = u'https://{language}.wikipedia.org/'
+search_url = base_url + u'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
+
+
+# set language in base_url
+def url_lang(lang):
+    lang = lang.split('-')[0]
+    if lang == 'all' or lang not in supported_languages:
+        language = 'en'
+    else:
+        language = lang
+
+    return language
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')
+
+    params['url'] = search_url.format(query=urlencode({'titles': query}),
+                                      language=url_lang(params['language']))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.text)
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+
+    # link to wikipedia article
+    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    supported_languages = {}
+    dom = fromstring(resp.text)
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
+    for table in tables:
+        # exclude header row
+        trs = table.xpath('.//tr')[1:]
+        for tr in trs:
+            td = tr.xpath('./td')
+            code = td[3].xpath('./a')[0].text
+            name = td[2].xpath('./a')[0].text
+            english_name = td[1].xpath('./a')[0].text
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+            # exclude languages with too few articles
+            if articles >= 100:
+                supported_languages[code] = {"name": name,
+                                             "english_name": english_name,
+                                             "articles": articles}

+    return supported_languages