summaryrefslogtreecommitdiff
path: root/searx/engines/archlinux.py
blob: cad06f8c6db80837674c15dac00f8a39b5b24803 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-

"""
 Arch Linux Wiki

 @website      https://wiki.archlinux.org
 @provide-api  no (Mediawiki provides API, but Arch Wiki blocks access to it
 @using-api    no
 @results      HTML
 @stable       no (HTML can change)
 @parse        url, title
"""

from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode, urljoin

# engine dependent config
categories = ['it']
language_support = True
paging = True
base_url = 'https://wiki.archlinux.org'

# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'


# cut 'en' from 'en_US', 'de' from 'de_CH', and so on
def locale_to_lang_code(locale):
    if locale.find('-') >= 0:
        locale = locale.split('-')[0]
    return locale


# wikis for some languages were moved off from the main site, we need to make
# requests to correct URLs to be able to get results in those languages
lang_urls = {
    'all': {
        'base': 'https://wiki.archlinux.org',
        'search': '/index.php?title=Special:Search&offset={offset}&{query}'
    },
    'de': {
        'base': 'https://wiki.archlinux.de',
        'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
    },
    'fr': {
        'base': 'https://wiki.archlinux.fr',
        'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
    },
    'ja': {
        'base': 'https://wiki.archlinuxjp.org',
        'search': '/index.php?title=特別:検索&offset={offset}&{query}'
    },
    'ro': {
        'base': 'http://wiki.archlinux.ro',
        'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
    },
    'tr': {
        'base': 'http://archtr.org/wiki',
        'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
    }
}


# get base & search URLs for selected language
def get_lang_urls(language):
    if language in lang_urls:
        return lang_urls[language]
    return lang_urls['all']


# Language names to build search requests for
# those languages which are hosted on the main site.
main_langs = {
    'ar': 'العربية',
    'bg': 'Български',
    'cs': 'Česky',
    'da': 'Dansk',
    'el': 'Ελληνικά',
    'es': 'Español',
    'he': 'עברית',
    'hr': 'Hrvatski',
    'hu': 'Magyar',
    'it': 'Italiano',
    'ko': '한국어',
    'lt': 'Lietuviškai',
    'nl': 'Nederlands',
    'pl': 'Polski',
    'pt': 'Português',
    'ru': 'Русский',
    'sl': 'Slovenský',
    'th': 'ไทย',
    'uk': 'Українська',
    'zh': '简体中文'
}
supported_languages = dict(lang_urls, **main_langs)


# do search-request
def request(query, params):
    # translate the locale (e.g. 'en_US') to language code ('en')
    language = locale_to_lang_code(params['language'])

    # if our language is hosted on the main site, we need to add its name
    # to the query in order to narrow the results to that language
    if language in main_langs:
        query += '(' + main_langs[language] + ')'

    # prepare the request parameters
    query = urlencode({'search': query})
    offset = (params['pageno'] - 1) * 20

    # get request URLs for our language of choice
    urls = get_lang_urls(language)
    search_url = urls['base'] + urls['search']

    params['url'] = search_url.format(query=query, offset=offset)

    return params


# get response from search-request
def response(resp):
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    base_url = get_lang_urls(language)['base']

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(xpath_results):
        link = result.xpath(xpath_link)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        results.append({'url': href,
                        'title': title})

    return results