Diffstat (limited to 'searx/engines')
28 files changed, 657 insertions, 292 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 023ec40..7a9cc56 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -16,8 +16,9 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
 '''
-from os.path import realpath, dirname
 import sys
+import threading
+from os.path import realpath, dirname
 from flask_babel import gettext
 from operator import itemgetter
 from json import loads
@@ -84,6 +85,8 @@ def load_engine(engine_data):
     for engine_attr in dir(engine):
         if engine_attr.startswith('_'):
             continue
+        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
+            return None
         if getattr(engine, engine_attr) is None:
             logger.error('Missing engine config attribute: "{0}.{1}"'
                          .format(engine.name, engine_attr))
@@ -214,8 +217,24 @@ def get_engines_stats():
     ]
 
 
-def initialize_engines(engine_list):
+def load_engines(engine_list):
+    global engines
+    engines.clear()
     for engine_data in engine_list:
         engine = load_engine(engine_data)
         if engine is not None:
             engines[engine.name] = engine
+    return engines
+
+
+def initialize_engines(engine_list):
+    load_engines(engine_list)
+    for engine_name, engine in engines.items():
+        if hasattr(engine, 'init'):
+            init_fn = getattr(engine, 'init')
+
+            def engine_init(engine_name=engine_name, init_fn=init_fn):
+                init_fn()
+                logger.debug('%s engine initialized', engine_name)
+            logger.debug('Starting background initialization of %s engine', engine_name)
+            threading.Thread(target=engine_init).start()
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
new file mode 100644
index 0000000..5ef84f0
--- /dev/null
+++ b/searx/engines/arxiv.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+"""
+ ArXiV (Scientific preprints)
+ @website     https://arxiv.org
+ @provide-api yes (export.arxiv.org/api/query)
+ @using-api   yes
+ @results     XML-RSS
+ @stable      yes
+ @parse       url, title, publishedDate, content
+ More info on api: https://arxiv.org/help/api/user-manual
+"""
+
+from lxml import html
+from datetime import datetime
+from searx.url_utils import urlencode
+
+
+categories = ['science']
+
+base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
+           + '{query}&start={offset}&max_results={number_of_results}'
+
+# engine dependent config
+number_of_results = 10
+
+
+def request(query, params):
+    # basic search
+    offset = (params['pageno'] - 1) * number_of_results
+
+    string_args = dict(query=query,
+                       offset=offset,
+                       number_of_results=number_of_results)
+
+    params['url'] = base_url.format(**string_args)
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.content)
+    search_results = dom.xpath('//entry')
+
+    for entry in search_results:
+        title = entry.xpath('.//title')[0].text
+
+        url = entry.xpath('.//id')[0].text
+
+        content_string = '{doi_content}{abstract_content}'
+
+        abstract = entry.xpath('.//summary')[0].text
+
+        # If a doi is available, add it to the snippet
+        try:
+            doi_content = entry.xpath('.//link[@title="doi"]')[0].text
+            content = content_string.format(doi_content=doi_content, abstract_content=abstract)
+        except:
+            content = content_string.format(doi_content="", abstract_content=abstract)
+
+        if len(content) > 300:
+            content = content[0:300] + "..."
+            # TODO: center snippet on query term
+
+        publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ')
+
+        res_dict = {'url': url,
+                    'title': title,
+                    'publishedDate': publishedDate,
+                    'content': content}
+
+        results.append(res_dict)
+
+    return results
diff --git a/searx/engines/base.py b/searx/engines/base.py
index ff006a3..be0b7d2 100755
--- a/searx/engines/base.py
+++ b/searx/engines/base.py
@@ -73,7 +73,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    search_results = etree.XML(resp.text)
+    search_results = etree.XML(resp.content)
 
     for entry in search_results.xpath('./result/doc'):
         content = "No description available"
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 6300c94..1567905 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -18,7 +18,6 @@
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 
 # engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
 paging = True
 safesearch = True
 time_range_support = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
 
 # search-url
 base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
 
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 
 
+# get supported region code
+def get_region_code(lang, lang_list=None):
+    region = None
+    if lang in (lang_list or supported_languages):
+        region = lang
+    elif lang.startswith('no'):
+        region = 'nb-NO'
+    else:
+        # try to get a supported country code with language
+        lang = lang.split('-')[0]
+        for lc in (lang_list or supported_languages):
+            if lang == lc.split('-')[0]:
+                region = lc
+                break
+    if region:
+        return region.lower()
+    else:
+        return 'en-us'
+
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    # required for cookie
-    if params['language'] == 'all':
-        language = 'en-US'
-    else:
-        language = params['language']
-
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset)
 
+    language = get_region_code(params['language'])
+
     params['cookies']['SRCHHPGUSR'] = \
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
-        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    params['cookies']['_EDGE_S'] = 'mkt=' + language +\
+                                   '&ui=' + language + '&F=1'
 
     params['url'] = base_url + search_path
 
     if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    supported_languages = []
+    dom = html.fromstring(resp.text)
+
+    regions_xpath = '//div[@id="region-section-content"]' \
+                    + '//ul[@class="b_vList"]/li/a/@href'
+
+    regions = dom.xpath(regions_xpath)
+    for region in regions:
+        code = re.search('setmkt=[^\&]+', region).group()[7:]
+        if code == 'nb-NO':
+            code = 'no-NO'
+
+        supported_languages.append(code)
+
+    return supported_languages
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
new file mode 100644
index 0000000..bd91bce
--- /dev/null
+++ b/searx/engines/bing_videos.py
@@ -0,0 +1,99 @@
+"""
+ Bing (Videos)
+
+ @website     https://www.bing.com/videos
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content, thumbnail
+"""
+
+from json import loads
+from lxml import html
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+
+categories = ['videos']
+paging = True
+safesearch = True
+time_range_support = True
+number_of_results = 10
+language_support = True
+
+search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
+             'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
+time_range_string = '&qft=+filterui:videoage-lt{interval}'
+time_range_dict = {'day': '1440',
+                   'week': '10080',
+                   'month': '43200',
+                   'year': '525600'}
+
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+                    1: 'DEMOTE',
+                    0: 'OFF'}
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # safesearch cookie
+    params['cookies']['SRCHHPGUSR'] = \
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    # language cookie
+    region = get_region_code(params['language'], lang_list=supported_languages)
+    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
+
+    # query and paging
+    params['url'] = search_url.format(query=urlencode({'q': query}),
+                                      offset=offset,
+                                      number_of_results=number_of_results)
+
+    # time range
+    if params['time_range'] in time_range_dict:
+        params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath('//div[@class="dg_u"]'):
+
+        # try to extract the url
+        url_container = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
+        if len(url_container) > 0:
+            url = loads(url_container[0])['purl']
+        else:
+            url = result.xpath('./a/@href')[0]
+
+        # discard results that do not return an external url
+        # very recent results sometimes don't return the video's url
+        if url.startswith('/videos/search?'):
+            continue
+
+        title = extract_text(result.xpath('./a//div[@class="tl"]'))
+        content = extract_text(result.xpath('.//div[@class="pubInfo"]'))
+        thumbnail = result.xpath('.//div[@class="vthumb"]/img/@src')[0]
+
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail,
+                        'template': 'videos.html'})
+
+        # first page ignores requested number of results
+        if len(results) >= number_of_results:
+            break
+
+    return results
diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py
deleted file mode 100644
index f716456..0000000
--- a/searx/engines/blekko_images.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""
- Blekko (Images)
-
- @website     https://blekko.com
- @provide-api yes (inofficial)
-
- @using-api   yes
- @results     JSON
- @stable      yes
- @parse       url, title, img_src
-"""
-
-from json import loads
-from searx.url_utils import urlencode
-
-# engine dependent config
-categories = ['images']
-paging = True
-safesearch = True
-
-# search-url
-base_url = 'https://blekko.com'
-search_url = '/api/images?{query}&c={c}'
-
-# safesearch definitions
-safesearch_types = {2: '1',
-                    1: '',
-                    0: '0'}
-
-
-# do search-request
-def request(query, params):
-    c = (params['pageno'] - 1) * 48
-
-    params['url'] = base_url +\
-        search_url.format(query=urlencode({'q': query}),
-                          c=c)
-
-    if params['pageno'] != 1:
-        params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1))
-
-    # let Blekko know we wan't have profiling
-    params['cookies']['tag_lesslogging'] = '1'
-
-    # parse safesearch argument
-    params['cookies']['safesearch'] = safesearch_types.get(params['safesearch'], '')
-
-    return params
-
-
-# get response from search-request
-def response(resp):
-    results = []
-
-    search_results = loads(resp.text)
-
-    # return empty array if there are no results
-    if not search_results:
-        return []
-
-    for result in search_results:
-        # append result
-        results.append({'url': result['page_url'],
-                        'title': result['title'],
-                        'content': '',
-                        'img_src': result['url'],
-                        'template': 'images.html'})
-
-    # return results
-    return results
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index 1218d48..1bb4e60 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -10,7 +10,7 @@ if sys.version_info[0] == 3:
     unicode = str
 
 categories = []
-url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X'
+url = 'https://finance.google.com/finance/converter?a=1&from={0}&to={1}'
 weight = 100
 
 parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
@@ -44,15 +44,15 @@ def request(query, params):
         # wrong query
         return params
 
-    ammount, from_currency, to_currency = m.groups()
-    ammount = float(ammount)
+    amount, from_currency, to_currency = m.groups()
+    amount = float(amount)
     from_currency = name_to_iso4217(from_currency.strip())
     to_currency = name_to_iso4217(to_currency.strip())
     q = (from_currency + to_currency).upper()
 
-    params['url'] = url.format(query=q)
-    params['ammount'] = ammount
+    params['url'] = url.format(from_currency, to_currency)
+    params['amount'] = amount
     params['from'] = from_currency
     params['to'] = to_currency
     params['from_name'] = iso4217_to_name(from_currency, 'en')
@@ -63,30 +63,27 @@ def request(query, params):
 def response(resp):
     results = []
 
+    pat = '<span class=bld>(.+) {0}</span>'.format(
+        resp.search_params['to'].upper())
+
     try:
-        _, conversion_rate, _ = resp.text.split(',', 2)
+        conversion_rate = re.findall(pat, resp.text)[0]
         conversion_rate = float(conversion_rate)
     except:
         return results
 
     answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
-        resp.search_params['ammount'],
+        resp.search_params['amount'],
         resp.search_params['from'],
-        resp.search_params['ammount'] * conversion_rate,
+        resp.search_params['amount'] * conversion_rate,
        resp.search_params['to'],
         conversion_rate,
         resp.search_params['from_name'],
         resp.search_params['to_name'],
     )
 
-    now_date = datetime.now().strftime('%Y%m%d')
-    url = 'https://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html'  # noqa
-    url = url.format(
-        now_date,
-        resp.search_params['ammount'],
-        resp.search_params['from'].lower(),
-        resp.search_params['to'].lower()
-    )
+    url = 'https://finance.google.com/finance?q={0}{1}'.format(
+        resp.search_params['from'].upper(), resp.search_params['to'])
 
     results.append({'answer': answer, 'url': url})
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 7c34786..7cc44df 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -37,7 +37,7 @@ def request(query, params):
 
     params['url'] = url.format(from_lang=from_lang[2],
                                to_lang=to_lang[2],
-                               query=query)
+                               query=query.decode('utf-8'))
 
     return params
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 606747a..4369ccb 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -10,6 +10,8 @@
  @parse       url, title, content, publishedDate, thumbnail
 """
 
+import random
+import string
 from dateutil import parser
 from json import loads
 from lxml import html
@@ -30,12 +32,17 @@ title_xpath = './/h2//a//text()'
 content_xpath = './/p//text()'
 pubdate_xpath = './/time'
 
+digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
+    string.digits + "+_"
+
 
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10
     params['url'] = search_url.format(position=offset,
                                       query=quote_plus(query))
+    params['cookies']['frontend.auid'] = ''.join(random.choice(
+        digg_cookie_chars) for _ in range(22))
 
     return params
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 8b6411c..921e29f 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -14,9 +14,9 @@
 """
 
 from lxml.html import fromstring
-from requests import get
 from json import loads
 from searx.engines.xpath import extract_text
+from searx.poolrequests import get
 from searx.url_utils import urlencode
 
 # engine dependent config
@@ -42,7 +42,7 @@ content_xpath = './/a[@class="result__snippet"]'
 
 
 # match query's language to a region code that duckduckgo will accept
-def get_region_code(lang):
+def get_region_code(lang, lang_list=None):
     # custom fixes for languages
     if lang == 'all':
         region_code = None
@@ -66,7 +66,7 @@ def get_region_code(lang):
         else:
             # tries to get a country code from language
             region_code = region_code[0].lower()
-            for lc in supported_languages:
+            for lc in (lang_list or supported_languages):
                 lc = lc.split('-')
                 if region_code == lc[0]:
                     region_code = lc[1].lower() + '-' + lc[0].lower()
@@ -134,4 +134,4 @@ def _fetch_supported_languages(resp):
     regions_json = loads(response_page)
     supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
 
-    return supported_languages
+    return list(supported_languages)
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index f355523..dbd78b0 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -13,10 +13,10 @@
  @todo        avoid extra request
 """
 
-from requests import get
 from json import loads
 from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.poolrequests import get
 from searx.url_utils import urlencode
 
 # engine dependent config
@@ -52,7 +52,7 @@ def request(query, params):
 
     safesearch = params['safesearch'] - 1
 
-    region_code = get_region_code(params['language'])
+    region_code = get_region_code(params['language'], lang_list=supported_languages)
     if region_code:
         params['url'] = images_url.format(
             query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd)
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index e24d1b7..7ce3a6c 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -4,7 +4,7 @@
  @website     http://www.faroo.com
  @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
 
- @using-api   yes
+ @using-api   no
  @results     JSON
  @stable      yes
  @parse       url, title, content, publishedDate, img_src
@@ -20,18 +20,16 @@ categories = ['general', 'news']
 paging = True
 language_support = True
 number_of_results = 10
-api_key = None
 
 # search-url
 url = 'http://www.faroo.com/'
-search_url = url + 'api?{query}'\
-                   '&start={offset}'\
-                   '&length={number_of_results}'\
-                   '&l={language}'\
-                   '&src={categorie}'\
-                   '&i=false'\
-                   '&f=json'\
-                   '&key={api_key}'  # noqa
+search_url = url + 'instant.json?{query}'\
+                   '&start={offset}'\
+                   '&length={number_of_results}'\
+                   '&l={language}'\
+                   '&src={categorie}'\
+                   '&i=false'\
+                   '&c=false'
 
 search_category = {'general': 'web',
                    'news': 'news'}
@@ -57,21 +55,15 @@ def request(query, params):
                                       number_of_results=number_of_results,
                                       query=urlencode({'q': query}),
                                       language=language,
-                                      categorie=categorie,
-                                      api_key=api_key)
+                                      categorie=categorie)
 
-    # using searx User-Agent
-    params['headers']['User-Agent'] = searx_useragent()
+    params['headers']['Referer'] = url
 
     return params
 
 
 # get response from search-request
 def response(resp):
-    # HTTP-Code 401: api-key is not valide
-    if resp.status_code == 401:
-        raise Exception("API key is not valide")
-
     # HTTP-Code 429: rate limit exceeded
     if resp.status_code == 429:
         raise Exception("rate limit has been exceeded!")
@@ -86,31 +78,19 @@ def response(resp):
 
     # parse results
     for result in search_res['results']:
+        publishedDate = None
+        result_json = {'url': result['url'], 'title': result['title'],
+                       'content': result['kwic']}
         if result['news']:
-            # timestamp (milliseconds since 1970)
-            publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0)  # noqa
-
-            # append news result
-            results.append({'url': result['url'],
-                            'title': result['title'],
-                            'publishedDate': publishedDate,
-                            'content': result['kwic']})
-
-        else:
-            # append general result
-            # TODO, publishedDate correct?
-            results.append({'url': result['url'],
-                            'title': result['title'],
-                            'content': result['kwic']})
+            result_json['publishedDate'] = \
+                datetime.datetime.fromtimestamp(result['date'] / 1000.0)
 
         # append image result if image url is set
-        # TODO, show results with an image like in faroo
         if result['iurl']:
-            results.append({'template': 'images.html',
-                            'url': result['url'],
-                            'title': result['title'],
-                            'content': result['kwic'],
-                            'img_src': result['iurl']})
+            result_json['template'] = 'videos.html'
+            result_json['thumbnail'] = result['iurl']
+
+        results.append(result_json)
 
     # return results
     return results
diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py
deleted file mode 100644
index 3bb2744..0000000
--- a/searx/engines/generalfile.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
- General Files (Files)
-
- @website     http://www.general-files.org
- @provide-api no (nothing found)
-
- @using-api   no (because nothing found)
- @results     HTML (using search portal)
- @stable      no (HTML can change)
- @parse       url, title, content
-
- @todo        detect torrents?
-"""
-
-from lxml import html
-
-# engine dependent config
-categories = ['files']
-paging = True
-
-# search-url
-base_url = 'http://www.general-file.com'
-search_url = base_url + '/files-{letter}/{query}/{pageno}'
-
-# specific xpath variables
-result_xpath = '//table[@class="block-file"]'
-title_xpath = './/h2/a//text()'
-url_xpath = './/h2/a/@href'
-content_xpath = './/p//text()'
-
-
-# do search-request
-def request(query, params):
-
-    params['url'] = search_url.format(query=query,
-                                      letter=query[0],
-                                      pageno=params['pageno'])
-
-    return params
-
-
-# get response from search-request
-def response(resp):
-    results = []
-
-    dom = html.fromstring(resp.text)
-
-    # parse results
-    for result in dom.xpath(result_xpath):
-        url = result.xpath(url_xpath)[0]
-
-        # skip fast download links
-        if not url.startswith('/'):
-            continue
-
-        # append result
-        results.append({'url': base_url + url,
-                        'title': ''.join(result.xpath(title_xpath)),
-                        'content': ''.join(result.xpath(content_xpath))})
-
-    # return results
-    return results
diff --git a/searx/engines/genius.py b/searx/engines/genius.py
new file mode 100644
index 0000000..b265e9d
--- /dev/null
+++ b/searx/engines/genius.py
@@ -0,0 +1,88 @@
+"""
+Genius
+
+ @website     https://www.genius.com/
+ @provide-api yes (https://docs.genius.com/)
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, title, content, thumbnail, publishedDate
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+from datetime import datetime
+
+# engine dependent config
+categories = ['music']
+paging = True
+language_support = False
+page_size = 5
+
+url = 'https://genius.com/api/'
+search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'
+
+
+def request(query, params):
+    params['url'] = search_url.format(query=urlencode({'q': query}),
+                                      index='multi',
+                                      page_size=page_size,
+                                      pageno=params['pageno'])
+    return params
+
+
+def parse_lyric(hit):
+    try:
+        content = hit['highlights'][0]['value']
+    except:
+        content = None
+    timestamp = hit['result']['lyrics_updated_at']
+    result = {'url': hit['result']['url'],
+              'title': hit['result']['full_title'],
+              'content': content,
+              'thumbnail': hit['result']['song_art_image_thumbnail_url'],
+              'template': 'videos.html'}
+    if timestamp:
+        result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
+    return result
+
+
+def parse_artist(hit):
+    result = {'url': hit['result']['url'],
+              'title': hit['result']['name'],
+              'content': None,
+              'thumbnail': hit['result']['image_url'],
+              'template': 'videos.html'}
+    return result
+
+
+def parse_album(hit):
+    result = {'url': hit['result']['url'],
+              'title': hit['result']['full_title'],
+              'thumbnail': hit['result']['cover_art_url'],
+              # 'thumbnail': hit['result']['cover_art_thumbnail_url'],
+              'template': 'videos.html'}
+    try:
+        year = hit['result']['release_date_components']['year']
+    except:
+        pass
+    else:
+        if year:
+            result.update({'content': 'Released: {}'.format(year)})
+    return result
+
+parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
+
+
+def response(resp):
+    results = []
+    json = loads(resp.text)
+    hits = [hit for section in json['response']['sections'] for hit in section['hits']]
+    for hit in hits:
+        try:
+            func = parse[hit['type']]
+        except KeyError:
+            continue
+        results.append(func(hit))
+    return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 37933c6..2bdc97f 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,6 +10,7 @@
  @parse       url, title, content
 """
 
+import random
 from json import loads
 from time import time
 from lxml.html import fromstring
@@ -32,7 +33,8 @@ search_string = 'search?{query}'\
     '&qh=0'\
     '&qlang={lang}'\
     '&ff={safesearch}'\
-    '&rxikd={rxikd}'  # random number - 9 digits
+    '&rxieu={rxieu}'\
+    '&rand={rxikd}'  # current unix timestamp
 
 # specific xpath variables
 results_xpath = '//response//result'
@@ -59,10 +61,12 @@ def request(query, params):
     else:
         safesearch = 0
 
+    # rxieu is some kind of hash from the search query, but accepts random atm
     search_path = search_string.format(query=urlencode({'q': query}),
                                        offset=offset,
                                        number_of_results=number_of_results,
-                                       rxikd=str(time())[:9],
+                                       rxikd=int(time() * 1000),
+                                       rxieu=random.randint(1000000000, 9999999999),
                                        lang=language,
                                        safesearch=safesearch)
 
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 934f5c2..de2717d 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -165,10 +165,9 @@ def request(query, params):
     offset = (params['pageno'] - 1) * 10
 
     if params['language'] == 'all':
-        language = 'en'
-        country = 'US'
-        url_lang = ''
-    elif params['language'][:2] == 'jv':
+        params['language'] = 'en-GB'
+
+    if params['language'][:2] == 'jv':
         language = 'jw'
         country = 'ID'
         url_lang = 'lang_jw'
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 9692f4b..a380170 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -74,7 +74,7 @@ def response(resp):
 
     for result in dom.xpath('//div[@data-ved]'):
         try:
-            metadata = loads(''.join(result.xpath('./div[@class="rg_meta"]/text()')))
+            metadata = loads(''.join(result.xpath('./div[contains(@class, "rg_meta")]/text()')))
         except:
             continue
 
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 7344b52..8881d0d 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -67,8 +67,8 @@ def response(resp):
     for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
         try:
             r = {
-                'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0],
-                'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')),
+                'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
+                'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
                 'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
             }
         except:
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
new file mode 100644
index 0000000..310b314
--- /dev/null
+++ b/searx/engines/google_videos.py
@@ -0,0 +1,83 @@
+"""
+ Google (Videos)
+
+ @website     https://www.google.com
+ @provide-api yes (https://developers.google.com/custom-search/)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content
+"""
+
+from datetime import date, timedelta
+from json import loads
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+
+# engine dependent config
+categories = ['videos']
+paging = True
+safesearch = True
+time_range_support = True
+number_of_results = 10
+
+search_url = 'https://www.google.com/search'\
+             '?{query}'\
+             '&tbm=vid'\
+             '&{search_options}'
+time_range_attr = "qdr:{range}"
+time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
+time_range_dict = {'day': 'd',
+                   'week': 'w',
+                   'month': 'm'}
+
+
+# do search-request
+def request(query, params):
+    search_options = {
+        'ijn': params['pageno'] - 1,
+        'start': (params['pageno'] - 1) * number_of_results
+    }
+
+    if params['time_range'] in time_range_dict:
+        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
+    elif params['time_range'] == 'year':
+        now = date.today()
+        then = now - timedelta(days=365)
+        start = then.strftime('%m/%d/%Y')
+        end = now.strftime('%m/%d/%Y')
+        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
+
+    if safesearch and params['safesearch']:
+        search_options['safe'] = 'on'
+
+    params['url'] = search_url.format(query=urlencode({'q': query}),
+                                      search_options=urlencode(search_options))
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    # parse results
+    for result in dom.xpath('//div[@class="g"]'):
+
+        title = extract_text(result.xpath('.//h3/a'))
+        url = result.xpath('.//h3/a/@href')[0]
+        content = extract_text(result.xpath('.//span[@class="st"]'))
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': '',
+                        'template': 'videos.html'})
+
+    return results
diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py
index 67d6a5a..eeae5c2 100644
--- a/searx/engines/json_engine.py
+++ b/searx/engines/json_engine.py
@@ -98,10 +98,19 @@ def response(resp):
     results = []
     json = loads(resp.text)
     if results_query:
-        for result in query(json, results_query)[0]:
-            url = query(result, url_query)[0]
-            title = query(result, title_query)[0]
-            content = query(result, content_query)[0]
+        rs = query(json, results_query)
+        if not len(rs):
+            return results
+        for result in rs[0]:
+            try:
+                url = query(result, url_query)[0]
+                title = query(result, title_query)[0]
+            except:
+                continue
+            try:
+                content = query(result, content_query)[0]
+            except:
+                content = ""
             results.append({'url': url, 'title': title, 'content': content})
     else:
         for url, title, content in zip(
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
index 272c712..6a8e598 100644
--- a/searx/engines/nyaa.py
+++ b/searx/engines/nyaa.py
@@ -1,7 +1,7 @@
 """
- Nyaa.se (Anime Bittorrent tracker)
+ Nyaa.si (Anime Bittorrent tracker)
 
- @website      http://www.nyaa.se/
+ @website      http://www.nyaa.si/
 @provide-api  no
 @using-api    no
 @results      HTML
@@ -12,50 +12,25 @@
 from lxml import html
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']
 paging = True
 
 # search-url
-base_url = 'http://www.nyaa.se/'
+base_url = 'http://www.nyaa.si/'
 search_url = base_url + '?page=search&{query}&offset={offset}'
 
 # xpath queries
-xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]'
-xpath_category = './/td[@class="tlisticon"]/a'
-xpath_title = './/td[@class="tlistname"]/a'
-xpath_torrent_file = './/td[@class="tlistdownload"]/a'
-xpath_filesize = './/td[@class="tlistsize"]/text()'
-xpath_seeds = './/td[@class="tlistsn"]/text()'
-xpath_leeches = './/td[@class="tlistln"]/text()'
-xpath_downloads = './/td[@class="tlistdn"]/text()'
-
-
-# convert a variable to integer or return 0 if it's not a number
-def int_or_zero(num):
-    if isinstance(num, list):
-        if len(num) < 1:
-            return 0
-        num = num[0]
-    if num.isdigit():
-        return int(num)
-    return 0
-
-
-# get multiplier to convert torrent size to bytes
-def get_filesize_mul(suffix):
-    return {
-        'KB': 1024,
-        'MB': 1024 ** 2,
-        'GB': 1024 ** 3,
-        'TB': 1024 ** 4,
-
-        'KIB': 1024,
-        'MIB': 1024 ** 2,
-        'GIB': 1024 ** 3,
-        'TIB': 1024 ** 4
-    }[str(suffix).upper()]
+xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]'
+xpath_category = './/td[1]/a[1]'
+xpath_title = './/td[2]/a[last()]'
+xpath_torrent_links = './/td[3]/a'
+xpath_filesize = './/td[4]/text()'
+xpath_seeds = './/td[6]/text()'
+xpath_leeches = './/td[7]/text()'
+xpath_downloads = './/td[8]/text()'
 
 
 # do search-request
@@ -72,25 +47,32 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
     for result in dom.xpath(xpath_results):
+        # defaults
+        category = ""
+        filesize = 0
+        magnet_link = ""
+        torrent_link = ""
+
         # category in which our torrent belongs
-        category = result.xpath(xpath_category)[0].attrib.get('title')
+        try:
+            category = result.xpath(xpath_category)[0].attrib.get('title')
+        except:
+            pass
 
         # torrent title
         page_a = result.xpath(xpath_title)[0]
         title = extract_text(page_a)
 
         # link to the page
-        href = page_a.attrib.get('href')
+        href = base_url + page_a.attrib.get('href')
 
-        # link to the torrent file
-        torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href')
-
-        # torrent size
-        try:
-            file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
-            file_size = int(float(file_size) * get_filesize_mul(suffix))
-        except:
-            file_size = None
+        for link in result.xpath(xpath_torrent_links):
+            url = link.attrib.get('href')
+            if 'magnet' in url:
+                # link to the magnet
+                magnet_link = url
+            else:
+                # link to the torrent file
+                torrent_link = url
 
         # seed count
         seed = int_or_zero(result.xpath(xpath_seeds))
@@ -101,6 +83,14 @@ def response(resp):
         # torrent downloads count
         downloads = int_or_zero(result.xpath(xpath_downloads))
 
+        # let's try to calculate the torrent size
+        try:
+            filesize_info = result.xpath(xpath_filesize)[0]
+            filesize, filesize_multiplier = filesize_info.split()
+            filesize = get_torrent_size(filesize, filesize_multiplier)
+        except:
+            pass
+
         # content string contains all information not included into template
         content = 'Category: "{category}". Downloaded {downloads} times.'
         content = content.format(category=category, downloads=downloads)
 
@@ -110,8 +100,9 @@ def response(resp):
                         'content': content,
                         'seed': seed,
                         'leech': leech,
-                        'filesize': file_size,
+                        'filesize': filesize,
                         'torrentfile': torrent_link,
+                        'magnetlink': magnet_link,
                         'template': 'torrent.html'})
 
     return results
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
new file mode 100644
index 0000000..6451f14
--- /dev/null
+++ b/searx/engines/pubmed.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+ PubMed (Scholar publications)
+ @website     https://www.ncbi.nlm.nih.gov/pubmed/
+ @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/)
+ @using-api   yes
+ @results     XML
+ @stable      yes
+ @parse       url, title, publishedDate, content
+ More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
+"""
+
+from flask_babel import gettext
+from lxml import etree
+from datetime import datetime
+from searx.url_utils import urlencode
+from searx.poolrequests import get
+
+
+categories = ['science']
+
+base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'\
+           + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
+
+# engine dependent config
+number_of_results = 10
+pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
+
+
+def request(query, params):
+    # basic search
+    offset = (params['pageno'] - 1) * number_of_results
+
+    string_args = dict(query=urlencode({'term': query}),
+                       offset=offset,
+                       hits=number_of_results)
+
+    params['url'] = base_url.format(**string_args)
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    # First retrieve notice of each result
+    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
+                              + 'db=pubmed&retmode=xml&id={pmids_string}'
+
+    pmids_results = etree.XML(resp.content)
+    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
+    pmids_string = ''
+
+    for item in pmids:
+        pmids_string += item.text + ','
+
+    retrieve_notice_args = dict(pmids_string=pmids_string)
+
+    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
+
+    search_results_xml = get(retrieve_url_encoded).content
+    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
+
+    for entry in search_results:
+        title = entry.xpath('.//Article/ArticleTitle')[0].text
+
+        pmid = entry.xpath('.//PMID')[0].text
+        url = pubmed_url + pmid
+
+        try:
+            content = entry.xpath('.//Abstract/AbstractText')[0].text
+        except:
+            content = gettext('No abstract is available for this publication.')
+
+        # If a doi is available, add it to the snippet
+        try:
+            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
+            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
+        except:
+            pass
+
+        if len(content) > 300:
+            content = content[0:300] + "..."
+            # TODO: center snippet on query term
+
+        publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
+                                          + '-' + entry.xpath('.//DateCreated/Month')[0].text
+                                          + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')
+
+        res_dict = {'url': url,
+                    'title': title,
+                    'publishedDate': publishedDate,
+                    'content': content}
+
+        results.append(res_dict)
+
+    return results
diff --git a/searx/engines/searx_engine.py b/searx/engines/searx_engine.py
index 91c2644..d4c85bd 100644
--- a/searx/engines/searx_engine.py
+++ b/searx/engines/searx_engine.py
@@ -2,7 +2,7 @@
  Searx (all)
 
 @website     https://github.com/asciimoo/searx
- @provide-api yes (https://asciimoo.ithub.io/searx/dev/search_api.html)
+ @provide-api yes (https://asciimoo.github.io/searx/dev/search_api.html)
 
 @using-api   yes
 @results     JSON
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 41b40da..d59755e 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -41,6 +41,7 @@ embedded_url = '<iframe width="100%" height="166" ' +\
     'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
 
 cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
+guest_client_id = ''
 
 
 def get_client_id():
@@ -63,8 +64,10 @@ def get_client_id():
     return ""
 
 
-# api-key
-guest_client_id = get_client_id()
+def init():
+    global guest_client_id
+    # api-key
+    guest_client_id = get_client_id()
 
 
 # do search-request
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index e9c13ca..00346a7 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -118,7 +118,7 @@ def _fetch_supported_languages(resp):
     dom = fromstring(resp.text)
     options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
     for option in options:
-        code = option.xpath('./@data-val')[0]
+        code = option.xpath('./@data-search-language')[0]
         if code.startswith('nb-'):
             code = code.replace('nb', 'no', 1)
         supported_languages.append(code)
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
index 9a6b5e5..7732120 100644
--- a/searx/engines/tokyotoshokan.py
+++ b/searx/engines/tokyotoshokan.py
@@ -14,8 +14,8 @@ import re
 from lxml import html
 from searx.engines.xpath import extract_text
 from datetime import datetime
-from searx.engines.nyaa import int_or_zero, get_filesize_mul
 from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'videos', 'music']
@@ -76,8 +76,7 @@ def response(resp):
             try:
                 # ('1.228', 'GB')
                 groups = size_re.match(item).groups()
-                multiplier = get_filesize_mul(groups[1])
-                params['filesize'] = int(multiplier * float(groups[0]))
+                params['filesize'] = get_torrent_size(groups[0], groups[1])
             except:
                 pass
         elif item.startswith('Date:'):
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
index dda56fc..fd4164a 100644
--- a/searx/engines/torrentz.py
+++ b/searx/engines/torrentz.py
@@ -1,7 +1,7 @@
 """
- Torrentz.eu (BitTorrent meta-search engine)
+ Torrentz2.eu (BitTorrent meta-search engine)
 
- @website      https://torrentz.eu/
+ @website      https://torrentz2.eu/
 @provide-api  no
 
 @using-api    no
@@ -14,24 +14,24 @@ import re
 from lxml import html
 from datetime import datetime
-from searx.engines.nyaa import int_or_zero, get_filesize_mul
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
+from searx.utils import get_torrent_size
 
 # engine dependent config
 categories = ['files', 'videos', 'music']
 paging = True
 
 # search-url
-# https://torrentz.eu/search?f=EXAMPLE&p=6
-base_url = 'https://torrentz.eu/'
+# https://torrentz2.eu/search?f=EXAMPLE&p=6
+base_url = 'https://torrentz2.eu/'
 search_url = base_url + 'search?{query}'
 
 
 # do search-request
 def request(query, params):
     page = params['pageno'] - 1
-    query = urlencode({'q': query, 'p': page})
+    query = urlencode({'f': query, 'p': page})
     params['url'] = search_url.format(query=query)
 
     return params
@@ -54,22 +54,29 @@ def response(resp):
         # extract url and remove a slash in the beginning
         link = links[0].attrib.get('href').lstrip('/')
 
-        seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '')
-        leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '')
+        seed = 0
+        leech = 0
+        try:
+            seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
+            leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
+        except:
+            pass
 
         params = {
             'url': base_url + link,
             'title': title,
-            'seed': int_or_zero(seed),
-            'leech': int_or_zero(leech),
+            'seed': seed,
+            'leech': leech,
             'template': 'torrent.html'
         }
 
         # let's try to calculate the torrent size
         try:
-            size_str = result.xpath('./dd/span[@class="s"]/text()')[0]
-            size, suffix = size_str.split()
-            params['filesize'] = int(size) * get_filesize_mul(suffix)
+            filesize_info = result.xpath('./dd/span[3]/text()')[0]
+            filesize, filesize_multiplier = filesize_info.split()
+            filesize = get_torrent_size(filesize, filesize_multiplier)
+
+            params['filesize'] = filesize
         except:
             pass
 
@@ -80,9 +87,8 @@ def response(resp):
 
         # extract and convert creation date
         try:
-            date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title')
-            # Fri, 25 Mar 2016 16:29:01
-            date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
+            date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
+            date = datetime.fromtimestamp(float(date_ts))
             params['publishedDate'] = date
         except:
             pass
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 2a8642f..2cbbc5a 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -55,7 +55,8 @@ def obtain_token():
     return token
 
 
-obtain_token()
+def init():
+    obtain_token()
 
 
 # do search-request
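
Note on the initialize_engines() hunk in searx/engines/__init__.py above: engines that define an init() hook (soundcloud and wolframalpha_noapi in this change set) now run that hook in a background thread, so network round-trips for tokens no longer block start-up. Below is a minimal, self-contained sketch of that pattern; the DummyEngine stand-in, the logger setup, and the two engine names are illustrative, not part of the diff.

import logging
import threading

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('engines')


class DummyEngine(object):
    # stand-in for a loaded engine module; init() is the optional start-up hook
    def __init__(self, name):
        self.name = name

    def init(self):
        # a real engine would fetch an api token or client id here
        logger.debug('%s: fetching api token', self.name)


engines = {'soundcloud': DummyEngine('soundcloud'),
           'wolframalpha': DummyEngine('wolframalpha')}


def initialize_engines():
    for engine_name, engine in engines.items():
        if hasattr(engine, 'init'):
            init_fn = getattr(engine, 'init')

            # bind the loop variables as default arguments so each thread
            # keeps its own engine instead of the last one the loop touched
            def engine_init(name=engine_name, fn=init_fn):
                fn()
                logger.debug('%s engine initialized', name)
            logger.debug('Starting background initialization of %s engine', engine_name)
            threading.Thread(target=engine_init).start()


initialize_engines()

The default-argument binding is the one deliberate deviation from the hunk: without it all the engine_init closures share the loop variables, and a thread scheduled after the loop advances would initialize the wrong engine.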