diff options
Diffstat (limited to 'searx/engines')
75 files changed, 7686 insertions, 0 deletions
def response(resp):
    """Parse a 1337x search result page into a list of torrent results.

    Every row of the ``table-list`` table yields one dict that is rendered
    with the ``torrent.html`` template.

    Rows that carry no torrent link (the second anchor of the "name" cell)
    are skipped, so a single odd row no longer aborts the whole page with an
    IndexError.
    """
    results = []

    dom = html.fromstring(resp.text)

    name_link_xpath = './td[contains(@class, "name")]/a[2]'

    for result in dom.xpath('//table[contains(@class, "table-list")]/tbody//tr'):
        hrefs = result.xpath(name_link_xpath + '/@href')
        if not hrefs:
            # header / filler row without a torrent link
            continue

        href = urljoin(url, hrefs[0])
        title = extract_text(result.xpath(name_link_xpath))
        seed = extract_text(result.xpath('.//td[contains(@class, "seeds")]'))
        leech = extract_text(result.xpath('.//td[contains(@class, "leeches")]'))
        filesize_info = extract_text(result.xpath('.//td[contains(@class, "size")]/text()'))

        # the size cell is expected to read "<number> <unit>"; anything else
        # is reported as an unknown size instead of raising ValueError.
        # NOTE(review): None is assumed to be acceptable to the torrent
        # template for "unknown size" -- confirm.
        size_parts = (filesize_info or '').split()
        if len(size_parts) == 2:
            filesize = get_torrent_size(size_parts[0], size_parts[1])
        else:
            filesize = None

        results.append({'url': href,
                        'title': title,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'template': 'torrent.html'})

    return results
+ +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + +from os.path import realpath, dirname +import sys +from flask_babel import gettext +from operator import itemgetter +from json import loads +from requests import get +from searx import settings +from searx import logger +from searx.utils import load_module + + +logger = logger.getChild('engines') + +engine_dir = dirname(realpath(__file__)) + +engines = {} + +categories = {'general': []} + +languages = loads(open(engine_dir + '/../data/engines_languages.json').read()) + +engine_shortcuts = {} +engine_default_args = {'paging': False, + 'categories': ['general'], + 'language_support': True, + 'supported_languages': [], + 'safesearch': False, + 'timeout': settings['outgoing']['request_timeout'], + 'shortcut': '-', + 'disabled': False, + 'suspend_end_time': 0, + 'continuous_errors': 0, + 'time_range_support': False} + + +def load_engine(engine_data): + + if '_' in engine_data['name']: + logger.error('Engine name conains underscore: "{}"'.format(engine_data['name'])) + sys.exit(1) + + engine_module = engine_data['engine'] + + try: + engine = load_module(engine_module + '.py', engine_dir) + except: + logger.exception('Cannot load engine "{}"'.format(engine_module)) + return None + + for param_name in engine_data: + if param_name == 'engine': + continue + if param_name == 'categories': + if engine_data['categories'] == 'none': + engine.categories = [] + else: + engine.categories = list(map(str.strip, engine_data['categories'].split(','))) + continue + setattr(engine, param_name, engine_data[param_name]) + + for arg_name, arg_value in 
def to_percentage(stats, maxvalue):
    """Add a ``percentage`` key to every entry of *stats*.

    The percentage is the entry's ``avg`` relative to *maxvalue*, truncated
    to an int.  When *maxvalue* is falsy (0 / None) every percentage is 0.

    The division is forced to float so that integer averages (e.g. error
    counts) are not floored to 0 by Python 2 integer division.

    The list is modified in place and also returned.
    """
    for engine_stat in stats:
        if maxvalue:
            engine_stat['percentage'] = int(engine_stat['avg'] / float(maxvalue) * 100)
        else:
            engine_stat['percentage'] = 0
    return stats
engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa + else: + load_times = 0 + + if engine.stats['engine_time_count'] != 0: + this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa + else: + this_engine_time = 0 + + if results_num: + score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa + score_per_result = score / results_num + else: + score = score_per_result = 0.0 + + max_pageload = max(load_times, max_pageload) + max_engine_times = max(this_engine_time, max_engine_times) + max_results = max(results_num, max_results) + max_score = max(score, max_score) + max_score_per_result = max(score_per_result, max_score_per_result) + max_errors = max(max_errors, engine.stats['errors']) + + pageloads.append({'avg': load_times, 'name': engine.name}) + engine_times.append({'avg': this_engine_time, 'name': engine.name}) + results.append({'avg': results_num, 'name': engine.name}) + scores.append({'avg': score, 'name': engine.name}) + errors.append({'avg': engine.stats['errors'], 'name': engine.name}) + scores_per_result.append({ + 'avg': score_per_result, + 'name': engine.name + }) + + pageloads = to_percentage(pageloads, max_pageload) + engine_times = to_percentage(engine_times, max_engine_times) + results = to_percentage(results, max_results) + scores = to_percentage(scores, max_score) + scores_per_result = to_percentage(scores_per_result, max_score_per_result) + erros = to_percentage(errors, max_errors) + + return [ + ( + gettext('Engine time (sec)'), + sorted(engine_times, key=itemgetter('avg')) + ), + ( + gettext('Page loads (sec)'), + sorted(pageloads, key=itemgetter('avg')) + ), + ( + gettext('Number of results'), + sorted(results, key=itemgetter('avg'), reverse=True) + ), + ( + gettext('Scores'), + sorted(scores, key=itemgetter('avg'), reverse=True) + ), + ( + gettext('Scores per result'), + sorted(scores_per_result, key=itemgetter('avg'), reverse=True) + ), + ( + 
# cut 'en' from 'en-US' or 'en_US', 'de' from 'de-CH' or 'de_CH', and so on
def locale_to_lang_code(locale):
    """Return the bare language code of *locale*.

    Both separators are accepted ('en-US' and 'en_US' give 'en'); a value
    without a region part ('fr', 'all') is returned unchanged.
    """
    return locale.replace('_', '-').split('-')[0]
# get base & search URLs for selected language
def get_lang_urls(language):
    """Return the URL set registered for *language*.

    Languages without an entry of their own in ``lang_urls`` use the 'all'
    entry.
    """
    return lang_urls.get(language, lang_urls['all'])
+ href = urljoin(base_url, link.attrib.get('href')) + title = extract_text(link) + + results.append({'url': href, + 'title': title}) + + return results diff --git a/searx/engines/base.py b/searx/engines/base.py new file mode 100755 index 0000000..ff006a3 --- /dev/null +++ b/searx/engines/base.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +""" + BASE (Scholar publications) + + @website https://base-search.net + @provide-api yes with authorization (https://api.base-search.net/) + + @using-api yes + @results XML + @stable ? + @parse url, title, publishedDate, content + More info on api: http://base-search.net/about/download/base_interface.pdf +""" + +from lxml import etree +from datetime import datetime +import re +from searx.url_utils import urlencode +from searx.utils import searx_useragent + + +categories = ['science'] + +base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\ + + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}' + +# engine dependent config +paging = True +number_of_results = 10 + +# shortcuts for advanced search +shorcut_dict = { + # user-friendly keywords + 'format:': 'dcformat:', + 'author:': 'dccreator:', + 'collection:': 'dccollection:', + 'hdate:': 'dchdate:', + 'contributor:': 'dccontributor:', + 'coverage:': 'dccoverage:', + 'date:': 'dcdate:', + 'abstract:': 'dcdescription:', + 'urls:': 'dcidentifier:', + 'language:': 'dclanguage:', + 'publisher:': 'dcpublisher:', + 'relation:': 'dcrelation:', + 'rights:': 'dcrights:', + 'source:': 'dcsource:', + 'subject:': 'dcsubject:', + 'title:': 'dctitle:', + 'type:': 'dcdctype:' +} + + +def request(query, params): + # replace shortcuts with API advanced search keywords + for key in shorcut_dict.keys(): + query = re.sub(str(key), str(shorcut_dict[key]), query) + + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'query': query}), + offset=offset, + hits=number_of_results) + + params['url'] = 
def response(resp):
    """Parse the XML answer of the BASE API into a list of result dicts.

    Each ``./result/doc`` element becomes one result with ``url``, ``title``
    and ``content``; ``publishedDate`` is added only when the document has a
    ``dcdate`` in one of the known formats.

    Per-document state is reset for every entry, so a document without a
    link or title can no longer inherit the values of the previous document
    (or raise NameError when it is the first one).  Documents without a link
    are skipped.
    """
    results = []

    search_results = etree.XML(resp.text)

    for entry in search_results.xpath('./result/doc'):
        url = None
        title = None
        raw_date = None
        content = "No description available"

        for item in entry:
            field = item.attrib.get("name")
            text = item.text
            if text is None:
                # empty element: keep the default for this field
                continue

            if field == "dcdate":
                raw_date = text

            elif field == "dctitle":
                title = text

            elif field == "dclink":
                url = text

            elif field == "dcdescription":
                content = text[:300]
                if len(text) > 300:
                    content += "..."

        if url is None:
            # nothing to link to
            continue
        if title is None:
            title = url

        # dates returned by the BASE API come in several formats
        publishedDate = None
        if raw_date is not None:
            for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
                try:
                    publishedDate = datetime.strptime(raw_date, date_format)
                    break
                except ValueError:
                    pass

        res_dict = {'url': url,
                    'title': title,
                    'content': content}
        if publishedDate is not None:
            res_dict['publishedDate'] = publishedDate

        results.append(res_dict)

    return results
# get response from search-request
def response(resp):
    """Parse a Bing web result page into a list of result dicts.

    The first entry is ``{'number_of_results': N}`` when the result counter
    can be read from the page.  It is followed by one ``url`` / ``title`` /
    ``content`` dict per hit, for both page layouts (``div.sa_cc`` and
    ``li.b_algo``).  Hit containers without a heading link are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # the counter is optional: it may be missing (IndexError) or not start
    # with a number (ValueError)
    try:
        count_text = dom.xpath('//span[@class="sb_count"]/text()')[0]
        results.append({'number_of_results': int(count_text.split()[0].replace(',', ''))})
    except (IndexError, ValueError):
        pass

    # (container xpath, heading-link xpath) for the two known layouts
    layouts = (('//div[@class="sa_cc"]', './/h3/a'),
               ('//li[@class="b_algo"]', './/h2/a'))

    # parse results
    for container_xpath, link_xpath in layouts:
        for result in dom.xpath(container_xpath):
            links = result.xpath(link_xpath)
            if not links:
                continue
            link = links[0]
            url = link.attrib.get('href')
            title = extract_text(link)
            content = extract_text(result.xpath('.//p'))

            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
_fetch_supported_languages(resp): + supported_languages = [] + dom = html.fromstring(resp.text) + options = dom.xpath('//div[@id="limit-languages"]//input') + for option in options: + code = option.xpath('./@id')[0].replace('_', '-') + if code == 'nb': + code = 'no' + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py new file mode 100644 index 0000000..6300c94 --- /dev/null +++ b/searx/engines/bing_images.py @@ -0,0 +1,108 @@ +""" + Bing (Images) + + @website https://www.bing.com/images + @provide-api yes (http://datamarket.azure.com/dataset/bing/search), + max. 5000 query/month + + @using-api no (because of query limit) + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, img_src + + @todo currently there are up to 35 images receive per page, + because bing does not parse count=10. + limited response to 10 images +""" + +from lxml import html +from json import loads +import re +from searx.engines.bing import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode + +# engine dependent config +categories = ['images'] +paging = True +safesearch = True +time_range_support = True + +# search-url +base_url = 'https://www.bing.com/' +search_string = 'images/search?{query}&count=10&first={offset}' +time_range_string = '&qft=+filterui:age-lt{interval}' +time_range_dict = {'day': '1440', + 'week': '10080', + 'month': '43200', + 'year': '525600'} + +# safesearch definitions +safesearch_types = {2: 'STRICT', + 1: 'DEMOTE', + 0: 'OFF'} + + +_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U) + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + 1 + + # required for cookie + if params['language'] == 'all': + language = 'en-US' + else: + language = params['language'] + + search_path = search_string.format( + query=urlencode({'q': query}), + offset=offset) + + 
params['cookies']['SRCHHPGUSR'] = \ + 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\ + '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + + params['url'] = base_url + search_path + if params['time_range'] in time_range_dict: + params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'): + link = result.xpath('./a')[0] + + # TODO find actual title + title = link.xpath('.//img/@alt')[0] + + # parse json-data (it is required to add a space, to make it parsable) + json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m'))) + + url = json_data.get('purl') + img_src = json_data.get('murl') + + thumb_json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad'))) + thumbnail = thumb_json_data.get('turl') + + # append result + results.append({'template': 'images.html', + 'url': url, + 'title': title, + 'content': '', + 'thumbnail_src': thumbnail, + 'img_src': img_src}) + + # TODO stop parsing if 10 images are found + # if len(results) >= 10: + # break + + # return results + return results diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py new file mode 100644 index 0000000..b999b2a --- /dev/null +++ b/searx/engines/bing_news.py @@ -0,0 +1,127 @@ +""" + Bing (News) + + @website https://www.bing.com/news + @provide-api yes (http://datamarket.azure.com/dataset/bing/search), + max. 
# remove click
def url_cleanup(url_string):
    """Strip Bing's click-tracking redirect from a news result URL.

    For ``https://www.bing.com/news/apiclick.aspx?...&url=<target>`` the
    decoded ``<target>`` is returned.  Any other URL, and a tracking URL
    that carries no ``url`` parameter, is returned unchanged (instead of
    ``None``).
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url', url_string)
    return url_string
# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
def image_url_cleanup(url_string):
    """Rewrite a ``*bing4.com/th`` thumbnail URL to ``https://www.bing.com/th``.

    Only the ``id`` parameter is kept, and it is URL-encoded again because
    ``parse_qsl`` returns it decoded.  URLs of any other shape, and
    thumbnail URLs without an ``id``, are returned unchanged (the latter
    used to raise TypeError).
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        image_id = query.get('id')
        if image_id is not None:
            return "https://www.bing.com/th?" + urlencode({'id': image_id})
    return url_string
# get response from search-request
def response(resp):
    """Turn Blekko's JSON image list into ``images.html`` result dicts.

    An empty or falsy JSON payload gives an empty list.
    """
    found_images = loads(resp.text)

    # return empty array if there are no results
    if not found_images:
        return []

    # one result per image entry
    return [{'url': image['page_url'],
             'title': image['title'],
             'content': '',
             'img_src': image['url'],
             'template': 'images.html'}
            for image in found_images]
# get response from search-request
def response(resp):
    """Parse a BTDigg result page into torrent results sorted by seeders.

    Every row of the ``search_res`` table yields one dict rendered with the
    ``torrent.html`` template.  Rows that lack the torrent name link, the
    three attribute values (size, file count, seeders) or the magnet link
    are skipped instead of raising IndexError for the whole page.
    """
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@id="search_res"]/table/tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        name_links = result.xpath('.//td[@class="torrent_name"]//a')
        # evaluated once per row: [0] = "<size> <unit>", [1] = files, [2] = seeders
        attr_values = result.xpath('.//span[@class="attr_val"]/text()')
        magnet_links = result.xpath('.//td[@class="ttth"]//a')
        if not name_links or len(attr_values) < 3 or not magnet_links:
            continue

        link = name_links[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        snippets = result.xpath('.//pre[@class="snippet"]')
        if snippets:
            content = extract_text(snippets[0])
            content = "<br />".join(content.split("\n"))
        else:
            content = ''

        files = attr_values[1]
        seed = attr_values[2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        leech = 0

        # convert filesize to byte if possible
        # NOTE(review): None is assumed to be acceptable to the torrent
        # template for "unknown size" -- confirm.
        size_parts = attr_values[0].split()
        if len(size_parts) >= 2:
            filesize = get_torrent_size(size_parts[0], size_parts[1])
        else:
            filesize = None

        # convert files to int if possible
        if files.isdigit():
            files = int(files)
        else:
            files = None

        magnetlink = magnet_links[0].attrib['href']

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
def response(resp):
    """Build the currency-conversion answer from Yahoo's CSV quote line.

    The second comma-separated field of ``resp.text`` is read as the
    conversion rate.  The amount, currency codes and currency names come
    from ``resp.search_params`` (keys 'ammount', 'from', 'to', 'from_name',
    'to_name').

    Returns a one-element list with ``answer`` and ``url``, or an empty list
    when the rate cannot be read from the response.
    """
    results = []
    try:
        _, conversion_rate, _ = resp.text.split(',', 2)
        conversion_rate = float(conversion_rate)
    except (ValueError, TypeError, AttributeError):
        # fewer than three fields, a non-numeric rate or a non-text body
        return results

    search_params = resp.search_params

    answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
        search_params['ammount'],
        search_params['from'],
        search_params['ammount'] * conversion_rate,
        search_params['to'],
        conversion_rate,
        search_params['from_name'],
        search_params['to_name'],
    )

    now_date = datetime.now().strftime('%Y%m%d')
    result_url = 'https://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html'  # noqa
    result_url = result_url.format(
        now_date,
        search_params['ammount'],
        search_params['from'].lower(),
        search_params['to'].lower()
    )

    results.append({'answer': answer, 'url': result_url})

    return results
# get supported languages from their site
def _fetch_supported_languages(resp):
    """Turn the Dailymotion languages listing into a code -> names mapping.

    Every entry of the JSON ``list`` becomes one key (its ``code``).  The
    value holds ``name`` (from ``native_name``) and ``english_name`` (from
    ``name``); a field is left out when it is empty or null.
    """
    catalogue = {}

    for entry in loads(resp.text)['list']:
        details = {}

        native = entry['native_name']
        if native:
            details['name'] = native

        english = entry['name']
        if english:
            details['english_name'] = english

        catalogue[entry['code']] = details

    return catalogue
# get response from search-request
def response(resp):
    """Convert a Deezer search response into a list of track results.

    Only entries whose ``type`` is ``track`` are kept.  Track links that
    start with ``http://`` are rewritten to ``https://``; the content line
    is "artist - album - title" and ``embedded`` is the player iframe built
    from ``embedded_url``.
    """
    tracks = []

    for item in loads(resp.text).get('data', []):
        # skip albums, artists, ... - only tracks are rendered
        if item['type'] != 'track':
            continue

        title = item['title']
        link = item['link']

        # http to https
        if link.startswith('http://'):
            link = 'https' + link[4:]

        description = u'{} - {} - {}'.format(
            item['artist']['name'],
            item['album']['title'],
            item['title'])

        tracks.append({'url': link,
                       'title': title,
                       'embedded': embedded_url.format(audioid=item['id']),
                       'content': description})

    return tracks
# get response from search-request
def response(resp):
    """Parse a DeviantArt browse page into image results.

    Each ``span.thumb.wide`` element yields one result with the deviation
    url, its title, the thumbnail and the full-size image (the thumbnail url
    with the ``/200H/`` path segment removed).

    :param resp: response object exposing ``status_code`` and ``text``.
    :returns: list of result dicts using the ``images.html`` template;
              empty on a 302 redirect.
    """
    results = []

    # return empty array if a redirection code is returned
    if resp.status_code == 302:
        return []

    dom = html.fromstring(resp.text)

    regex = re.compile(r'\/200H\/')

    # parse results
    for result in dom.xpath('.//span[@class="thumb wide"]'):
        # A thumb without link/image/src used to raise (IndexError or
        # TypeError) and abort the whole page; skip just that entry instead.
        links = result.xpath('.//a[@class="torpedo-thumb-link"]')
        if not links:
            continue
        link = links[0]

        images = link.xpath('.//img')
        if not images:
            continue

        url = link.attrib.get('href')
        thumbnail_src = images[0].attrib.get('src')
        if not url or not thumbnail_src:
            continue

        title = extract_text(result.xpath('.//span[@class="title"]'))
        img_src = regex.sub('/', thumbnail_src)

        # http to https, remove domain sharding
        # (the dot after the shard number is now escaped; unescaped it
        # matched any character)
        thumbnail_src = re.sub(r"https?://(th|fc)\d+\.", "https://th01.", thumbnail_src)
        thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)

        url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
def response(resp):
    """Parse a Dictzone dictionary page into results.

    Every row of the result table (``results_xpath``) after the first is
    expected to hold exactly two cells: the source term and its
    translations.  Rows with another cell count are skipped.

    :param resp: response object exposing ``text`` and ``url``.
    :returns: list of ``{'url', 'title', 'content'}`` dicts, where
              ``content`` joins the non-blank translations with ``'; '``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # the first row is skipped by the slice
    for k, result in enumerate(dom.xpath(results_xpath)[1:]):
        try:
            from_result, to_results_raw = result.xpath('./td')
        except ValueError:
            # not exactly two cells; only this unpacking failure is
            # expected, so no bare ``except`` is needed
            continue

        to_results = []
        for to_result in to_results_raw.xpath('./p/a'):
            t = to_result.text_content()
            if t.strip():
                to_results.append(t)

        results.append({
            # '?<row index>' gives every row a distinct url
            'url': urljoin(resp.url, '?%d' % k),
            'title': from_result.text_content(),
            'content': '; '.join(to_results)
        })

    return results
def response(resp):
    """Parse a DigBT result page into torrent results.

    One result is produced per ``td.x-item`` cell.  The file size is taken
    from the whitespace-split text of the ``div.tail`` element, at the
    positions given by ``FILESIZE`` and ``FILESIZE_MULTIPLIER``.  Seed and
    leech are always reported as ``'N/A'``.
    """
    tree = html.fromstring(resp.text)

    torrents = list()
    # an empty match list simply leaves ``torrents`` empty
    for cell in tree.xpath('.//td[@class="x-item"]'):
        link = urljoin(URL, cell.xpath('.//a[@title]/@href')[0])
        name = extract_text(cell.xpath('.//a[@title]'))
        file_list = extract_text(cell.xpath('.//div[@class="files"]'))

        tail_words = extract_text(cell.xpath('.//div[@class="tail"]')).split()
        size = get_torrent_size(tail_words[FILESIZE], tail_words[FILESIZE_MULTIPLIER])

        magnet = cell.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]

        torrents.append({'url': link,
                         'title': name,
                         'content': file_list,
                         'filesize': size,
                         'magnetlink': magnet,
                         'seed': 'N/A',
                         'leech': 'N/A',
                         'template': 'torrent.html'})

    return torrents
# get response from search-request
def response(resp):
    """Parse the Digg search API answer into results.

    The JSON body carries an HTML fragment under ``html``; a missing or
    empty fragment gives an empty list.  Every element matched by
    ``results_xpath`` becomes one result with url, title, content,
    publication date and an https thumbnail.
    """
    entries = []

    payload = loads(resp.text)
    markup = payload['html'] if 'html' in payload else ''
    if markup == '':
        return entries

    tree = html.fromstring(markup)

    # parse results
    for article in tree.xpath(results_xpath):
        link = article.attrib.get('data-contenturl')
        image = article.xpath('.//img')[0].attrib.get('src')
        headline = ''.join(article.xpath(title_xpath))
        summary = ''.join(article.xpath(content_xpath))
        stamp = article.xpath(pubdate_xpath)[0].attrib.get('datetime')
        published = parser.parse(stamp)

        # http to https
        image = image.replace("http://static.digg.com", "https://static.digg.com")

        # append result
        entries.append({'url': link,
                        'title': headline,
                        'content': summary,
                        'template': 'videos.html',
                        'publishedDate': published,
                        'thumbnail': image})

    # return results
    return entries
# get response from search-request
def response(resp):
    """Parse a DokuWiki search page into results.

    Two parts of the page are read:

    * quick hits (``div.search_quickresult`` list items), which give
      results with empty content;
    * full-text hits (children of ``dl.search_results``), where a ``dt``
      carries the page link and title and a following ``dd`` the snippet.

    :param resp: response object exposing ``text``.
    :returns: list of ``{'title', 'content', 'url'}`` dicts.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'):
        hrefs = r.xpath('.//a[@class="wikilink1"]/@href')
        # the last matching link is used, as before; an explicit emptiness
        # check replaces the bare ``except`` around ``[-1]``
        if not hrefs or not hrefs[-1]:
            continue

        title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))

        # append result
        results.append({'title': title,
                        'content': "",
                        'url': base_url + hrefs[-1]})

    # Search results
    # ``pending`` is the (url, title) of the most recent usable <dt>.  It is
    # reset when a <dt> has no link, so a later <dd> can no longer be
    # attached to a stale entry (or to the last quick hit).
    pending = None
    for r in doc.xpath('//dl[@class="search_results"]/*'):
        if r.tag == "dt":
            hrefs = r.xpath('.//a[@class="wikilink1"]/@href')
            if hrefs and hrefs[-1]:
                title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
                pending = (hrefs[-1], title)
            else:
                pending = None
        elif r.tag == "dd" and pending is not None:
            res_url, title = pending
            content = extract_text(r.xpath('.'))

            # append result
            results.append({'title': title,
                            'content': content,
                            'url': base_url + res_url})

    # return results
    return results
# match query's language to a region code that duckduckgo will accept
def get_region_code(lang):
    """Translate a searx language code into a DuckDuckGo region code.

    Returns ``None`` for ``'all'``.  A few languages/regions have fixed
    codes; otherwise ``xx-YY`` becomes ``yy-xx``.  For a code that does not
    split into exactly two parts, the first entry of ``supported_languages``
    with the same language supplies the country; if none matches, the
    lower-cased language alone is returned.
    """
    if lang == 'all':
        return None

    # custom fixes for languages
    prefix = lang[:2]
    if prefix == 'ja':
        return 'jp-jp'
    if prefix == 'sl':
        return 'sl-sl'
    if lang == 'zh-TW':
        return 'tw-tzh'
    if lang == 'zh-HK':
        return 'hk-tzh'

    suffix = lang[-2:]
    if suffix == 'SA':
        return 'xa-' + lang.split('-')[0]
    if suffix == 'GB':
        return 'uk-' + lang.split('-')[0]

    parts = lang.split('-')
    if len(parts) == 2:
        # country code goes first
        language, country = parts
        return country.lower() + '-' + language.lower()

    # tries to get a country code from language
    language = parts[0].lower()
    for candidate in supported_languages:
        pieces = candidate.split('-')
        if language == pieces[0]:
            return pieces[1].lower() + '-' + pieces[0].lower()
    return language
# get supported languages from their site
def _fetch_supported_languages(resp):
    """Extract the supported language codes from DuckDuckGo's regions script.

    The response is a js file containing ``regions:{...}``; the object is
    cut out (up to its first closing brace) and parsed as JSON.  Its keys
    look like ``cc-ll`` (country-language) and are converted to ``ll-CC``.

    :param resp: response object exposing ``text``.
    :returns: a list of language codes such as ``'en-US'``.
    """
    # response is a js file with regions as an embedded object
    response_page = resp.text
    # +8 skips 'regions:' and keeps the opening brace
    response_page = response_page[response_page.find('regions:{') + 8:]
    response_page = response_page[:response_page.find('}') + 1]

    regions_json = loads(response_page)

    # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
    # which is exhausted after one pass and cannot be serialized.
    return [region[3:] + '-' + region[:2].upper() for region in regions_json]
def response(resp):
    """Turn a DuckDuckGo instant-answer JSON document into results.

    Possible outputs, in order of insertion:

    * an ``answer`` result when ``Answer`` is not empty;
    * one plain result per entry of ``Results`` that has a ``FirstURL``;
    * ``suggestion`` results for related topics that differ from the
      heading;
    * when ``Heading`` is not empty, either a plain link result (exactly one
      url and nothing else to show) or an infobox carrying content, image,
      attributes, urls and grouped related topics.
    """
    data = json.loads(resp.text)
    results = []

    heading = data.get('Heading', '')

    # add answer if there is one
    answer = data.get('Answer', '')
    if answer != '':
        results.append({'answer': html_to_text(answer)})

    # infobox text: definition first, then abstract
    content = ''
    for key in ('Definition', 'Abstract'):
        if key in data:
            content += data[key]

    # image ('' means "no image")
    image = data.get('Image', '')
    if image == '':
        image = None

    # attributes
    attributes = []
    if 'Infobox' in data:
        infobox = data['Infobox']
        if 'content' in infobox:
            attributes = [{'label': row.get('label'), 'value': row.get('value')}
                          for row in infobox.get('content')]

    # urls
    urls = []
    for entry in data.get('Results', []):
        if 'FirstURL' not in entry:
            continue
        first_url = entry.get('FirstURL', '')
        urls.append({'title': entry.get('Text', ''), 'url': first_url})
        results.append({'title': heading, 'url': first_url})

    # related topics: either a direct topic or a named group of topics
    relatedTopics = []
    for entry in data.get('RelatedTopics', []):
        if 'FirstURL' in entry:
            suggestion = result_to_text(entry.get('FirstURL', None),
                                        entry.get('Text', None),
                                        entry.get('Result', None))
            if suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in entry:
            group = []
            relatedTopics.append({'name': entry.get('Name', ''),
                                  'suggestions': group})
            for topic in entry.get('Topics', []):
                suggestion = result_to_text(topic.get('FirstURL', None),
                                            topic.get('Text', None),
                                            topic.get('Result', None))
                if suggestion != heading:
                    group.append(suggestion)

    # abstract, then definition: the last non-empty url becomes the infobox id
    # (add as result ? as answer ? problem: always in english)
    infobox_id = None
    for url_key, source_key in (('AbstractURL', 'AbstractSource'),
                                ('DefinitionURL', 'DefinitionSource')):
        link = data.get(url_key, '')
        if link != '':
            infobox_id = link
            urls.append({'title': data.get(source_key), 'url': link})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = http_regex.sub('https:', infobox_id)

    # entity
    entity = data.get('Entity', None)
    # TODO continent / country / department / location / waterfall /
    #      mountain range :
    #      link to map search, get weather, nearby locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    #      links to IMDB / rottentomatoes (or scrape result)
    # TODO music : link to musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO company : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO programming language : ??
    # TODO file format : ??

    if len(heading) == 0:
        return results

    # TODO get infobox.meta.value where .label='article_title'
    only_a_link = (image is None and len(attributes) == 0 and len(urls) == 1
                   and len(relatedTopics) == 0 and len(content) == 0)
    if only_a_link:
        results.append({
            'url': urls[0]['url'],
            'title': heading,
            'content': content
        })
    else:
        results.append({
            'infobox': heading,
            'id': infobox_id,
            'entity': entity,
            'content': content,
            'img_src': image,
            'attributes': attributes,
            'urls': urls,
            'relatedTopics': relatedTopics
        })

    return results
# get response from search-request
def response(resp):
    """Parse the DuckDuckGo image JSON response into image results.

    :param resp: response object exposing ``text``.
    :returns: list of ``images.html`` result dicts built from the entries of
              ``results``; an empty list when the body is not valid JSON.
    """
    results = []

    content = resp.text
    try:
        res_json = loads(content)
    except ValueError:
        # not JSON (JSON decode errors are ValueErrors); other exceptions
        # are no longer swallowed by a bare ``except``
        return []

    # parse results
    for result in res_json['results']:
        title = result['title']
        url = result['url']
        thumbnail = result['thumbnail']
        image = result['image']

        # append result
        results.append({'template': 'images.html',
                        'title': title,
                        'content': '',
                        'thumbnail_src': thumbnail,
                        'img_src': image,
                        'url': url})

    return results
# do search-request
def request(query, params):
    """Build the Faroo API request.

    Sets ``params['url']`` from ``search_url`` and replaces the User-Agent
    header with the searx one.

    :param query: the search terms.
    :param params: request parameters; ``pageno``, ``category``,
                   ``language`` and ``headers`` are read.
    :returns: the updated ``params``.
    """
    # result offsets are 1-based
    offset = (params['pageno'] - 1) * number_of_results + 1
    categorie = search_category.get(params['category'], 'web')

    if params['language'] == 'all':
        language = 'en'
    else:
        # Region-qualified codes use a hyphen elsewhere in the engines
        # ("zh-TW", split('-')).  Splitting on '_' only left "de-DE" intact,
        # so it always fell back to English.  Accept both separators.
        language = params['language'].replace('_', '-').split('-')[0]

    # if language is not supported, put it in english
    if language not in ('en', 'de', 'zh'):
        language = 'en'

    params['url'] = search_url.format(offset=offset,
                                      number_of_results=number_of_results,
                                      query=urlencode({'q': query}),
                                      language=language,
                                      categorie=categorie,
                                      api_key=api_key)

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    return params
# do search-request
def request(query, params):
    """Build the F-Droid repository browse url.

    The search terms go into ``fdfilter`` and the page number into
    ``fdpage``; the encoded pair is inserted into ``search_url``.
    """
    arguments = {'fdfilter': query}
    arguments['fdpage'] = params['pageno']

    params.update(url=search_url.format(query=urlencode(arguments)))
    return params
class FilecropResultParser(HTMLParser):
    """Event-driven parser collecting Filecrop results into ``self.results``.

    Collection starts at a ``<tr>`` whose ``bgcolor`` is ``#edeff5`` or
    ``#ffffff`` and ends after the second ``</tr>`` seen from there, at
    which point the current result dict is stored.  Inside such a block:

    * ``<label title=...>`` sets the result title;
    * ``<a rel="nofollow" class="sourcelink" title=...>`` appends its title
      to the content;
    * any other ``<a href=...>`` sets the result url (module-level ``url``
      plus the href);
    * every text node is appended to the content, followed by a space.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.__start_processing = False

        self.results = []
        self.result = {}

        self.tr_counter = 0
        self.data_counter = 0

    @staticmethod
    def _attr_value(attrs, name):
        """Return the first value of attribute *name* in *attrs*, or None."""
        for key, value in attrs:
            if key == name:
                return value
        return None

    def handle_starttag(self, tag, attrs):
        if tag == 'tr' and (('bgcolor', '#edeff5') in attrs or
                            ('bgcolor', '#ffffff') in attrs):
            self.__start_processing = True

        if not self.__start_processing:
            return

        # A missing attribute used to raise IndexError (``[...][0]`` on an
        # empty list) and abort the whole parse; such tags are now ignored.
        if tag == 'label':
            title = self._attr_value(attrs, 'title')
            if title is not None:
                self.result['title'] = title
        elif tag == 'a' and ('rel', 'nofollow') in attrs\
                and ('class', 'sourcelink') in attrs:
            source = self._attr_value(attrs, 'title')
            if source is not None:
                self.result['content'] = self.result.get('content', '') + source + ' '
        elif tag == 'a':
            href = self._attr_value(attrs, 'href')
            if href is not None:
                self.result['url'] = url + href

    def handle_endtag(self, tag):
        if self.__start_processing is False:
            return

        if tag == 'tr':
            self.tr_counter += 1

            # the second closed row ends the current result
            if self.tr_counter == 2:
                self.__start_processing = False
                self.tr_counter = 0
                self.data_counter = 0
                self.results.append(self.result)
                self.result = {}

    def handle_data(self, data):
        if not self.__start_processing:
            return

        if 'content' in self.result:
            self.result['content'] += data + ' '
        else:
            self.result['content'] = data + ' '

        self.data_counter += 1
def request(query, params):
    """Build the Flickr API search url for the requested result page.

    The url template is filled with the url-encoded search text, the
    configured ``api_key`` and ``nb_per_page``, and ``params['pageno']``.
    """
    fields = {
        'text': urlencode({'text': query}),
        'api_key': api_key,
        'nb_per_page': nb_per_page,
        'page': params['pageno'],
    }
    params['url'] = url.format(**fields)
    return params
def _get_time_range_url(time_range):
    """Return the upload-date filter to append to the search URL.

    :param time_range: one of the keys of ``time_range_dict`` ('day', 'week',
        'month', 'year'); any other value (including ``None``) disables the
        filter.
    :returns: ``time_range_url`` filled in with integer Unix timestamps, or an
        empty string when no filter applies.
    """
    if time_range in time_range_dict:
        now = int(time())
        # The window must run from the past up to now: the lower bound is
        # "now minus the range", the upper bound is "now".
        return time_range_url.format(start=now - time_range_dict[time_range],
                                     end=now)
    return ''
# get response from search-request
def response(resp):
    """Parse a FramaLibre result page into a list of searx result dicts.

    Each result carries ``url``, ``title``, ``img_src`` (``None`` when the
    entry has no thumbnail) and ``content``. Entries without a title link are
    skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            # nothing to link to, so this node is not a usable result
            continue
        link = links[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))

        thumbnail = None
        thumbnail_tags = result.xpath(thumbnail_xpath)
        if thumbnail_tags:
            thumbnail_src = extract_text(thumbnail_tags[0])
            if thumbnail_src:
                # urljoin resolves site-relative paths against base_url
                # without doubling the slash and leaves absolute URLs as is
                thumbnail = urljoin(base_url, thumbnail_src)

        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results
# do search-request
def request(query, params):
    """Store the General Files search URL for *query* in ``params['url']``.

    The site addresses searches as ``/files-<first letter>/<query>/<page>``,
    so both the query and its first character become path segments and are
    percent-encoded before being inserted.
    """
    # same helper the other engines use; imported locally because this module
    # does not import it at the top
    from searx.url_utils import quote

    params['url'] = search_url.format(query=quote(query, safe=''),
                                      # slicing (not indexing) keeps the type
                                      # of the query and tolerates empty input
                                      letter=quote(query[:1], safe=''),
                                      pageno=params['pageno'])

    return params
# do search-request
def request(query, params):
    """Store the Gigablast search URL for *query* in ``params['url']``.

    Reads ``pageno``, ``language`` and ``safesearch`` from *params* and
    returns the updated *params* dict.
    """
    offset = (params['pageno'] - 1) * number_of_results

    if params['language'] == 'all':
        language = 'xx'
    else:
        language = params['language'].replace('-', '_').lower()
        # Only Chinese keeps its region suffix (e.g. zh_cn); everything else
        # is reduced to the bare language code. The separator is '_' here
        # because '-' was just replaced above.
        if language.split('_')[0] != 'zh':
            language = language.split('_')[0]

    # the ``ff`` URL parameter is a 0/1 switch
    safesearch_flag = 1 if params['safesearch'] >= 1 else 0

    search_path = search_string.format(query=urlencode({'q': query}),
                                       offset=offset,
                                       number_of_results=number_of_results,
                                       # first nine characters of the current
                                       # timestamp serve as the 9-digit number
                                       rxikd=str(time())[:9],
                                       lang=language,
                                       safesearch=safesearch_flag)

    params['url'] = base_url + search_path

    return params
# get response from search-request
def response(resp):
    """Turn a GitHub repository-search JSON reply into searx results.

    Returns an empty list when the payload has no ``items`` key. Each result
    holds the repository's ``html_url``, its ``name`` as title and the first
    500 characters of its description ('' when the description is empty).
    """
    payload = loads(resp.text)

    # check if items are received
    if 'items' not in payload:
        return []

    results = []
    for repo in payload['items']:
        title, url = repo['name'], repo['html_url']
        description = repo['description']

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': description[:500] if description else ''})

    # return results
    return results
https://en.wikipedia.org/wiki/List_of_Google_domains and tests +default_hostname = 'www.google.com' + +country_to_hostname = { + 'BG': 'www.google.bg', # Bulgaria + 'CZ': 'www.google.cz', # Czech Republic + 'DE': 'www.google.de', # Germany + 'DK': 'www.google.dk', # Denmark + 'AT': 'www.google.at', # Austria + 'CH': 'www.google.ch', # Switzerland + 'GR': 'www.google.gr', # Greece + 'AU': 'www.google.com.au', # Australia + 'CA': 'www.google.ca', # Canada + 'GB': 'www.google.co.uk', # United Kingdom + 'ID': 'www.google.co.id', # Indonesia + 'IE': 'www.google.ie', # Ireland + 'IN': 'www.google.co.in', # India + 'MY': 'www.google.com.my', # Malaysia + 'NZ': 'www.google.co.nz', # New Zealand + 'PH': 'www.google.com.ph', # Philippines + 'SG': 'www.google.com.sg', # Singapore + # 'US': 'www.google.us', # United States, redirect to .com + 'ZA': 'www.google.co.za', # South Africa + 'AR': 'www.google.com.ar', # Argentina + 'CL': 'www.google.cl', # Chile + 'ES': 'www.google.es', # Spain + 'MX': 'www.google.com.mx', # Mexico + 'EE': 'www.google.ee', # Estonia + 'FI': 'www.google.fi', # Finland + 'BE': 'www.google.be', # Belgium + 'FR': 'www.google.fr', # France + 'IL': 'www.google.co.il', # Israel + 'HR': 'www.google.hr', # Croatia + 'HU': 'www.google.hu', # Hungary + 'IT': 'www.google.it', # Italy + 'JP': 'www.google.co.jp', # Japan + 'KR': 'www.google.co.kr', # South Korea + 'LT': 'www.google.lt', # Lithuania + 'LV': 'www.google.lv', # Latvia + 'NO': 'www.google.no', # Norway + 'NL': 'www.google.nl', # Netherlands + 'PL': 'www.google.pl', # Poland + 'BR': 'www.google.com.br', # Brazil + 'PT': 'www.google.pt', # Portugal + 'RO': 'www.google.ro', # Romania + 'RU': 'www.google.ru', # Russia + 'SK': 'www.google.sk', # Slovakia + 'SL': 'www.google.si', # Slovenia (SL -> si) + 'SE': 'www.google.se', # Sweden + 'TH': 'www.google.co.th', # Thailand + 'TR': 'www.google.com.tr', # Turkey + 'UA': 'www.google.com.ua', # Ukraine + # 'CN': 'www.google.cn', # China, only from China ? 
# remove google-specific tracking-url
def parse_url(url_string, google_hostname):
    """Unwrap a Google redirect link and return the target URL.

    :param url_string: URL taken from the result page, or ``None``.
    :param google_hostname: hostname the result page was fetched from.
    :returns: the value of the ``q`` parameter when *url_string* points at
        ``redirect_path`` on *google_hostname* (or is host-relative);
        otherwise *url_string* unchanged. ``None`` is passed through.
    """
    # sanity check
    if url_string is None:
        return url_string

    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in (google_hostname, '')
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        # a redirect link without a 'q' target cannot be unwrapped: hand the
        # link back untouched instead of raising KeyError
        return query.get('q', url_string)

    # normal case
    return url_string
# do search-request
def request(query, params):
    """Store the Google web-search URL and request headers in *params*.

    Sets ``params['url']``, the ``Accept-Language`` and ``Accept`` headers and
    ``params['google_hostname']`` (the hostname the request is sent to), then
    returns *params*.
    """
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language, country, url_lang = 'en', 'US', ''
    elif params['language'][:2] == 'jv':
        # 'jv' is sent to Google as 'jw'
        language, country, url_lang = 'jw', 'ID', 'lang_jw'
    else:
        parts = params['language'].lower().split('-')
        country = parts[1] if len(parts) == 2 else 'US'
        language = parts[0] + ',' + parts[0] + '-' + country
        url_lang = 'lang_' + parts[0]

    # country-specific domain only when enabled and known for this country
    google_hostname = default_hostname
    if use_locale_domain:
        google_hostname = country_to_hostname.get(country.upper(), default_hostname)

    params['url'] = search_url.format(hostname=google_hostname,
                                      query=urlencode({'q': query}),
                                      offset=offset,
                                      lang=url_lang)
    time_range = params['time_range']
    if time_range in time_range_dict:
        params['url'] += time_range_search.format(range=time_range_dict[time_range])

    headers = params['headers']
    headers['Accept-Language'] = language
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'

    params['google_hostname'] = google_hostname

    return params
+ google_hostname = resp.search_params.get('google_hostname') + google_url = "https://" + google_hostname + + # convert the text to dom + dom = html.fromstring(resp.text) + + instant_answer = dom.xpath('//div[@id="_vBb"]//text()') + if instant_answer: + results.append({'answer': u' '.join(instant_answer)}) + try: + results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0] + .split()[1].replace(',', '')) + results.append({'number_of_results': results_num}) + except: + pass + + # parse results + for result in dom.xpath(results_xpath): + try: + title = extract_text(result.xpath(title_xpath)[0]) + url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) + parsed_url = urlparse(url, google_hostname) + + # map result + if parsed_url.netloc == google_hostname: + # TODO fix inside links + continue + # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start): + # print "yooooo"*30 + # x = result.xpath(map_near) + # if len(x) > 0: + # # map : near the location + # results = results + parse_map_near(parsed_url, x, google_hostname) + # else: + # # map : detail about a location + # results = results + parse_map_detail(parsed_url, result, google_hostname) + # # google news + # elif parsed_url.path == search_path: + # # skipping news results + # pass + + # # images result + # elif parsed_url.path == images_path: + # # only thumbnail image provided, + # # so skipping image results + # # results = results + parse_images(result, google_hostname) + # pass + + else: + # normal result + content = extract_text_from_dom(result, content_xpath) + if content is None: + continue + content_misc = extract_text_from_dom(result, content_misc_xpath) + if content_misc is not None: + content = content_misc + "<br />" + content + # append result + results.append({'url': url, + 'title': title, + 'content': content + }) + except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) + continue + + 
def parse_map_near(parsed_url, x, google_hostname):
    """Build results for the "near the location" entries of a map result.

    :param parsed_url: parsed URL of the enclosing result (not used here).
    :param x: iterable of DOM nodes, one per nearby place.
    :param google_hostname: hostname used to unwrap redirect links.
    :returns: list of result dicts with ``title``, ``url`` and an HTML
        attribute table as ``content``.
    """
    results = []

    for result in x:
        title = extract_text_from_dom(result, map_near_title)
        url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)
        attributes = []
        phone = extract_text_from_dom(result, map_near_phone)
        # extract_text_from_dom returns None when the node is absent; only
        # build the tel: link when there is a number to put in it
        if phone:
            add_attributes(attributes, property_phone, phone, 'tel:' + phone)
        results.append({'title': title,
                        'url': url,
                        'content': attributes_to_html(attributes)
                        })

    return results
def attributes_to_html(attributes):
    """Render attribute dicts as an HTML table, one row per attribute.

    Each attribute supplies ``label`` and ``value``; when it also has a
    ``url`` key the value is wrapped in a link to that URL.
    """
    rows = []
    for attribute in attributes:
        cell = attribute.get('value')
        if 'url' in attribute:
            cell = '<a href="' + attribute.get('url') + '">' + cell + '</a>'
        rows.append('<tr><th>' + attribute.get('label') + '</th><td>' + cell + '</td></tr>')
    return '<table class="table table-striped">' + ''.join(rows) + '</table>'
time_range_support = True
number_of_results = 100

search_url = 'https://www.google.com/search'\
    '?{query}'\
    '&asearch=ichunk'\
    '&async=_id:rg_s,_pms:s'\
    '&tbm=isch'\
    '&yv=2'\
    '&{search_options}'
# value of the ``tbs`` option for the predefined ranges in time_range_dict
time_range_attr = "qdr:{range}"
# value of the ``tbs`` option for an explicit date window (used for 'year');
# every field is a "key:value" pair, including cd_max
time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}
# get response from search-request
def response(resp):
    """Parse a Google News result page into a list of searx result dicts.

    Each result has ``url``, ``title`` and ``content``; ``img_src`` is added
    when the entry has an image whose source does not start with ``data``.
    Entries without a headline link are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
        links = result.xpath('.//div[@class="_cnc"]//a/@href')
        if not links:
            # no headline link: skip explicitly rather than rely on a
            # catch-all except around the indexing
            continue

        r = {
            'url': links[0],
            'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')),
            'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
        }

        imgs = result.xpath('.//img/@src')
        if imgs and not imgs[0].startswith('data'):
            r['img_src'] = imgs[0]

        results.append(r)

    # return results
    return results
# do search-request
def request(query, params):
    """Store the INA search URL for *query* in ``params['url']``.

    The start offset passed to the site is ``pageno * page_size``.
    """
    # NOTE(review): page 1 maps to offset ``page_size`` rather than 0 -
    # confirm that this is what the endpoint's ``b`` parameter expects.
    start_offset = params['pageno'] * page_size
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(ps=page_size,
                                      start=start_offset,
                                      query=encoded_query)

    return params
+ response = loads(resp.text) + if "content" not in response: + return [] + dom = html.fromstring(response["content"]) + p = HTMLParser() + + # parse results + for result in dom.xpath(results_xpath): + videoid = result.xpath(url_xpath)[0] + url = base_url + videoid + title = p.unescape(extract_text(result.xpath(title_xpath))) + thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) + if thumbnail[0] == '/': + thumbnail = base_url + thumbnail + d = extract_text(result.xpath(publishedDate_xpath)[0]) + d = d.split('/') + # force ISO date to avoid wrong parsing + d = "%s-%s-%s" % (d[2], d[1], d[0]) + publishedDate = parser.parse(d) + content = extract_text(result.xpath(content_xpath)) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'publishedDate': publishedDate, + 'thumbnail': thumbnail}) + + # return results + return results diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py new file mode 100644 index 0000000..67d6a5a --- /dev/null +++ b/searx/engines/json_engine.py @@ -0,0 +1,118 @@ +from collections import Iterable +from json import loads +from sys import version_info +from searx.url_utils import urlencode + +if version_info[0] == 3: + unicode = str + +search_url = None +url_query = None +content_query = None +title_query = None +paging = False +suggestion_query = '' +results_query = '' + +# parameters for engines with paging support +# +# number of results on each page +# (only needed if the site requires not a page number, but an offset) +page_size = 1 +# number of the first page (usually 0 or 1) +first_page_num = 1 + + +def iterate(iterable): + if type(iterable) == dict: + it = iterable.items() + + else: + it = enumerate(iterable) + for index, value in it: + yield str(index), value + + +def is_iterable(obj): + if type(obj) == str: + return False + if type(obj) == unicode: + return False + return isinstance(obj, Iterable) + + +def parse(query): + q = [] + for part 
def response(resp):
    """Extract results (and optional suggestions) from a JSON reply.

    When ``results_query`` is set, the first container it matches is walked
    and ``url_query`` / ``title_query`` / ``content_query`` are evaluated
    relative to each item.  Otherwise the three queries are evaluated on the
    whole document and zipped together.

    A reply in which ``results_query`` matches nothing yields no results
    instead of raising ``IndexError``.  Items without a url or title are
    skipped, and a missing content falls back to an empty string.
    """
    results = []
    data = loads(resp.text)

    if results_query:
        containers = query(data, results_query)
        # only the first matching container is used; no match means no results
        items = containers[0] if containers else []
        for item in items:
            urls = query(item, url_query)
            titles = query(item, title_query)
            if not urls or not titles:
                # a result without a link or a title cannot be displayed
                continue
            contents = query(item, content_query)
            results.append({'url': urls[0],
                            'title': titles[0],
                            'content': contents[0] if contents else ''})
    else:
        for url, title, content in zip(
            query(data, url_query),
            query(data, title_query),
            query(data, content_query)
        ):
            results.append({'url': url, 'title': title, 'content': content})

    if suggestion_query:
        for suggestion in query(data, suggestion_query):
            results.append({'suggestion': suggestion})

    return results
def request(query, params):
    """Set ``params['url']`` to the Kickass search URL for *query*.

    The query is percent-quoted and placed in the URL path together with the
    requested page number.  The same *params* dict is returned.
    """
    quoted_term = quote(query)
    page_number = params['pageno']

    params['url'] = search_url.format(search_term=quoted_term, pageno=page_number)
    return params
results.append({'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'files': files, + 'magnetlink': magnetlink, + 'torrentfile': torrentfileurl, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py new file mode 100644 index 0000000..0607ac9 --- /dev/null +++ b/searx/engines/mediawiki.py @@ -0,0 +1,90 @@ +""" + general mediawiki-engine (Web) + + @website websites built on mediawiki (https://www.mediawiki.org) + @provide-api yes (http://www.mediawiki.org/wiki/API:Search) + + @using-api yes + @results JSON + @stable yes + @parse url, title + + @todo content +""" + +from json import loads +from string import Formatter +from searx.url_utils import urlencode, quote + +# engine dependent config +categories = ['general'] +language_support = True +paging = True +number_of_results = 1 +search_type = 'nearmatch' # possible values: title, text, nearmatch + +# search-url +base_url = 'https://{language}.wikipedia.org/' +search_postfix = 'w/api.php?action=query'\ + '&list=search'\ + '&{query}'\ + '&format=json'\ + '&sroffset={offset}'\ + '&srlimit={limit}'\ + '&srwhat={searchtype}' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'srsearch': query}), + offset=offset, + limit=number_of_results, + searchtype=search_type) + + format_strings = list(Formatter().parse(base_url)) + + if params['language'] == 'all': + language = 'en' + else: + language = params['language'].split('-')[0] + + # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] + if any(x[1] == 'language' for x in format_strings): + string_args['language'] = language + + # write search-language back to params, required in response + params['language'] = language + + search_url = 
def response(resp):
    """Turn a MediaWiki search API reply into a list of searx results.

    Returns an empty list when the reply carries no ``query.search`` entries.
    Hits whose snippet starts with ``#REDIRECT`` are skipped.  Each result URL
    is built from ``base_url`` (formatted with the language stored in
    ``resp.search_params``) plus the quoted page title.
    """
    hits = loads(resp.text).get('query', {}).get('search')

    # nothing found
    if not hits:
        return []

    results = []
    for hit in hits:
        if hit.get('snippet', '').startswith('#REDIRECT'):
            continue

        wiki_root = base_url.format(language=resp.search_params['language'])
        # page titles use underscores instead of spaces in the URL
        page_name = quote(hit['title'].replace(' ', '_').encode('utf-8'))

        results.append({'url': wiki_root + 'wiki/' + page_name,
                        'title': hit['title'],
                        'content': ''})

    return results
def int_or_zero(num):
    """Convert *num* to an integer, or return 0 if it is not a number.

    *num* may be a string, an integer or a list (as returned by an xpath
    query); for a list only the first element is considered and an empty
    list gives 0.  Surrounding whitespace is ignored, since text nodes
    frequently carry it.  Anything that cannot be read as a non-negative
    decimal number (including ``None``) gives 0 instead of raising.
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]

    if isinstance(num, int):
        return int(num)

    try:
        text = num.strip()
        # isdigit() also accepts characters such as superscript digits that
        # int() rejects, hence the ValueError guard
        return int(text) if text.isdigit() else 0
    except (AttributeError, ValueError):
        return 0
def request(query, params):
    """Set ``params['url']`` to the Nyaa search URL for *query*.

    The query is sent as the url-encoded ``term`` parameter and the page
    number is passed through unchanged as ``offset``.  The same *params*
    dict is returned.
    """
    encoded_term = urlencode({'term': query})
    page_number = params['pageno']

    params['url'] = search_url.format(query=encoded_term, offset=page_number)
    return params
def response(resp):
    """Convert a Nominatim JSON reply into searx map results.

    Entries without ``display_name`` are skipped.  For nodes without a
    geojson shape a Point is synthesised from ``lon``/``lat``.  An address
    block is only produced for amenity/shop/tourism/leisure entries that
    have a name; otherwise ``address`` is ``None``.

    Entries lacking ``address``, ``class`` or ``type`` are tolerated (the
    address is then ``None``) instead of raising.
    """
    results = []

    # parse results
    for r in loads(resp.text):
        if 'display_name' not in r:
            continue

        title = r['display_name'] or u''
        osm_type = r.get('osm_type', r.get('type'))
        osm_id = r['osm_id']
        url = result_base_url.format(osm_type=osm_type, osm_id=osm_id)
        osm = {'type': osm_type, 'id': osm_id}

        geojson = r.get('geojson')

        # if no geojson is found and osm_type is a node, add geojson Point
        if not geojson and osm_type == 'node':
            geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]}

        # the address block may be missing or null in the reply
        address_raw = r.get('address') or {}
        address = None

        # only named points of interest get an address
        if r.get('class') in ('amenity', 'shop', 'tourism', 'leisure'):
            name = address_raw.get('address29') or address_raw.get(r.get('type'))
            if name:
                address = {'name': name,
                           'house_number': address_raw.get('house_number'),
                           'road': address_raw.get('road'),
                           'locality': address_raw.get('city',
                                                       address_raw.get('town',
                                                                       address_raw.get('village'))),
                           'postcode': address_raw.get('postcode'),
                           'country': address_raw.get('country'),
                           'country_code': address_raw.get('country_code')}

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': r['lon'],
                        'latitude': r['lat'],
                        'boundingbox': r['boundingbox'],
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    return results
def construct_body(result):
    """Build ``[title, content, img_src]`` for one PDBe entry.

    ``content`` is an HTML citation line.  It is ``None`` when a field it
    needs is missing from *result* or when the author list is empty.
    ``img_src`` is the preview image URL, or ``None`` when *result* has no
    ``pdb_id``; a missing ``pdb_id`` no longer discards the content.

    Raises ``KeyError`` if *result* has no ``title``.
    """
    title = result['title']

    template = '{title}<br />{authors} {journal} <strong>{volume}</strong> {page} ({year})'

    # replace placeholders with actual content
    try:
        if result['journal']:
            content = template.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal=result['journal'],
                volume=result['journal_volume'],
                page=result['journal_page'],
                year=result['citation_year'])
        else:
            # entries without a journal fall back to the release year
            content = template.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal='', volume='', page='',
                year=result['release_year'])
    except (KeyError, IndexError):
        content = None

    # construct url for preview image
    try:
        img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
    except KeyError:
        img_src = None

    return [title, content, img_src]
def request(query, params):
    """Set ``params['url']`` and the User-Agent header for a Photon search.

    The ``lang`` parameter is appended only when the primary language
    subtag of ``params['language']`` is listed in ``supported_languages``.
    Both ``de-DE`` and ``de_DE`` style locales are accepted; splitting on
    ``_`` alone never matched hyphenated locales.  The same *params* dict
    is returned.
    """
    params['url'] = base_url + search_string.format(query=urlencode({'q': query}),
                                                    limit=number_of_results)

    if params['language'] != 'all':
        language = params['language'].replace('_', '-').split('-')[0]
        if language in supported_languages:
            params['url'] = params['url'] + "&lang=" + language

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    return params
+ continue + + # get title + title = properties.get('name') + + # get osm-type + if properties.get('osm_type') == 'N': + osm_type = 'node' + elif properties.get('osm_type') == 'W': + osm_type = 'way' + elif properties.get('osm_type') == 'R': + osm_type = 'relation' + else: + # continue if invalide osm-type + continue + + url = result_base_url.format(osm_type=osm_type, + osm_id=properties.get('osm_id')) + + osm = {'type': osm_type, + 'id': properties.get('osm_id')} + + geojson = r.get('geometry') + + if properties.get('extent'): + boundingbox = [properties.get('extent')[3], + properties.get('extent')[1], + properties.get('extent')[0], + properties.get('extent')[2]] + else: + # TODO: better boundingbox calculation + boundingbox = [geojson['coordinates'][1], + geojson['coordinates'][1], + geojson['coordinates'][0], + geojson['coordinates'][0]] + + # address calculation + address = {} + + # get name + if properties.get('osm_key') == 'amenity' or\ + properties.get('osm_key') == 'shop' or\ + properties.get('osm_key') == 'tourism' or\ + properties.get('osm_key') == 'leisure': + address = {'name': properties.get('name')} + + # add rest of adressdata, if something is already found + if address.get('name'): + address.update({'house_number': properties.get('housenumber'), + 'road': properties.get('street'), + 'locality': properties.get('city', + properties.get('town', # noqa + properties.get('village'))), # noqa + 'postcode': properties.get('postcode'), + 'country': properties.get('country')}) + else: + address = None + + # append result + results.append({'template': 'map.html', + 'title': title, + 'content': '', + 'longitude': geojson['coordinates'][0], + 'latitude': geojson['coordinates'][1], + 'boundingbox': boundingbox, + 'geojson': geojson, + 'address': address, + 'osm': osm, + 'url': url}) + + # return results + return results diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py new file mode 100644 index 0000000..a5af8d8 --- /dev/null +++ 
def request(query, params):
    """Set ``params['url']`` to the Piratebay search URL for *query*.

    The category in ``params['category']`` is mapped through
    ``search_types`` (falling back to ``'0'``), and the page number is
    decremented by one before being placed in the URL.  The same *params*
    dict is returned.
    """
    type_code = search_types.get(params['category'], '0')
    quoted_term = quote(query)
    page_index = params['pageno'] - 1

    params['url'] = search_url.format(search_term=quoted_term,
                                      search_type=type_code,
                                      pageno=page_index)
    return params
result.xpath(magnet_xpath)[0] + torrentfile_links = result.xpath(torrent_xpath) + if torrentfile_links: + torrentfile_link = torrentfile_links[0].attrib.get('href') + else: + torrentfile_link = None + + # append result + results.append({'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'magnetlink': magnetlink.attrib.get('href'), + 'torrentfile': torrentfile_link, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py new file mode 100644 index 0000000..3d266e2 --- /dev/null +++ b/searx/engines/qwant.py @@ -0,0 +1,140 @@ +""" + Qwant (Web, Images, News, Social) + + @website https://qwant.com/ + @provide-api not officially (https://api.qwant.com/api/search/) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content +""" + +from datetime import datetime +from json import loads +from searx.utils import html_to_text +from searx.url_utils import urlencode + +# engine dependent config +categories = None +paging = True +language_support = True +supported_languages_url = 'https://qwant.com/region' + +category_to_keyword = {'general': 'web', + 'images': 'images', + 'news': 'news', + 'social media': 'social'} + +# search-url +url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + + if categories[0] and categories[0] in category_to_keyword: + + params['url'] = url.format(keyword=category_to_keyword[categories[0]], + query=urlencode({'q': query}), + offset=offset) + else: + params['url'] = url.format(keyword='web', + query=urlencode({'q': query}), + offset=offset) + + # add language tag if specified + if params['language'] != 'all': + if params['language'] == 'no' or params['language'].startswith('no-'): + params['language'] = params['language'].replace('no', 
def response(resp):
    """Convert a Qwant API reply into searx results.

    Returns an empty list when the reply has no ``data`` key.  The shape of
    each result depends on the keyword that ``category_to_keyword`` assigns
    to the engine's first category: web, images, social or news.  Items are
    dropped for any other keyword.
    """
    payload = loads(resp.text)

    # nothing to parse
    if 'data' not in payload:
        return []

    items = payload['data'].get('result', {}).get('items', {})

    results = []
    for item in items:
        title = html_to_text(item['title'])
        res_url = item['url']
        content = html_to_text(item['desc'])

        keyword = category_to_keyword.get(categories[0], '')

        if keyword == 'web':
            entry = {'title': title,
                     'content': content,
                     'url': res_url}

        elif keyword == 'images':
            entry = {'template': 'images.html',
                     'url': res_url,
                     'title': title,
                     'content': '',
                     'thumbnail_src': item['thumbnail'],
                     'img_src': item['media']}

        elif keyword == 'social':
            entry = {'url': res_url,
                     'title': title,
                     'publishedDate': datetime.fromtimestamp(item['date'], None),
                     'content': content,
                     'img_src': item.get('img', None)}

        elif keyword == 'news':
            published_date = datetime.fromtimestamp(item['date'], None)
            media = item.get('media', [])
            # the picture of the first media entry, if any, is the preview
            img_src = media[0].get('pict', {}).get('url', None) if len(media) > 0 else None
            entry = {'url': res_url,
                     'title': title,
                     'publishedDate': published_date,
                     'content': content,
                     'img_src': img_src}

        else:
            continue

        results.append(entry)

    return results
def request(query, params):
    """Set ``params['url']`` to the Reddit search URL for *query*.

    The query string carries the search terms (``q``) and ``page_size`` as
    ``limit``.  The same *params* dict is returned.
    """
    params['url'] = search_url.format(query=urlencode({'q': query, 'limit': page_size}))
    return params
def request(query, params):
    """Prepare a JSON POST request against the ScanR structures search API.

    Sets the url, the method, the ``Content-type`` header and a JSON body
    containing the query, fixed search/sort options, the page number and
    ``page_size``.  The same *params* dict is returned.
    """
    params['url'] = search_url
    params['method'] = 'POST'
    params['headers']['Content-type'] = "application/json"

    body = {"query": query,
            "searchField": "ALL",
            "sortDirection": "ASC",
            "sortOrder": "RELEVANCY",
            "page": params['pageno'],
            "pageSize": page_size}
    params['data'] = dumps(body)

    return params
def request(query, params):
    """Set ``params['url']`` to the Searchcode code-search URL for *query*.

    The page number is decremented by one before being sent as ``p``.  The
    same *params* dict is returned.
    """
    encoded_query = urlencode({'q': query})
    page_index = params['pageno'] - 1

    params['url'] = search_url.format(query=encoded_query, pageno=page_index)
    return params
# get response from search-request
def response(resp):
    """Turn a searchcode documentation-search reply into result dicts.

    ``resp.text`` is parsed as JSON; every entry of its ``results`` list
    (an absent list counts as empty) becomes a dict with ``url``,
    ``title`` (formatted as ``"[type] namespace name"``) and ``content``
    (the entry's description).
    """
    payload = loads(resp.text)
    return [
        {
            'url': entry['url'],
            'title': "[{}] {} {}".format(entry['type'], entry['namespace'], entry['name']),
            'content': entry['description'],
        }
        for entry in payload.get('results', [])
    ]
# get response from search-request
def response(resp):
    """Flatten the JSON reply of a remote searx instance into one list.

    The returned list is the reply's ``results`` list, extended in place
    with the entries of ``answers`` and ``infoboxes``, one
    ``{'suggestion': ...}`` dict per suggestion, and finally a single
    ``{'number_of_results': ...}`` dict.
    """
    payload = loads(resp.text)

    merged = payload['results']
    merged += payload['answers']
    merged += payload['infoboxes']
    merged += [{'suggestion': text} for text in payload['suggestions']]
    merged.append({'number_of_results': payload['number_of_results']})

    return merged
def _seed_sort_key(result):
    """Return the scraped seeder count of ``result`` as an int (0 if unparsable)."""
    try:
        # NOTE(review): assumes counters may carry thousands separators -- confirm
        return int(str(result['seed']).replace(',', '').strip())
    except ValueError:
        return 0


# get response from search-request
def response(resp):
    """Parse a seedpeer result page into torrent result dicts.

    The page is queried with the primary xpaths first; when those find no
    torrent links the ``alternative_*`` xpaths are used instead (the site
    uses a different layout for short result lists).  Returns a list of
    dicts (``url``, ``title``, ``content``, ``seed``, ``leech``,
    ``template``) sorted by seeder count, highest first, or ``[]`` when
    no links are found at all.
    """
    dom = html.fromstring(resp.text)
    torrent_links = dom.xpath(torrent_xpath)
    if torrent_links:
        seeds = dom.xpath(seeds_xpath)
        peers = dom.xpath(peers_xpath)
        titles = dom.xpath(title_xpath)
        sizes = dom.xpath(size_xpath)
        ages = dom.xpath(age_xpath)
    else:  # under ~5 results uses a different xpath
        torrent_links = dom.xpath(alternative_torrent_xpath)
        seeds = dom.xpath(alternative_seeds_xpath)
        peers = dom.xpath(alternative_peers_xpath)
        titles = dom.xpath(alternative_title_xpath)
        sizes = dom.xpath(alternative_size_xpath)
        ages = dom.xpath(alternative_age_xpath)

    # return empty array if nothing is found
    if not torrent_links:
        return []

    results = []
    # zip() stops at the shortest column, so a row with a missing cell can
    # no longer raise IndexError half-way through the page
    for link, title, size, age, seed, leech in zip(torrent_links, titles, sizes,
                                                   ages, seeds, peers):
        # title_xpath selects text nodes (plain strings) while
        # alternative_title_xpath selects elements; accept both
        if hasattr(title, 'text_content'):
            title = title.text_content()

        results.append({'url': urljoin(url, link.attrib.get('href')),
                        'title': title,
                        'content': '{}, {}'.format(size, age),
                        'seed': seed,
                        'leech': leech,
                        'template': 'torrent.html'})

    # the scraped seed values are strings; sort on their numeric value so
    # that e.g. '100' ranks above '9'
    return sorted(results, key=_seed_sort_key, reverse=True)
def get_client_id():
    """Scrape a guest ``client_id`` from the soundcloud.com web application.

    Fetches the landing page, then requests each ``<script>`` whose ``src``
    contains ``/assets/app`` and searches its text with ``cid_re``.  The
    first captured id is returned.  If the landing page request fails or
    no script yields a match, a warning is logged and ``""`` is returned.
    """
    landing = http_get("https://soundcloud.com")

    if landing.ok:
        document = html.fromstring(landing.content)
        script_urls = []
        for script_tag in document.xpath("//script[contains(@src, '/assets/app')]"):
            if script_tag is not None:
                script_urls.append(script_tag.get('src'))

        # fetch every candidate script and look for the client id in it
        for script_url in script_urls:
            script = http_get(script_url)
            if not script.ok:
                continue
            found = cid_re.search(script.text)
            if found is not None and len(found.groups()):
                return found.groups()[0]

    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
# do search-request
def request(query, params):
    """Set ``params['url']`` to the Spotify track-search URL.

    The ``offset`` argument is ``(params['pageno'] - 1) * 20``.  The same
    ``params`` dict is returned.
    """
    page_index = params['pageno'] - 1
    params['url'] = search_url.format(
        query=urlencode({'q': query}),
        offset=page_index * 20)
    return params
# get response from search-request
def response(resp):
    """Parse a Stackoverflow search page into result dicts.

    Every node matched by ``results_xpath`` that contains a link matched
    by ``link_xpath`` yields a dict with ``url`` (the link's href joined
    onto the site URL), ``title`` (link text) and ``content`` (text of
    ``content_xpath``).  Nodes without such a link are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            # no title link in this summary: skip it instead of failing the
            # whole page with an IndexError
            continue
        link = links[0]

        # append result
        results.append({'url': urljoin(url, link.attrib.get('href')),
                        'title': extract_text(link),
                        'content': extract_text(result.xpath(content_xpath))})

    # return results
    return results
# get response from search-request
def response(resp):
    """Parse a Startpage result page into result dicts.

    Results without a link, and results whose URL is a Google ad link or a
    Startpage/ixquick search link, are dropped.  When the description
    starts with a date prefix ("2 Sep 2014 ... " or "5 days ago ... ") the
    prefix is removed from ``content`` and stored as ``publishedDate``.
    """
    # url's that must not show up as results
    blocked_url_patterns = (
        # google-ad url's
        r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$",
        # startpage search url's
        r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$",
        # ixquick search url's
        r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$",
    )

    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        if any(re.match(pattern, url) for pattern in blocked_url_patterns):
            continue

        title = extract_text(link)

        description = result.xpath('./p[@class="desc clk"]')
        content = extract_text(description) if description else ''

        published_date = None

        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            # absolute date prefix, e.g. "2 Sep 2014 ... "
            date_pos = content.find('...') + 4
            published_date = parser.parse(content[0:date_pos - 5], dayfirst=True)
            content = content[date_pos:]
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            # relative date prefix, e.g. "5 days ago ... "
            date_pos = content.find('...') + 4
            days_ago = int(re.match(r'\d+', content[0:date_pos - 5]).group())
            published_date = datetime.now() - timedelta(days=days_ago)
            content = content[date_pos:]

        entry = {'url': url,
                 'title': title,
                 'content': content}
        if published_date:
            entry['publishedDate'] = published_date
        results.append(entry)

    return results
# get response from search-request
def response(resp):
    """Parse a Subtitleseeker result page into result dicts.

    Each node matched by ``results_xpath`` that contains an anchor yields
    a dict with ``url``, ``title`` and ``content``.  A language name is
    appended to the url as a path segment: the module-level ``language``
    setting when it is non-empty, otherwise a name derived from
    ``resp.search_params['language']`` (none for ``'all'`` or for a
    language that is not listed in ``language_codes``).
    """
    results = []

    dom = html.fromstring(resp.text)

    search_lang = ""
    requested = resp.search_params['language']

    # dirty fix for languages named differently on their site
    if requested[:2] == 'fa':
        search_lang = 'Farsi'
    elif requested == 'pt-BR':
        search_lang = 'Brazilian'
    elif requested != 'all':
        prefix = requested.split('-')[0]
        names = [lc[3]
                 for lc in language_codes
                 if lc[0].split('-')[0] == prefix]
        # an unknown language simply gets no language suffix instead of
        # raising IndexError
        if names:
            search_lang = names[0].split(' (')[0]

    # parse results
    for result in dom.xpath(results_xpath):
        anchors = result.xpath(".//a")
        if not anchors:
            continue
        link = anchors[0]
        href = link.attrib.get('href')

        # compare by value: identity comparison against a string literal
        # only works by accident of interning
        if language != "":
            href = href + language + '/'
        elif search_lang:
            href = href + search_lang + '/'

        title = extract_text(link)

        content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
        content = content + " - "
        text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0])
        content = content + text

        spans = result.xpath(".//span")
        if spans != []:
            content = content + " - (" + extract_text(spans) + ")"

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
# do search-request
def request(query, params):
    """Set ``params['url']`` to the Swisscows search URL for ``query``.

    For language ``'all'`` both ``uiLanguage`` and ``region`` are sent as
    ``'browser'``.  Otherwise ``uiLanguage`` is the language prefix and
    ``region`` the full language code, except that Norwegian (``no``) is
    sent as region ``nb-NO``.  For the ``images`` category the path is
    prefixed with ``image``.  The same ``params`` dict is returned.
    """
    if params['language'] == 'all':
        ui_language = 'browser'
        region = 'browser'
    else:
        # always derive ui_language; previously the Norwegian branch left
        # it unassigned and the format() call below raised
        # UnboundLocalError
        # NOTE(review): 'no' as uiLanguage for Norwegian is an assumption,
        # confirm against the site
        ui_language = params['language'].split('-')[0]
        if ui_language == 'no':
            region = 'nb-NO'
        else:
            region = params['language']

    search_path = search_string.format(
        query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),
        page=params['pageno']
    )

    # image search query is something like 'image?{query}&page={page}'
    if params['category'] == 'images':
        search_path = 'image' + search_path

    params['url'] = base_url + search_path

    return params
# get supported languages from their site
def _fetch_supported_languages(resp):
    """Collect the region codes offered in the Swisscows regions popup.

    Reads the ``data-val`` attribute of every region link in ``resp.text``
    and returns the codes as a list; codes starting with ``nb-`` have
    that ``nb`` replaced by ``no``.
    """
    def to_local_code(site_code):
        # the site's 'nb-..' codes are exposed as 'no-..'
        if site_code.startswith('nb-'):
            return site_code.replace('nb', 'no', 1)
        return site_code

    region_links = fromstring(resp.text).xpath('//div[@id="regions-popup"]//ul/li/a')
    return [to_local_code(anchor.xpath('./@data-val')[0]) for anchor in region_links]
# get response from search-request
def response(resp):
    """Parse a Tokyo Toshokan listing page into torrent result dicts.

    Each result occupies two consecutive table rows: the first carries the
    links (optional magnet link plus the torrent link), the second the
    ``Size:`` / ``Date:`` / ``Comment:`` description and the seed/leech
    stats.  Returns ``[]`` when no rows are found or the row count is odd,
    i.e. when the page layout is not the expected one.
    """
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results or page layout was changed so we cannot parse it
    # currently there are two rows for each result, so total count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)

    # processing the results, two rows at a time
    for name_row, info_row in zip(rows[0::2], rows[1::2]):
        # parse the first row
        links = name_row.xpath('./td[@class="desc-top"]/a')
        if not links:
            # nothing to link to; skip instead of raising IndexError
            continue

        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }
        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet and magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        desc_cells = info_row.xpath('./td[@class="desc-bot"]')
        desc = extract_text(desc_cells[0]) if desc_cells else ''
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    # ('1.228', 'GB')
                    groups = size_re.match(item).groups()
                    multiplier = get_filesize_mul(groups[1])
                    params['filesize'] = int(multiplier * float(groups[0]))
                except Exception:
                    # best effort: an unparsable size is simply left out
                    # (Exception, not a bare except, so that SystemExit and
                    # KeyboardInterrupt still propagate)
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    params['publishedDate'] = date
                except ValueError:
                    # strptime() signals a format mismatch with ValueError
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item

        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet?
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
def request(query, params):
    """Prepare a MyMemory translation request from a "<from>-<to> <text>" query.

    When the query does not match ``parser_re`` or either language is
    rejected by ``is_valid_lang``, ``params`` is returned untouched.
    Otherwise ``params['url']`` is set and the text plus both
    ``is_valid_lang`` results are stored under ``query``, ``from_lang``
    and ``to_lang`` for later use.
    """
    match = parser_re.match(unicode(query, 'utf8'))
    if not match:
        return params

    source_code, target_code, text = match.groups()

    from_lang = is_valid_lang(source_code)
    to_lang = is_valid_lang(target_code)

    if not (from_lang and to_lang):
        return params

    # the key argument is only sent when an api key is configured
    key_form = '&key=' + api_key if api_key else ''

    params['url'] = url.format(from_lang=from_lang[1],
                               to_lang=to_lang[1],
                               query=text,
                               key=key_form)
    params['query'] = text
    params['from_lang'] = from_lang
    params['to_lang'] = to_lang

    return params
# get response from search-request
def response(resp):
    """Parse a Twitter search page into result dicts.

    Tweets lacking a link, text or avatar are skipped.  Every other tweet
    yields ``url``, ``title``, ``content`` and ``img_src`` (the avatar url
    with ``_bigger`` replaced by ``_normal``); ``publishedDate`` is added
    when a timestamp element is present.
    """
    results = []

    dom = html.fromstring(resp.text)

    for tweet in dom.xpath(results_xpath):
        try:
            link = tweet.xpath(link_xpath)[0]
            content = extract_text(tweet.xpath(content_xpath)[0])
            img_src = tweet.xpath(avatar_xpath)[0].replace('_bigger', '_normal')
        except Exception:
            continue

        entry = {'url': urljoin(base_url, link.attrib.get('href')),
                 'title': extract_text(tweet.xpath(title_xpath)),
                 'content': content,
                 'img_src': img_src}

        stamps = tweet.xpath(timestamp_xpath)
        if stamps:
            seconds = float(stamps[0].attrib.get('data-time'))
            entry['publishedDate'] = datetime.fromtimestamp(seconds, None)

        results.append(entry)

    return results
# get response from search-request
def response(resp):
    """Extract video results from the JSON blob embedded in a Vimeo page.

    The blob is located by searching ``resp.text`` for ``{"filtered"`` and
    is taken up to the next ``";\\n"`` (or to the end of the text when no
    such terminator follows).  Returns ``[]`` when the marker is absent.
    Each entry yields ``url``, ``title``, empty ``content``,
    ``publishedDate``, ``embedded``, ``thumbnail`` and the
    ``videos.html`` template.
    """
    results = []

    data_start_pos = resp.text.find('{"filtered"')
    if data_start_pos == -1:
        # page carries no result blob (layout change, error page, ...);
        # previously the -1 produced a garbage slice and loads() raised
        return results

    data_end_pos = resp.text.find(';\n', data_start_pos + 1)
    if data_end_pos == -1:
        # no terminator: use the rest of the text rather than silently
        # dropping its last character via a -1 slice bound
        data_end_pos = len(resp.text)

    data = loads(resp.text[data_start_pos:data_end_pos])

    # parse results
    for result in data['filtered']['data']:
        # the payload of an entry is stored under the key named by its type
        result = result[result['type']]
        videoid = result['uri'].split('/')[-1]

        # append result
        results.append({'url': base_url + videoid,
                        'title': result['name'],
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': parser.parse(result['created_time']),
                        'embedded': embedded_url.format(videoid=videoid),
                        # the last entry of the sizes list is used as thumbnail
                        'thumbnail': result['pictures']['sizes'][-1]['link']})

    # return results
    return results
scraping) + @results JSON, HTML + @stable no (html can change) + @parse url, infobox +""" + +from searx import logger +from searx.poolrequests import get +from searx.engines.xpath import extract_text +from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode + +from json import loads +from lxml.html import fromstring + +logger = logger.getChild('wikidata') +result_count = 1 + +# urls +wikidata_host = 'https://www.wikidata.org' +url_search = wikidata_host \ + + '/wiki/Special:ItemDisambiguation?{query}' + +wikidata_api = wikidata_host + '/w/api.php' +url_detail = wikidata_api\ + + '?action=parse&format=json&{query}'\ + + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\ + + '&disableeditsection=1&disabletidy=1&preview=1§ionpreview=1&disabletoc=1&utf8=1&formatversion=2' + +url_map = 'https://www.openstreetmap.org/'\ + + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' +url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' + +# xpaths +wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' +title_xpath = '//*[contains(@class,"wikibase-title-label")]' +description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' +property_xpath = '//div[@id="{propertyid}"]' +label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' +url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' +wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ + + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' +property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' +preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' +value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ + + '/*/div[contains(@class,"wikibase-snakview-value")]' +language_fallback_xpath = 
def getDetail(jsonresponse, wikidata_id, language, locale):
    """Turn a Wikidata ``action=parse`` reply into searx results.

    Produces either an infobox (title, description, image, attribute rows and
    a list of related URLs) or, when nothing but the two basic links is
    available, a plain link result.

    :param jsonresponse: decoded JSON reply; ``parse.displaytitle`` and
        ``parse.text`` are read from it
    :param wikidata_id: entity id used to build the link back to Wikidata
    :param language: language code used for wiki site ids and link labels
    :param locale: not used by this function; kept so existing callers work
    :returns: list of result dicts, empty when title or body is missing
    """
    results = []
    urls = []
    attributes = []

    parsed = jsonresponse.get('parse', {})
    title = parsed.get('displaytitle', {})
    result = parsed.get('text', {})

    if not title or not result:
        return results

    title = fromstring(title)
    for elem in title.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(title.xpath(title_xpath))

    result = fromstring(result)
    for elem in result.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)

    description = extract_text(result.xpath(description_xpath))

    # URLS

    # official website (also emitted as a regular result)
    add_url(urls, result, 'P856', results=results)

    # wikipedia
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        urls.append({'title': 'Wikipedia (' + language + ')',
                     'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            urls.append({'title': 'Wikipedia (en)',
                         'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage when no wikipedia link was found

    # more wikis
    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')

    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    for property_id, url_prefix in (('P434', 'http://musicbrainz.org/artist/'),
                                    ('P435', 'http://musicbrainz.org/work/'),
                                    ('P436', 'http://musicbrainz.org/release-group/'),
                                    ('P966', 'http://musicbrainz.org/label/')):
        add_url(urls, result, property_id, 'MusicBrainz', url_prefix)

    # IMDb
    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, 'P1324')
    # blog
    add_url(urls, result, 'P1581')
    # social media links
    for property_id, label, url_prefix in (('P2397', 'YouTube', 'https://www.youtube.com/channel/'),
                                           ('P1651', 'YouTube', 'https://www.youtube.com/watch?v='),
                                           ('P2002', 'Twitter', 'https://twitter.com/'),
                                           ('P2013', 'Facebook', 'https://facebook.com/'),
                                           ('P2003', 'Instagram', 'https://instagram.com/')):
        add_url(urls, result, property_id, label, url_prefix)

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/'
                 + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)
    # Each entry is (property id, keyword options for add_attribute); rows
    # are emitted in exactly this order.
    dated = {'date': True}
    trimmed = {'trim': True}
    plain = {}
    attribute_specs = (
        # DATES
        ('P571', dated),     # inception date
        ('P576', dated),     # dissolution date
        ('P580', dated),     # start date
        ('P582', dated),     # end date
        ('P569', dated),     # date of birth
        ('P570', dated),     # date of death
        ('P619', dated),     # date of spacecraft launch
        ('P620', dated),     # date of spacecraft landing

        ('P27', plain),      # nationality
        ('P495', plain),     # country of origin
        ('P17', plain),      # country
        # headquarters location; the former 'Q180' was an item id, which can
        # never match a property block
        ('P159', plain),

        # PLACES
        ('P36', trimmed),    # capital
        ('P35', trimmed),    # head of state
        ('P6', trimmed),     # head of government
        ('P122', plain),     # type of government
        ('P37', plain),      # official language
        ('P1082', trimmed),  # population
        ('P2046', plain),    # area
        ('P38', trimmed),    # currency
        ('P2048', plain),    # height (building)

        # MEDIA
        ('P400', plain),     # platform (videogames)
        ('P50', plain),      # author
        ('P170', plain),     # creator
        ('P57', plain),      # director
        ('P175', plain),     # performer
        ('P178', plain),     # developer
        ('P162', plain),     # producer
        ('P176', plain),     # manufacturer
        ('P58', plain),      # screenwriter
        ('P272', plain),     # production company
        ('P264', plain),     # record label
        ('P123', plain),     # publisher
        ('P449', plain),     # original network
        ('P750', plain),     # distributor
        ('P86', plain),      # composer
        ('P577', dated),     # publication date
        ('P136', plain),     # genre
        ('P364', plain),     # original language
        # isbn; the former 'Q33057' was the item describing ISBN, not a property
        ('P212', plain),     # ISBN-13
        ('P957', plain),     # ISBN-10
        ('P275', plain),     # software license
        ('P277', plain),     # programming language
        ('P348', trimmed),   # version
        ('P840', plain),     # narrative location

        # LANGUAGES
        ('P1098', plain),    # number of speakers
        ('P282', plain),     # writing system
        ('P1018', plain),    # regulatory body
        ('P218', plain),     # language code

        # OTHER
        ('P169', trimmed),   # ceo
        ('P112', plain),     # founder
        ('P1454', plain),    # legal form (company/organization)
        ('P137', plain),     # operator
        ('P1029', plain),    # crew members
        ('P225', plain),     # taxon
        ('P274', plain),     # chemical formula
        ('P1346', plain),    # winner (sports/contests)
        ('P1120', plain),    # number of deaths
        ('P498', plain),     # currency code
    )
    for property_id, options in attribute_specs:
        add_attribute(attributes, result, property_id, **options)

    image = add_image(result)

    # Nothing beyond two links and no description: a plain link result is
    # more useful than an empty infobox.
    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
            'url': urls[0]['url'],
            'title': title,
            'content': description
        })
    else:
        results.append({
            'infobox': title,
            'id': wikipedia_link,
            'content': description,
            'img_src': image,
            'attributes': attributes,
            'urls': urls
        })

    return results
def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False):
    """Append one infobox row for ``property_id`` to ``attributes``.

    Nothing is appended when the property block is not present in ``result``.

    :param attributes: list that receives ``{'label': ..., 'value': ...}``
    :param result: parsed entity page (lxml element)
    :param property_id: id substituted into ``property_xpath``
    :param default_label: label to use instead of the one found in the page
    :param date: treat values as dates: calendar-name markers are removed and
        ``trim`` is forced on
    :param trim: keep only preferred-rank rows, or the first row with a value
        when no row is preferred
    """
    attribute = result.xpath(property_xpath.replace('{propertyid}', property_id))
    if not attribute:
        return
    attribute = attribute[0]

    if default_label:
        label = default_label
    else:
        label = extract_text(attribute.xpath(label_xpath))
        # slicing instead of label[0] so an empty label does not raise
        label = label[:1].upper() + label[1:]

    if date:
        trim = True
        # remove calendar name
        for calendar in attribute.xpath(calendar_name_xpath):
            calendar.getparent().remove(calendar)

    values = []
    first_value = None
    for row in attribute.xpath(property_row_xpath):
        preferred = bool(row.xpath(preferred_rank_xpath))
        # once a fallback exists, trimmed output only takes preferred rows
        if trim and first_value and not preferred:
            continue

        value = row.xpath(value_xpath)
        if not value:
            continue
        value = extract_text(value)

        if trim and not first_value:
            # remember the first value in case no preferred row is found
            first_value = value
            if not preferred:
                continue
            # a preferred first row is a real value, not just a fallback;
            # previously it was lost when a later row was preferred as well

        # to avoid duplicate values
        if value not in values:
            values.append(value)

    if trim and not values:
        attributes.append({'label': label,
                           'value': first_value})
    else:
        attributes.append({'label': label,
                           'value': ', '.join(values)})
def get_imdblink(result, url_prefix):
    """Build an IMDb URL from the identifier found in ``result``.

    The first two characters of the identifier select the IMDb section.

    :param result: property block (lxml element) queried with ``value_xpath``
    :param url_prefix: IMDb base URL the section and id are appended to
    :returns: the URL, or ``None`` when there is no identifier or its prefix
        is not one of the known ones
    """
    matches = result.xpath(value_xpath)
    if not matches:
        return None

    imdb_id = extract_text(matches)
    sections = {'tt': 'title/',
                'nm': 'name/',
                'ch': 'character/',
                'co': 'company/',
                'ev': 'event/'}
    section = sections.get(imdb_id[:2])
    if section is None:
        return None
    return url_prefix + section + imdb_id
def get_wikilink(result, wikiid):
    """Return the https link to the sitelink identified by ``wikiid``.

    :param result: parsed entity page (lxml element)
    :param wikiid: site id substituted into ``wikilink_xpath``
    :returns: the first matching href with its scheme forced to https, or
        ``None`` when the page has no such sitelink
    """
    url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid))
    if not url:
        return None
    url = url[0]
    if url.startswith('http://'):
        # rewrite only the scheme; str.replace would also touch any
        # 'http://' occurring later in the URL (e.g. in the page title)
        url = 'https://' + url[len('http://'):]
    elif url.startswith('//'):
        # protocol-relative link
        url = 'https:' + url
    return url
def extract_first_paragraph(content, title, image):
    """Return the first meaningful paragraph of an article extract.

    A paragraph qualifies when it has at least 200 characters, or when the
    title occurs near its start and either an image is available or the
    paragraph has at least 150 characters.  The search gives up after more
    than three rejected paragraphs.  The thresholds were obtained by fine
    tuning and are meant to filter out disambiguation pages and notes placed
    above the first paragraph.

    :param content: plain-text extract, paragraphs separated by newlines;
        may be ``None`` or empty
    :param title: article title
    :param image: thumbnail URL or any falsy value when there is none
    :returns: the selected paragraph, or ``None`` when none qualifies
    """
    # callers pass page.get('extract'), which can be missing
    if not content:
        return None

    needle = (title or '').lower()
    # the title has to start within this window to count as "near the start"
    window_end = len(needle) + 35

    failed_attempts = 0
    for paragraph in content.split('\n'):
        starts_with_title = paragraph.lower().find(needle, 0, window_end)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            return paragraph

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return None
def _fetch_supported_languages(resp):
    """Parse the "List of Wikipedias" page into a language table.

    Reads every table whose class contains ``sortable`` and takes, per data
    row, the english name (2nd cell), the native name (3rd cell), the
    language code (4th cell) and the article count (5th cell).  Languages
    with fewer than 100 articles are left out.  Rows that do not have this
    layout are skipped so one odd row does not discard the whole list.

    :param resp: HTTP response object exposing the page as ``resp.text``
    :returns: dict mapping code to ``{'name', 'english_name', 'articles'}``
    """
    supported_languages = {}
    dom = fromstring(resp.text)
    for table in dom.xpath('//table[contains(@class,"sortable")]'):
        # exclude header row
        for tr in table.xpath('.//tr')[1:]:
            td = tr.xpath('./td')
            try:
                code = td[3].xpath('./a')[0].text
                name = td[2].xpath('./a')[0].text
                english_name = td[1].xpath('./a')[0].text
                articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            except (IndexError, AttributeError, ValueError):
                # missing cell/link, empty text or non-numeric count
                continue
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
# Private-use-area code points found in Wolfram|Alpha plaintext and the
# characters they stand for.  Built once instead of on every call.
_PUA_CHARS = {u'\uf522': u'\u2192',  # right arrow
              u'\uf7b1': u'\u2115',  # set of natural numbers
              u'\uf7b4': u'\u211a',  # set of rational numbers
              u'\uf7b5': u'\u211d',  # set of real numbers
              u'\uf7bd': u'\u2124',  # set of integer numbers
              u'\uf74c': 'd',        # differential
              u'\uf74d': u'\u212f',  # euler's number
              u'\uf74e': 'i',        # imaginary number
              u'\uf7d9': '='}        # equals sign


def replace_pua_chars(text):
    """Replace private-use-area characters to make ``text`` legible.

    :param text: string that may contain the code points in ``_PUA_CHARS``
    :returns: ``text`` with each of them replaced by its readable equivalent
    """
    for pua_char, replacement in _PUA_CHARS.items():
        text = text.replace(pua_char, replacement)

    return text
def obtain_token():
    """Fetch a fresh Wolfram|Alpha proxy code and store it in ``token``.

    On success ``token['value']`` holds the code and
    ``token['last_updated']`` the start of the current hour (the token seems
    to be reset every hour).  On any failure the previous content of
    ``token`` is kept; the request is best effort and must not prevent the
    engine from loading.

    :returns: the module-level ``token`` dict
    """
    # read the clock once so both terms refer to the same instant
    now = time()
    update_time = now - (now % 3600)
    try:
        token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
        token['value'] = loads(token_response.text)['code']
        token['last_updated'] = update_time
    except Exception:
        # deliberately ignored (network error, unexpected reply); unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit through
        pass
    return token
def response(resp):
    """Extract photo results from the 1x.com search backend reply.

    The reply is not well-formed markup, so it is cut into ``<a ...>...</a>``
    fragments which are parsed one by one.  Only anchors that point to a
    ``/photo/`` page and contain an image produce a result; every other
    anchor is skipped.

    :param resp: HTTP response object exposing the reply as ``resp.text``
    :returns: list of result dicts using the ``images.html`` template
    """
    results = []

    # get links from result-text
    results_parts = re.split('(</a>|<a)', resp.text)

    cur_element = ''

    # iterate over link parts
    for result_part in results_parts:
        # processed start and end of link
        if result_part == '<a':
            cur_element = result_part
            continue
        elif result_part != '</a>':
            cur_element += result_part
            continue

        cur_element += result_part

        # fix xml-error
        cur_element = cur_element.replace('"></a>', '"/></a>')

        dom = html.fromstring(cur_element)
        links = dom.xpath('//a')
        if not links:
            continue
        link = links[0]

        url = urljoin(base_url, link.attrib.get('href'))

        # check if url is showing to a photo; done before looking at the
        # image so that other links cannot abort the parsing
        if '/photo/' not in url:
            continue

        images = link.xpath('.//img')
        if not images or 'src' not in images[0].attrib:
            continue

        title = link.attrib.get('title', '')
        thumbnail_src = urljoin(base_url, images[0].attrib['src'])
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': '',
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
def response(resp):
    """Convert the 500px search API reply into image results.

    :param resp: HTTP response object whose ``text`` is the JSON reply
    :returns: one ``images.html`` result per entry of ``photos``; the last
        ``image_url`` is used as full image, the first one as thumbnail
    """
    photos = loads(resp.text)['photos']

    return [{'url': urljoin(base_url, photo['url']),
             'title': photo['name'],
             # last index is the biggest resolution
             'img_src': photo['image_url'][-1],
             'content': photo['description'] or '',
             'thumbnail_src': photo['image_url'][0],
             'template': 'images.html'}
            for photo in photos]
def normalize_url(url):
    """Normalize an absolute result URL.

    * a URL without a path gets ``/`` as path (inserted before any query
      string or fragment, not appended after it);
    * Yahoo redirect links (``search.yahoo.com/r...`` containing ``/**``) are
      replaced by the percent-decoded target that follows the ``/**`` marker.

    :param url: absolute URL
    :returns: the normalized URL
    :raises Exception: when ``url`` has no network location
    """
    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise Exception('Cannot parse url')

    # add a / as path if there is none
    if not parsed_url.path:
        url = parsed_url._replace(path='/').geturl()

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            target = unquote(p[mark + 3:])
            # unquote returns text on Python 3; decode only real byte strings
            if isinstance(target, bytes):
                target = target.decode('utf-8')
            return target

    return url
searx.url_utils import urlencode + +from searx.utils import html_to_text + +# engine dependent config +categories = ['general', 'images'] # TODO , 'music', 'videos', 'files' +paging = True +language_support = True +number_of_results = 5 + +# search-url +base_url = 'http://localhost:8090' +search_url = '/yacysearch.json?{query}'\ + '&startRecord={offset}'\ + '&maximumRecords={limit}'\ + '&contentdom={search_type}'\ + '&resource=global' + +# yacy specific type-definitions +search_types = {'general': 'text', + 'images': 'image', + 'files': 'app', + 'music': 'audio', + 'videos': 'video'} + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * number_of_results + search_type = search_types.get(params.get('category'), '0') + + params['url'] = base_url +\ + search_url.format(query=urlencode({'query': query}), + offset=offset, + limit=number_of_results, + search_type=search_type) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&lr=lang_' + params['language'].split('-')[0] + + return params + + +# get response from search-request +def response(resp): + results = [] + + raw_search_results = loads(resp.text) + + # return empty array if there are no results + if not raw_search_results: + return [] + + search_results = raw_search_results.get('channels', []) + + if len(search_results) == 0: + return [] + + for result in search_results[0].get('items', []): + # parse image results + if result.get('image'): + # append result + results.append({'url': result['url'], + 'title': result['title'], + 'content': '', + 'img_src': result['image'], + 'template': 'images.html'}) + + # parse general results + else: + publishedDate = parser.parse(result['pubDate']) + + # append result + results.append({'url': result['link'], + 'title': result['title'], + 'content': html_to_text(result['description']), + 'publishedDate': publishedDate}) + + # TODO parse video, audio and file results + + # return results + return results 
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py new file mode 100644 index 0000000..5387aaf --- /dev/null +++ b/searx/engines/yahoo.py @@ -0,0 +1,153 @@ +""" + Yahoo (Web) + + @website https://search.yahoo.com/web + @provide-api yes (https://developer.yahoo.com/boss/search/), + $0.80/1000 queries + + @using-api no (because pricing) + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content, suggestion +""" + +from lxml import html +from searx.engines.xpath import extract_text, extract_url +from searx.url_utils import unquote, urlencode + +# engine dependent config +categories = ['general'] +paging = True +language_support = True +time_range_support = True + +# search-url +base_url = 'https://search.yahoo.com/' +search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' +search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' + +supported_languages_url = 'https://search.yahoo.com/web/advanced' + +# specific xpath variables +results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" +url_xpath = './/h3/a/@href' +title_xpath = './/h3/a' +content_xpath = './/div[@class="compText aAbs"]' +suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" + +time_range_dict = {'day': ['1d', 'd'], + 'week': ['1w', 'w'], + 'month': ['1m', 'm']} + + +# remove yahoo-specific tracking-url +def parse_url(url_string): + endings = ['/RS', '/RK'] + endpositions = [] + start = url_string.find('http', url_string.find('/RU=') + 1) + + for ending in endings: + endpos = url_string.rfind(ending) + if endpos > -1: + endpositions.append(endpos) + + if start == 0 or len(endpositions) == 0: + return url_string + else: + end = min(endpositions) + return unquote(url_string[start:end]) + + +def _get_url(query, offset, language, time_range): + if time_range in time_range_dict: + return base_url + search_url_with_time.format(offset=offset, + 
query=urlencode({'p': query}), + lang=language, + age=time_range_dict[time_range][0], + btf=time_range_dict[time_range][1]) + return base_url + search_url.format(offset=offset, + query=urlencode({'p': query}), + lang=language) + + +def _get_language(params): + if params['language'] == 'all': + return 'en' + elif params['language'][:2] == 'zh': + if params['language'] == 'zh' or params['language'] == 'zh-CH': + return 'szh' + else: + return 'tzh' + else: + return params['language'].split('-')[0] + + +# do search-request +def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + + offset = (params['pageno'] - 1) * 10 + 1 + language = _get_language(params) + + params['url'] = _get_url(query, offset, language, params['time_range']) + + # TODO required? + params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ + .format(lang=language) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + try: + results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0] + .split()[0].replace(',', '')) + results.append({'number_of_results': results_num}) + except: + pass + + # parse results + for result in dom.xpath(results_xpath): + try: + url = parse_url(extract_url(result.xpath(url_xpath), search_url)) + title = extract_text(result.xpath(title_xpath)[0]) + except: + continue + + content = extract_text(result.xpath(content_xpath)[0]) + + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # if no suggestion found, return results + suggestions = dom.xpath(suggestion_xpath) + if not suggestions: + return results + + # parse suggestion + for suggestion in suggestions: + # append suggestion + results.append({'suggestion': extract_text(suggestion)}) + + # return results + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + 
supported_languages = [] + dom = html.fromstring(resp.text) + options = dom.xpath('//div[@id="yschlang"]/span/label/input') + for option in options: + code = option.xpath('./@value')[0][5:].replace('_', '-') + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py new file mode 100644 index 0000000..ae54a4a --- /dev/null +++ b/searx/engines/yahoo_news.py @@ -0,0 +1,107 @@ +# Yahoo (News) +# +# @website https://news.yahoo.com +# @provide-api yes (https://developer.yahoo.com/boss/search/) +# $0.80/1000 queries +# +# @using-api no (because pricing) +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content, publishedDate + +import re +from datetime import datetime, timedelta +from lxml import html +from searx.engines.xpath import extract_text, extract_url +from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url +from dateutil import parser +from searx.url_utils import urlencode + +# engine dependent config +categories = ['news'] +paging = True +language_support = True + +# search-url +search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa + +# specific xpath variables +results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' +url_xpath = './/h3/a/@href' +title_xpath = './/h3/a' +content_xpath = './/div[@class="compText"]' +publishedDate_xpath = './/span[contains(@class,"tri")]' +suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + 1 + + if params['language'] == 'all': + language = 'en' + else: + language = params['language'].split('_')[0] + + params['url'] = search_url.format(offset=offset, + query=urlencode({'p': query}), + lang=language) + + # TODO required? 
+ params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ + .format(lang=language) + return params + + +def sanitize_url(url): + if ".yahoo.com/" in url: + return re.sub(u"\\;\\_ylt\\=.+$", "", url) + else: + return url + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(results_xpath): + urls = result.xpath(url_xpath) + if len(urls) != 1: + continue + url = sanitize_url(parse_url(extract_url(urls, search_url))) + title = extract_text(result.xpath(title_xpath)[0]) + content = extract_text(result.xpath(content_xpath)[0]) + + # parse publishedDate + publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) + + # still useful ? + if re.match("^[0-9]+ minute(s|) ago$", publishedDate): + publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) + elif re.match("^[0-9]+ days? ago$", publishedDate): + publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group())) + elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): + timeNumbers = re.findall(r'\d+', publishedDate) + publishedDate = datetime.now()\ + - timedelta(hours=int(timeNumbers[0]))\ + - timedelta(minutes=int(timeNumbers[1])) + else: + try: + publishedDate = parser.parse(publishedDate) + except: + publishedDate = datetime.now() + + if publishedDate.year == 1900: + publishedDate = publishedDate.replace(year=datetime.now().year) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'publishedDate': publishedDate}) + + # return results + return results diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py new file mode 100644 index 0000000..1c789f6 --- /dev/null +++ b/searx/engines/yandex.py @@ -0,0 +1,64 @@ +""" + Yandex (Web) + + @website https://yandex.ru/ + @provide-api ?
+ @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html +from searx import logger +from searx.url_utils import urlencode + +logger = logger.getChild('yandex engine') + +# engine dependent config +categories = ['general'] +paging = True +language_support = True # TODO + +default_tld = 'com' +language_map = {'ru': 'ru', + 'ua': 'ua', + 'be': 'by', + 'kk': 'kz', + 'tr': 'com.tr'} + +# search-url +base_url = 'https://yandex.{tld}/' +search_url = 'search/?{query}&p={page}' + +results_xpath = '//li[@class="serp-item"]' +url_xpath = './/h2/a/@href' +title_xpath = './/h2/a//text()' +content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()' + + +def request(query, params): + lang = params['language'].split('-')[0] + host = base_url.format(tld=language_map.get(lang) or default_tld) + params['url'] = host + search_url.format(page=params['pageno'] - 1, + query=urlencode({'text': query})) + return params + + +# get response from search-request +def response(resp): + dom = html.fromstring(resp.text) + results = [] + + for result in dom.xpath(results_xpath): + try: + res = {'url': result.xpath(url_xpath)[0], + 'title': ''.join(result.xpath(title_xpath)), + 'content': ''.join(result.xpath(content_xpath))} + except: + logger.exception('yandex parse crash') + continue + + results.append(res) + + return results diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py new file mode 100644 index 0000000..6de18aa --- /dev/null +++ b/searx/engines/youtube_api.py @@ -0,0 +1,83 @@ +# Youtube (Videos) +# +# @website https://www.youtube.com/ +# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content, publishedDate, thumbnail, embedded + +from json import loads +from dateutil import parser +from searx.url_utils import 
urlencode + +# engine dependent config +categories = ['videos', 'music'] +paging = False +language_support = True +api_key = None + +# search-url +base_url = 'https://www.googleapis.com/youtube/v3/search' +search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}' + +embedded_url = '<iframe width="540" height="304" ' +\ + 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\ + 'frameborder="0" allowfullscreen></iframe>' + +base_youtube_url = 'https://www.youtube.com/watch?v=' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), + api_key=api_key) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = loads(resp.text) + + # return empty array if there are no results + if 'items' not in search_results: + return [] + + # parse results + for result in search_results['items']: + videoid = result['id']['videoId'] + + title = result['snippet']['title'] + content = '' + thumbnail = '' + + pubdate = result['snippet']['publishedAt'] + publishedDate = parser.parse(pubdate) + + thumbnail = result['snippet']['thumbnails']['high']['url'] + + content = result['snippet']['description'] + + url = base_youtube_url + videoid + + embedded = embedded_url.format(videoid=videoid) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'publishedDate': publishedDate, + 'embedded': embedded, + 'thumbnail': thumbnail}) + + # return results + return results diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py new file mode 100644 index 0000000..9f01841 --- /dev/null +++ b/searx/engines/youtube_noapi.py @@ -0,0 +1,89 @@ +# Youtube (Videos) +# +# @website https://www.youtube.com/ +# @provide-api yes 
(https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list) +# +# @using-api no +# @results HTML +# @stable no +# @parse url, title, content, publishedDate, thumbnail, embedded + +from lxml import html +from searx.engines.xpath import extract_text +from searx.utils import list_get +from searx.url_utils import quote_plus + +# engine dependent config +categories = ['videos', 'music'] +paging = True +language_support = False +time_range_support = True + +# search-url +base_url = 'https://www.youtube.com/results' +search_url = base_url + '?search_query={query}&page={page}' +time_range_url = '&sp=EgII{time_range}%253D%253D' +time_range_dict = {'day': 'Ag', + 'week': 'Aw', + 'month': 'BA', + 'year': 'BQ'} + +embedded_url = '<iframe width="540" height="304" ' +\ + 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\ + 'frameborder="0" allowfullscreen></iframe>' + +base_youtube_url = 'https://www.youtube.com/watch?v=' + +# specific xpath variables +results_xpath = "//ol/li/div[contains(@class, 'yt-lockup yt-lockup-tile yt-lockup-video vve-check')]" +url_xpath = './/h3/a/@href' +title_xpath = './/div[@class="yt-lockup-content"]/h3/a' +content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]' + + +# returns extract_text on the first result selected by the xpath or None +def extract_text_from_dom(result, xpath): + r = result.xpath(xpath) + if len(r) > 0: + return extract_text(r[0]) + return None + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=quote_plus(query), + page=params['pageno']) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(results_xpath): + videoid = 
list_get(result.xpath('@data-context-item-id'), 0) + if videoid is not None: + url = base_youtube_url + videoid + thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg' + + title = extract_text_from_dom(result, title_xpath) or videoid + content = extract_text_from_dom(result, content_xpath) + + embedded = embedded_url.format(videoid=videoid) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'embedded': embedded, + 'thumbnail': thumbnail}) + + # return results + return results |