diff options
author | Johannes 'josch' Schauer <josch@debian.org> | 2020-02-01 01:09:45 +0100 |
---|---|---|
committer | Johannes 'josch' Schauer <josch@debian.org> | 2020-02-01 01:09:45 +0100 |
commit | 3e49246c2e44159486ea66fed3757cdb4e4d0c50 (patch) | |
tree | 73fecfac5dd7475f346d4bff59d78aec04a966a8 /utils |
Import Upstream version 0.15.0+dfsg1
Diffstat (limited to 'utils')
-rw-r--r-- | utils/fabfile.py | 117 | ||||
-rw-r--r-- | utils/fetch_currencies.py | 163 | ||||
-rwxr-xr-x | utils/fetch_firefox_version.py | 73 | ||||
-rw-r--r-- | utils/fetch_languages.py | 191 | ||||
-rw-r--r-- | utils/google_search.py | 35 | ||||
-rwxr-xr-x | utils/standalone_searx.py | 104 | ||||
-rwxr-xr-x | utils/update-translations.sh | 15 |
7 files changed, 698 insertions, 0 deletions
diff --git a/utils/fabfile.py b/utils/fabfile.py new file mode 100644 index 0000000..559e2ab --- /dev/null +++ b/utils/fabfile.py @@ -0,0 +1,117 @@ +from fabric.api import cd, run, sudo, put +from cStringIO import StringIO + + +base_dir = '/usr/local' +hostname = 'searx.me' +searx_dir = base_dir + '/searx' +searx_ve_dir = searx_dir + '/searx-ve' +current_user = run('whoami').stdout.strip() + +uwsgi_file = ''' +[uwsgi] +# Who will run the code +uid = {user} +gid = {user} + +# Number of workers +workers = 8 + +# The right granted on the created socket +chmod-socket = 666 + +# Plugin to use and interpretor config +single-interpreter = true +master = true +plugin = python + +# Module to import +module = searx.webapp + +# Virtualenv and python path +virtualenv = {searx_ve_dir} +pythonpath = {searx_dir} +chdir = {searx_dir}/searx +'''.format(user=current_user, + searx_dir=searx_dir, + searx_ve_dir=searx_ve_dir) + +nginx_config = ''' +server {{ + listen 80; + server_name {hostname}; + server_name www.{hostname}; + root /usr/local/searx; + + location / {{ + include uwsgi_params; + uwsgi_pass unix:/run/uwsgi/app/searx/socket; + }} +}} +'''.format(hostname=hostname) + + +def stop(): + sudo('/etc/init.d/uwsgi stop') + + +def start(): + sudo('/etc/init.d/uwsgi start') + + +def restart(): + sudo('/etc/init.d/uwsgi restart') + + +def init(): + if not run('test -d ' + searx_dir, warn_only=True).failed: + return + + sudo('apt-get update') + + sudo('apt-get install git' + ' build-essential' + ' libxslt-dev' + ' python-dev' + ' python-virtualenv' + ' python-pybabel' + ' zlib1g-dev' + ' uwsgi' + ' uwsgi-plugin-python' + ' nginx') + + sudo('mkdir -p ' + base_dir) + + put(StringIO(nginx_config), '/etc/nginx/sites-enabled/searx', use_sudo=True) + sudo('/etc/init.d/nginx restart') + + with cd(base_dir): + sudo('git clone https://github.com/asciimoo/searx') + + sudo('chown -R {user}:{user} {searx_dir}'.format(user=current_user, searx_dir=searx_dir)) + put(StringIO(uwsgi_file), searx_dir + 
'/uwsgi.ini') + sudo('ln -s {0}/uwsgi.ini /etc/uwsgi/apps-enabled/searx.ini'.format(searx_dir)) + + run('virtualenv {0}'.format(searx_ve_dir)) + + with cd(searx_dir): + run('source {0}/bin/activate && pip install -r requirements.txt'.format(searx_ve_dir)) + + start() + + +def deploy(): + init() + + with cd(searx_dir): + run("git stash", warn_only=True) + run("git pull origin master") + run("git stash pop", warn_only=True) + + restart() + + +def clean(): + sudo('rm -rf {searx_dir}'.format(searx_dir=searx_dir), warn_only=True) + sudo('rm /etc/uwsgi/apps-enabled/searx.ini', warn_only=True) + sudo('rm /etc/nginx/sites-enabled/searx', warn_only=True) diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py new file mode 100644 index 0000000..5605fb3 --- /dev/null +++ b/utils/fetch_currencies.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import json +import re +import unicodedata +import string +from urllib import urlencode +from requests import get + +languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'jp'} + +url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(languages) +url_wmflabs_template = 'http://wdq.wmflabs.org/api?q=' +url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}' + +wmflabs_queries = [ + 'CLAIM[31:8142]', # all devise +] + +db = { + 'iso4217': { + }, + 'names': { + } +} + + +def remove_accents(data): + return unicodedata.normalize('NFKD', data).lower() + + +def normalize_name(name): + return re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' ')) + + +def add_currency_name(name, iso4217): + global db + + db_names = db['names'] + + if not isinstance(iso4217, basestring): + print("problem", name, iso4217) + return + + name = normalize_name(name) + + if name == '': + print("name empty", iso4217) 
+ return + + iso4217_set = db_names.get(name, None) + if iso4217_set is not None and iso4217 not in iso4217_set: + db_names[name].append(iso4217) + else: + db_names[name] = [iso4217] + + +def add_currency_label(label, iso4217, language): + global db + + db['iso4217'][iso4217] = db['iso4217'].get(iso4217, {}) + db['iso4217'][iso4217][language] = label + + +def get_property_value(data, name): + prop = data.get('claims', {}).get(name, {}) + if len(prop) == 0: + return None + + value = prop[0].get('mainsnak', {}).get('datavalue', {}).get('value', '') + if value == '': + return None + + return value + + +def parse_currency(data): + iso4217 = get_property_value(data, 'P498') + + if iso4217 is not None: + unit = get_property_value(data, 'P558') + if unit is not None: + add_currency_name(unit, iso4217) + + labels = data.get('labels', {}) + for language in languages: + name = labels.get(language, {}).get('value', None) + if name is not None: + add_currency_name(name, iso4217) + add_currency_label(name, iso4217, language) + + aliases = data.get('aliases', {}) + for language in aliases: + for i in range(0, len(aliases[language])): + alias = aliases[language][i].get('value', None) + add_currency_name(alias, iso4217) + + +def fetch_data(wikidata_ids): + url = url_template.format(query=urlencode({'ids': '|'.join(wikidata_ids)})) + htmlresponse = get(url) + jsonresponse = json.loads(htmlresponse.content) + entities = jsonresponse.get('entities', {}) + + for pname in entities: + pvalue = entities.get(pname) + parse_currency(pvalue) + + +def add_q(i): + return "Q" + str(i) + + +def fetch_data_batch(wikidata_ids): + while len(wikidata_ids) > 0: + if len(wikidata_ids) > 50: + fetch_data(wikidata_ids[0:49]) + wikidata_ids = wikidata_ids[50:] + else: + fetch_data(wikidata_ids) + wikidata_ids = [] + + +def wdq_query(query): + url = url_wmflabs_template + query + htmlresponse = get(url) + jsonresponse = json.loads(htmlresponse.content) + qlist = map(add_q, jsonresponse.get('items', {})) 
+ error = jsonresponse.get('status', {}).get('error', None) + if error is not None and error != 'OK': + print("error for query '" + query + "' :" + error) + + fetch_data_batch(qlist) + + +def wd_query(query, offset=0): + qlist = [] + + url = url_wikidata_search_template.format(query=urlencode({'srsearch': query, 'srlimit': 50, 'sroffset': offset})) + htmlresponse = get(url) + jsonresponse = json.loads(htmlresponse.content) + for r in jsonresponse.get('query', {}).get('search', {}): + qlist.append(r.get('title', '')) + fetch_data_batch(qlist) + + +# fetch # +for q in wmflabs_queries: + wdq_query(q) + +# static +add_currency_name(u"euro", 'EUR') +add_currency_name(u"euros", 'EUR') +add_currency_name(u"dollar", 'USD') +add_currency_name(u"dollars", 'USD') +add_currency_name(u"peso", 'MXN') +add_currency_name(u"pesos", 'MXN') + +# write +f = open("currencies.json", "wb") +json.dump(db, f, indent=4, encoding="utf-8") +f.close() diff --git a/utils/fetch_firefox_version.py b/utils/fetch_firefox_version.py new file mode 100755 index 0000000..ed17958 --- /dev/null +++ b/utils/fetch_firefox_version.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# set path +from sys import path +from os.path import realpath, dirname, join +path.append(realpath(dirname(realpath(__file__)) + '/../')) + +# +import json +import requests +import re +from distutils.version import LooseVersion, StrictVersion +from lxml import html +from searx.url_utils import urlparse, urljoin +from searx import searx_dir + +URL = 'https://ftp.mozilla.org/pub/firefox/releases/' +RELEASE_PATH = '/pub/firefox/releases/' + +NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$') +# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$') +# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$') + +# +useragents = { + "versions": (), + "os": ('Windows NT 10; WOW64', + 'X11; Linux x86_64'), + "ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}" +} + + +def fetch_firefox_versions(): + resp = requests.get(URL, 
timeout=2.0) + if resp.status_code != 200: + raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) + else: + dom = html.fromstring(resp.text) + versions = [] + + for link in dom.xpath('//a/@href'): + url = urlparse(urljoin(URL, link)) + path = url.path + if path.startswith(RELEASE_PATH): + version = path[len(RELEASE_PATH):-1] + if NORMAL_REGEX.match(version): + versions.append(LooseVersion(version)) + + list.sort(versions, reverse=True) + return versions + + +def fetch_firefox_last_versions(): + versions = fetch_firefox_versions() + + result = [] + major_last = versions[0].version[0] + major_list = (major_last, major_last - 1) + for version in versions: + major_current = version.version[0] + if major_current in major_list: + result.append(version.vstring) + + return result + + +def get_useragents_filename(): + return join(join(searx_dir, "data"), "useragents.json") + + +useragents["versions"] = fetch_firefox_last_versions() +with open(get_useragents_filename(), "w") as f: + json.dump(useragents, f, indent=4, ensure_ascii=False) diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py new file mode 100644 index 0000000..a6af073 --- /dev/null +++ b/utils/fetch_languages.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- + +# This script generates languages.py from intersecting each engine's supported languages. +# +# Output files (engines_languages.json and languages.py) +# are written in current directory to avoid overwriting in case something goes wrong. + +from json import dump +import io +from sys import path +from babel import Locale, UnknownLocaleError +from babel.languages import get_global + +path.append('../searx') # noqa +from searx import settings +from searx.engines import initialize_engines, engines + +# Output files. +engines_languages_file = 'engines_languages.json' +languages_file = 'languages.py' + + +# Fetchs supported languages for each engine and writes json file with those. 
def fetch_supported_languages():
    """Query every engine for its supported languages and dump them to JSON.

    Each engine in ``engines`` that has a ``fetch_supported_languages``
    attribute is called.  A list result (including list subclasses) is stored
    sorted so the generated file is stable between runs; any other result is
    stored as returned.  An engine whose fetch (or sort) raises is reported on
    stdout and skipped, so one broken engine does not abort the whole run.

    The collected mapping is written to ``engines_languages_file`` as UTF-8
    JSON and also returned.
    """
    engines_languages = {}
    for engine_name in engines:
        engine = engines[engine_name]
        if not hasattr(engine, 'fetch_supported_languages'):
            continue
        try:
            supported = engine.fetch_supported_languages()
            # store first: if sorting fails below, the unsorted result is kept
            engines_languages[engine_name] = supported
            if isinstance(supported, list):
                engines_languages[engine_name] = sorted(supported)
        except Exception as e:
            # deliberate best effort: report the failure and go on with the next engine
            print(e)

    # write json file
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    return engines_languages


# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
    """Return ``Locale.parse(lang_code, sep='-')``, or ``None`` when parsing fails.

    ``UnknownLocaleError`` and ``ValueError`` are treated as "no locale";
    any other exception propagates.
    """
    try:
        return Locale.parse(lang_code, sep='-')
    except (UnknownLocaleError, ValueError):
        return None


# Append engine_name to list of engines that support locale.
def add_engine_counter(lang_code, engine_name, languages):
    """Record *engine_name* under ``languages[lang_code]['counter']``.

    Does nothing when *lang_code* is not a key of *languages*.  The
    ``'counter'`` list is created on first use and never holds the same
    engine name twice.  *languages* is modified in place.
    """
    if lang_code not in languages:
        return

    counter = languages[lang_code].setdefault('counter', [])
    if engine_name not in counter:
        counter.append(engine_name)


# Join all language lists.
# TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(engines_languages):
    """Merge the per-engine language lists into one dictionary keyed by code.

    *engines_languages* maps an engine name to an iterable of language codes.
    For each code:

    * an engine-specific alias (a value of the engine's optional
      ``language_aliases`` mapping) is translated back to the key it stands for;
    * when ``get_locale`` yields a locale with a territory, the code is
      rewritten as ``<language>-<territory>``;
    * a new entry is created with ``name``, ``english_name`` and ``country``
      taken from the locale (plus a country-less entry for the bare language),
      or an empty dict when no locale could be parsed;
    * the engine is counted for both the full code and its language part.

    Returns the merged dictionary.
    """
    language_list = {}
    for engine_name in engines_languages:
        # Build the alias -> code map once per engine instead of rescanning the
        # aliases for every language code.  setdefault keeps the first code that
        # maps to an alias, matching a first-match scan of the items.
        reverse_aliases = {}
        for code, alias in getattr(engines[engine_name], 'language_aliases', {}).items():
            reverse_aliases.setdefault(alias, code)

        for lang_code in engines_languages[engine_name]:

            # apply custom fixes if necessary
            lang_code = reverse_aliases.get(lang_code, lang_code)

            locale = get_locale(lang_code)

            # ensure that lang_code uses standard language and country codes
            if locale and locale.territory:
                lang_code = locale.language + '-' + locale.territory

            # add locale if it's not in list
            if lang_code not in language_list:
                if locale:
                    language_name = locale.get_language_name().title()
                    english_name = locale.english_name
                    language_list[lang_code] = {'name': language_name,
                                                'english_name': english_name,
                                                'country': locale.get_territory_name() or ''}

                    # also add language without country
                    if locale.language not in language_list:
                        language_list[locale.language] = {'name': language_name,
                                                          'english_name': english_name}
                else:
                    language_list[lang_code] = {}

            # count engine for both language_country combination and language alone
            add_engine_counter(lang_code, engine_name, language_list)
            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)

    return language_list


# Filter language list so it only includes the most supported languages and countries.
+def filter_language_list(all_languages): + min_supported_engines = 10 + main_engines = [engine_name for engine_name in engines.keys() + if 'general' in engines[engine_name].categories and + engines[engine_name].supported_languages and + not engines[engine_name].disabled] + + # filter list to include only languages supported by most engines or all default general engines + filtered_languages = {code: lang for code, lang + in all_languages.items() + if (len(lang.get('counter', [])) >= min_supported_engines or + all(main_engine in lang.get('counter', []) + for main_engine in main_engines))} + + return filtered_languages + + +# Add country codes to languages without one and filter out language codes. +def assign_country_codes(filtered_languages, all_languages): + sorted_languages = sorted(all_languages, + key=lambda lang: len(all_languages[lang].get('counter', [])), + reverse=True) + previous_lang = None + previous_code = None + countries = 0 + for current_code in sorted(filtered_languages): + current_lang = current_code.split('-')[0] + + # count country codes per language + if current_lang == previous_lang: + countries += 1 + + else: + if previous_lang is not None: + # if language has no single country code + if countries == 0: + # try to get country code with most supported engines + for l in sorted_languages: + l_parts = l.split('-') + if len(l_parts) == 2 and l_parts[0] == previous_lang: + filtered_languages[l] = all_languages[l] + filtered_languages[l]['country'] = '' + countries = 1 + break + + if countries == 0: + # get most likely country code from babel + subtags = get_global('likely_subtags').get(previous_lang) + if subtags: + subtag_parts = subtags.split('_') + new_code = subtag_parts[0] + '-' + subtag_parts[-1] + filtered_languages[new_code] = all_languages[previous_lang] + countries = 1 + + if countries == 1: + # remove countryless version of language if there's only one country + del filtered_languages[previous_lang] + if previous_code in 
filtered_languages: + filtered_languages[previous_code]['country'] = '' + + countries = 0 + previous_lang = current_lang + + previous_code = current_code + + +# Write languages.py. +def write_languages_file(languages): + new_file = open(languages_file, 'wb') + file_content = '# -*- coding: utf-8 -*-\n'\ + + '# list of language codes\n'\ + + '# this file is generated automatically by utils/update_search_languages.py\n'\ + + '\nlanguage_codes = (' + for code in sorted(languages): + file_content += '\n (u"' + code + '"'\ + + ', u"' + languages[code]['name'].split(' (')[0] + '"'\ + + ', u"' + languages[code].get('country', '') + '"'\ + + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),' + # remove last comma + file_content = file_content[:-1] + file_content += '\n)\n' + new_file.write(file_content.encode('utf8')) + new_file.close() + + +if __name__ == "__main__": + initialize_engines(settings['engines']) + engines_languages = fetch_supported_languages() + all_languages = join_language_lists(engines_languages) + filtered_languages = filter_language_list(all_languages) + assign_country_codes(filtered_languages, all_languages) + write_languages_file(filtered_languages) diff --git a/utils/google_search.py b/utils/google_search.py new file mode 100644 index 0000000..cad32ee --- /dev/null +++ b/utils/google_search.py @@ -0,0 +1,35 @@ +from sys import argv, exit + +if not len(argv) > 1: + print('search query required') + exit(1) + +import requests +from json import dumps +from searx.engines import google +from searx.search import default_request_params + +request_params = default_request_params() +# Possible params +# request_params['headers']['User-Agent'] = '' +# request_params['category'] = '' +request_params['pageno'] = 1 +request_params['language'] = 'en_us' +request_params['time_range'] = '' + +params = google.request(argv[1], request_params) + +request_args = dict( + headers=request_params['headers'], + cookies=request_params['cookies'], +) + +if 
request_params['method'] == 'GET': + req = requests.get +else: + req = requests.post + request_args['data'] = request_params['data'] + +resp = req(request_params['url'], **request_args) +resp.search_params = request_params +print(dumps(google.response(resp))) diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py new file mode 100755 index 0000000..2231636 --- /dev/null +++ b/utils/standalone_searx.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. 
+ +(C) 2016- by Alexandre Flament, <alex@al-f.net> +''' + +# set path +from sys import path +from os.path import realpath, dirname +path.append(realpath(dirname(realpath(__file__)) + '/../')) + +# initialization +from json import dumps +from searx import settings +import sys +import codecs +import searx.query +import searx.search +import searx.engines +import searx.preferences +import argparse + +searx.engines.initialize_engines(settings['engines']) + +# command line parsing +parser = argparse.ArgumentParser(description='Standalone searx.') +parser.add_argument('query', type=str, + help='Text query') +parser.add_argument('--category', type=str, nargs='?', + choices=searx.engines.categories.keys(), + default='general', + help='Search category') +parser.add_argument('--lang', type=str, nargs='?',default='all', + help='Search language') +parser.add_argument('--pageno', type=int, nargs='?', default=1, + help='Page number starting from 1') +parser.add_argument('--safesearch', type=str, nargs='?', choices=['0', '1', '2'], default='0', + help='Safe content filter from none to strict') +parser.add_argument('--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'], + help='Filter by time range') +args = parser.parse_args() + +# search results for the query +form = { + "q":args.query, + "categories":args.category.decode('utf-8'), + "pageno":str(args.pageno), + "language":args.lang, + "time_range":args.timerange +} +preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, []) +preferences.key_value_settings['safesearch'].parse(args.safesearch) + +search_query = searx.search.get_search_query_from_webapp(preferences, form) +search = searx.search.Search(search_query) +result_container = search.search() + +# output +from datetime import datetime + +def no_parsed_url(results): + for result in results: + del result['parsed_url'] + return results + +def json_serial(obj): + """JSON serializer for objects not 
serializable by default json code""" + if isinstance(obj, datetime): + serial = obj.isoformat() + return serial + raise TypeError ("Type not serializable") + +result_container_json = { + "search": { + "q": search_query.query, + "pageno": search_query.pageno, + "lang": search_query.lang, + "safesearch": search_query.safesearch, + "timerange": search_query.time_range, + "engines": search_query.engines + }, + "results": no_parsed_url(result_container.get_ordered_results()), + "infoboxes": result_container.infoboxes, + "suggestions": list(result_container.suggestions), + "answers": list(result_container.answers), + "paging": result_container.paging, + "results_number": result_container.results_number() +} +sys.stdout = codecs.getwriter("UTF-8")(sys.stdout) +sys.stdout.write(dumps(result_container_json, sort_keys=True, indent=4, ensure_ascii=False, encoding="utf-8", default=json_serial)) + diff --git a/utils/update-translations.sh b/utils/update-translations.sh new file mode 100755 index 0000000..240387a --- /dev/null +++ b/utils/update-translations.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +# script to easily update translation language files + +# add new language: +# pybabel init -i messages.pot -d searx/translations -l en + +SEARX_DIR='searx' + +pybabel extract -F babel.cfg -o messages.pot "$SEARX_DIR" +for f in `ls "$SEARX_DIR"'/translations/'`; do + pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$f" +done + +echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/' |