author    Johannes 'josch' Schauer <josch@debian.org>  2020-02-01 01:09:45 +0100
committer Johannes 'josch' Schauer <josch@debian.org>  2020-02-01 01:09:45 +0100
commit    3e49246c2e44159486ea66fed3757cdb4e4d0c50 (patch)
tree      73fecfac5dd7475f346d4bff59d78aec04a966a8 /utils

Import Upstream version 0.15.0+dfsg1

Diffstat (limited to 'utils')
-rw-r--r--  utils/fabfile.py                117
-rw-r--r--  utils/fetch_currencies.py       163
-rwxr-xr-x  utils/fetch_firefox_version.py   73
-rw-r--r--  utils/fetch_languages.py        191
-rw-r--r--  utils/google_search.py           35
-rwxr-xr-x  utils/standalone_searx.py       104
-rwxr-xr-x  utils/update-translations.sh     15
7 files changed, 698 insertions, 0 deletions
diff --git a/utils/fabfile.py b/utils/fabfile.py
new file mode 100644
index 0000000..559e2ab
--- /dev/null
+++ b/utils/fabfile.py
@@ -0,0 +1,117 @@
+from fabric.api import cd, run, sudo, put
+from cStringIO import StringIO
+
+
+base_dir = '/usr/local'
+hostname = 'searx.me'
+searx_dir = base_dir + '/searx'
+searx_ve_dir = searx_dir + '/searx-ve'
+current_user = run('whoami').stdout.strip()
+
+uwsgi_file = '''
+[uwsgi]
+# Who will run the code
+uid = {user}
+gid = {user}
+
+# Number of workers
+workers = 8
+
+# The right granted on the created socket
+chmod-socket = 666
+
+# Plugin to use and interpretor config
+single-interpreter = true
+master = true
+plugin = python
+
+# Module to import
+module = searx.webapp
+
+# Virtualenv and python path
+virtualenv = {searx_ve_dir}
+pythonpath = {searx_dir}
+chdir = {searx_dir}/searx
+'''.format(user=current_user,
+ searx_dir=searx_dir,
+ searx_ve_dir=searx_ve_dir)
+
+nginx_config = '''
+server {{
+ listen 80;
+ server_name {hostname};
+ server_name www.{hostname};
+ root /usr/local/searx;
+
+ location / {{
+ include uwsgi_params;
+ uwsgi_pass unix:/run/uwsgi/app/searx/socket;
+ }}
+}}
+'''.format(hostname=hostname)
+
+
+def stop():
+ sudo('/etc/init.d/uwsgi stop')
+
+
+def start():
+ sudo('/etc/init.d/uwsgi start')
+
+
+def restart():
+ sudo('/etc/init.d/uwsgi restart')
+
+
+def init():
+ if not run('test -d ' + searx_dir, warn_only=True).failed:
+ return
+
+ sudo('apt-get update')
+
+ sudo('apt-get install git'
+ ' build-essential'
+ ' libxslt-dev'
+ ' python-dev'
+ ' python-virtualenv'
+ ' python-pybabel'
+ ' zlib1g-dev'
+ ' uwsgi'
+ ' uwsgi-plugin-python'
+ ' nginx')
+
+ sudo('mkdir -p ' + base_dir)
+
+ put(StringIO(nginx_config), '/etc/nginx/sites-enabled/searx', use_sudo=True)
+ sudo('/etc/init.d/nginx restart')
+
+ with cd(base_dir):
+ sudo('git clone https://github.com/asciimoo/searx')
+
+ sudo('chown -R {user}:{user} {searx_dir}'.format(user=current_user, searx_dir=searx_dir))
+ put(StringIO(uwsgi_file), searx_dir + '/uwsgi.ini')
+ sudo('ln -s {0}/uwsgi.ini /etc/uwsgi/apps-enabled/searx.ini'.format(searx_dir))
+
+ run('virtualenv {0}'.format(searx_ve_dir))
+
+ with cd(searx_dir):
+ run('source {0}/bin/activate && pip install -r requirements.txt'.format(searx_ve_dir))
+
+ start()
+
+
+def deploy():
+ init()
+
+ with cd(searx_dir):
+ run("git stash", warn_only=True)
+ run("git pull origin master")
+ run("git stash pop", warn_only=True)
+
+ restart()
+
+
+def clean():
+ sudo('rm -rf {searx_dir}'.format(searx_dir=searx_dir), warn_only=True)
+ sudo('rm /etc/uwsgi/apps-enabled/searx.ini', warn_only=True)
+ sudo('rm /etc/nginx/sites-enabled/searx', warn_only=True)
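
The tasks above are ordinary Fabric 1.x tasks. A usage sketch, assuming Fabric 1.x is installed locally and the target host (the hostname is taken from the fabfile) accepts SSH logins:

    # first-time setup, later updates, and teardown
    fab -f utils/fabfile.py -H searx.me init
    fab -f utils/fabfile.py -H searx.me deploy
    fab -f utils/fabfile.py -H searx.me clean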
diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py
new file mode 100644
index 0000000..5605fb3
--- /dev/null
+++ b/utils/fetch_currencies.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+
+import json
+import re
+import unicodedata
+import string
+from urllib import urlencode
+from requests import get
+
+# ISO 639-1 language codes ('ja', not 'jp', is the code for Japanese)
+languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'ja'}
+
+url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(languages)
+url_wmflabs_template = 'http://wdq.wmflabs.org/api?q='
+url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}'
+
+wmflabs_queries = [
+ 'CLAIM[31:8142]', # all devise
+]
+
+db = {
+ 'iso4217': {
+ },
+ 'names': {
+ }
+}
+
+
+def remove_accents(data):
+ return unicodedata.normalize('NFKD', data).lower()
+
+
+def normalize_name(name):
+ return re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' '))
+
+
+def add_currency_name(name, iso4217):
+ global db
+
+ db_names = db['names']
+
+ if not isinstance(iso4217, basestring):
+ print("problem", name, iso4217)
+ return
+
+ name = normalize_name(name)
+
+ if name == '':
+ print("name empty", iso4217)
+ return
+
+    iso4217_set = db_names.get(name, None)
+    if iso4217_set is None:
+        db_names[name] = [iso4217]
+    elif iso4217 not in iso4217_set:
+        # append instead of resetting the list, so earlier codes are kept
+        db_names[name].append(iso4217)
+
+
+def add_currency_label(label, iso4217, language):
+ global db
+
+ db['iso4217'][iso4217] = db['iso4217'].get(iso4217, {})
+ db['iso4217'][iso4217][language] = label
+
+
+def get_property_value(data, name):
+ prop = data.get('claims', {}).get(name, {})
+ if len(prop) == 0:
+ return None
+
+ value = prop[0].get('mainsnak', {}).get('datavalue', {}).get('value', '')
+ if value == '':
+ return None
+
+ return value
+
+
+def parse_currency(data):
+ iso4217 = get_property_value(data, 'P498')
+
+ if iso4217 is not None:
+ unit = get_property_value(data, 'P558')
+ if unit is not None:
+ add_currency_name(unit, iso4217)
+
+ labels = data.get('labels', {})
+ for language in languages:
+ name = labels.get(language, {}).get('value', None)
+ if name is not None:
+ add_currency_name(name, iso4217)
+ add_currency_label(name, iso4217, language)
+
+ aliases = data.get('aliases', {})
+ for language in aliases:
+ for i in range(0, len(aliases[language])):
+ alias = aliases[language][i].get('value', None)
+ add_currency_name(alias, iso4217)
+
+
+def fetch_data(wikidata_ids):
+ url = url_template.format(query=urlencode({'ids': '|'.join(wikidata_ids)}))
+ htmlresponse = get(url)
+ jsonresponse = json.loads(htmlresponse.content)
+ entities = jsonresponse.get('entities', {})
+
+ for pname in entities:
+ pvalue = entities.get(pname)
+ parse_currency(pvalue)
+
+
+def add_q(i):
+ return "Q" + str(i)
+
+
+def fetch_data_batch(wikidata_ids):
+    # query the wikidata API in chunks of at most 50 ids per request
+    while len(wikidata_ids) > 0:
+        if len(wikidata_ids) > 50:
+            fetch_data(wikidata_ids[0:50])
+            wikidata_ids = wikidata_ids[50:]
+        else:
+            fetch_data(wikidata_ids)
+            wikidata_ids = []
+
+
+def wdq_query(query):
+ url = url_wmflabs_template + query
+ htmlresponse = get(url)
+ jsonresponse = json.loads(htmlresponse.content)
+ qlist = map(add_q, jsonresponse.get('items', {}))
+ error = jsonresponse.get('status', {}).get('error', None)
+ if error is not None and error != 'OK':
+ print("error for query '" + query + "' :" + error)
+
+ fetch_data_batch(qlist)
+
+
+def wd_query(query, offset=0):
+ qlist = []
+
+ url = url_wikidata_search_template.format(query=urlencode({'srsearch': query, 'srlimit': 50, 'sroffset': offset}))
+ htmlresponse = get(url)
+ jsonresponse = json.loads(htmlresponse.content)
+    for r in jsonresponse.get('query', {}).get('search', []):
+ qlist.append(r.get('title', ''))
+ fetch_data_batch(qlist)
+
+
+# fetch #
+for q in wmflabs_queries:
+ wdq_query(q)
+
+# static
+add_currency_name(u"euro", 'EUR')
+add_currency_name(u"euros", 'EUR')
+add_currency_name(u"dollar", 'USD')
+add_currency_name(u"dollars", 'USD')
+add_currency_name(u"peso", 'MXN')
+add_currency_name(u"pesos", 'MXN')
+
+# write
+f = open("currencies.json", "wb")
+json.dump(db, f, indent=4, encoding="utf-8")
+f.close()
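
For reference, the script writes a two-way mapping: 'names' maps a normalized currency name to a list of ISO 4217 codes, and 'iso4217' maps a code to its label per language. An illustrative excerpt of currencies.json (actual labels depend on the fetched wikidata entries):

    {
        "iso4217": { "EUR": { "en": "Euro" } },
        "names": { "euro": ["EUR"], "dollar": ["USD"] }
    }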
diff --git a/utils/fetch_firefox_version.py b/utils/fetch_firefox_version.py
new file mode 100755
index 0000000..ed17958
--- /dev/null
+++ b/utils/fetch_firefox_version.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# set path
+from sys import path
+from os.path import realpath, dirname, join
+path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+#
+import json
+import requests
+import re
+from distutils.version import LooseVersion, StrictVersion
+from lxml import html
+from searx.url_utils import urlparse, urljoin
+from searx import searx_dir
+
+URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
+RELEASE_PATH = '/pub/firefox/releases/'
+
+NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
+# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
+# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
+
+#
+useragents = {
+ "versions": (),
+ "os": ('Windows NT 10; WOW64',
+ 'X11; Linux x86_64'),
+ "ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"
+}
+
+
+def fetch_firefox_versions():
+ resp = requests.get(URL, timeout=2.0)
+ if resp.status_code != 200:
+        raise Exception("Error fetching firefox versions, HTTP code " + str(resp.status_code))
+ else:
+ dom = html.fromstring(resp.text)
+ versions = []
+
+ for link in dom.xpath('//a/@href'):
+ url = urlparse(urljoin(URL, link))
+ path = url.path
+ if path.startswith(RELEASE_PATH):
+ version = path[len(RELEASE_PATH):-1]
+ if NORMAL_REGEX.match(version):
+ versions.append(LooseVersion(version))
+
+    versions.sort(reverse=True)
+ return versions
+
+
+def fetch_firefox_last_versions():
+ versions = fetch_firefox_versions()
+
+ result = []
+ major_last = versions[0].version[0]
+ major_list = (major_last, major_last - 1)
+ for version in versions:
+ major_current = version.version[0]
+ if major_current in major_list:
+ result.append(version.vstring)
+
+ return result
+
+
+def get_useragents_filename():
+ return join(join(searx_dir, "data"), "useragents.json")
+
+
+useragents["versions"] = fetch_firefox_last_versions()
+with open(get_useragents_filename(), "w") as f:
+ json.dump(useragents, f, indent=4, ensure_ascii=False)
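
A usage sketch: running the script from the repository root regenerates searx/data/useragents.json with the version strings of the two most recent major release series; filling the 'ua' template with one of the 'os' strings and a fetched version yields a complete Firefox user agent string.

    python utils/fetch_firefox_version.py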
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py
new file mode 100644
index 0000000..a6af073
--- /dev/null
+++ b/utils/fetch_languages.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+
+# This script generates languages.py from intersecting each engine's supported languages.
+#
+# Output files (engines_languages.json and languages.py)
+# are written in current directory to avoid overwriting in case something goes wrong.
+
+from json import dump
+import io
+from sys import path
+from babel import Locale, UnknownLocaleError
+from babel.languages import get_global
+
+path.append('../searx') # noqa
+from searx import settings
+from searx.engines import initialize_engines, engines
+
+# Output files.
+engines_languages_file = 'engines_languages.json'
+languages_file = 'languages.py'
+
+
+# Fetches supported languages for each engine and writes a json file with them.
+def fetch_supported_languages():
+ engines_languages = {}
+ for engine_name in engines:
+ if hasattr(engines[engine_name], 'fetch_supported_languages'):
+ try:
+ engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
+                if isinstance(engines_languages[engine_name], list):
+ engines_languages[engine_name] = sorted(engines_languages[engine_name])
+ except Exception as e:
+ print(e)
+
+ # write json file
+ with io.open(engines_languages_file, "w", encoding="utf-8") as f:
+ dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': '))
+
+ return engines_languages
+
+
+# Get babel Locale object from lang_code if possible.
+def get_locale(lang_code):
+ try:
+ locale = Locale.parse(lang_code, sep='-')
+ return locale
+ except (UnknownLocaleError, ValueError):
+ return None
+
+
+# Append engine_name to list of engines that support locale.
+def add_engine_counter(lang_code, engine_name, languages):
+ if lang_code in languages:
+ if 'counter' not in languages[lang_code]:
+ languages[lang_code]['counter'] = [engine_name]
+ elif engine_name not in languages[lang_code]['counter']:
+ languages[lang_code]['counter'].append(engine_name)
+
+
+# Join all language lists.
+# TODO: Add language names from engine's language list if name not known by babel.
+def join_language_lists(engines_languages):
+ language_list = {}
+ for engine_name in engines_languages:
+ for lang_code in engines_languages[engine_name]:
+
+ # apply custom fixes if necessary
+ if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
+ lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
+ if lang_code == alias)
+
+ locale = get_locale(lang_code)
+
+ # ensure that lang_code uses standard language and country codes
+ if locale and locale.territory:
+ lang_code = locale.language + '-' + locale.territory
+
+ # add locale if it's not in list
+ if lang_code not in language_list:
+ if locale:
+ language_list[lang_code] = {'name': locale.get_language_name().title(),
+ 'english_name': locale.english_name,
+ 'country': locale.get_territory_name() or ''}
+
+ # also add language without country
+ if locale.language not in language_list:
+ language_list[locale.language] = {'name': locale.get_language_name().title(),
+ 'english_name': locale.english_name}
+ else:
+ language_list[lang_code] = {}
+
+ # count engine for both language_country combination and language alone
+ add_engine_counter(lang_code, engine_name, language_list)
+ add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
+
+ return language_list
+
+
+# Filter language list so it only includes the most supported languages and countries.
+def filter_language_list(all_languages):
+ min_supported_engines = 10
+ main_engines = [engine_name for engine_name in engines.keys()
+ if 'general' in engines[engine_name].categories and
+ engines[engine_name].supported_languages and
+ not engines[engine_name].disabled]
+
+ # filter list to include only languages supported by most engines or all default general engines
+ filtered_languages = {code: lang for code, lang
+ in all_languages.items()
+ if (len(lang.get('counter', [])) >= min_supported_engines or
+ all(main_engine in lang.get('counter', [])
+ for main_engine in main_engines))}
+
+ return filtered_languages
+
+
+# Add country codes to languages without one and filter out language codes.
+def assign_country_codes(filtered_languages, all_languages):
+ sorted_languages = sorted(all_languages,
+ key=lambda lang: len(all_languages[lang].get('counter', [])),
+ reverse=True)
+ previous_lang = None
+ previous_code = None
+ countries = 0
+ for current_code in sorted(filtered_languages):
+ current_lang = current_code.split('-')[0]
+
+ # count country codes per language
+ if current_lang == previous_lang:
+ countries += 1
+
+ else:
+ if previous_lang is not None:
+ # if language has no single country code
+ if countries == 0:
+ # try to get country code with most supported engines
+ for l in sorted_languages:
+ l_parts = l.split('-')
+ if len(l_parts) == 2 and l_parts[0] == previous_lang:
+ filtered_languages[l] = all_languages[l]
+ filtered_languages[l]['country'] = ''
+ countries = 1
+ break
+
+ if countries == 0:
+ # get most likely country code from babel
+ subtags = get_global('likely_subtags').get(previous_lang)
+ if subtags:
+ subtag_parts = subtags.split('_')
+ new_code = subtag_parts[0] + '-' + subtag_parts[-1]
+ filtered_languages[new_code] = all_languages[previous_lang]
+ countries = 1
+
+ if countries == 1:
+ # remove countryless version of language if there's only one country
+ del filtered_languages[previous_lang]
+ if previous_code in filtered_languages:
+ filtered_languages[previous_code]['country'] = ''
+
+ countries = 0
+ previous_lang = current_lang
+
+ previous_code = current_code
+
+
+# Write languages.py.
+def write_languages_file(languages):
+ new_file = open(languages_file, 'wb')
+ file_content = '# -*- coding: utf-8 -*-\n'\
+ + '# list of language codes\n'\
+                   + '# this file is generated automatically by utils/fetch_languages.py\n'\
+ + '\nlanguage_codes = ('
+ for code in sorted(languages):
+ file_content += '\n (u"' + code + '"'\
+ + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
+ + ', u"' + languages[code].get('country', '') + '"'\
+ + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
+ # remove last comma
+ file_content = file_content[:-1]
+ file_content += '\n)\n'
+ new_file.write(file_content.encode('utf8'))
+ new_file.close()
+
+
+if __name__ == "__main__":
+ initialize_engines(settings['engines'])
+ engines_languages = fetch_supported_languages()
+ all_languages = join_language_lists(engines_languages)
+ filtered_languages = filter_language_list(all_languages)
+ assign_country_codes(filtered_languages, all_languages)
+ write_languages_file(filtered_languages)
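
An illustrative excerpt of the generated languages.py (the actual entries depend on the engine data fetched above):

    # -*- coding: utf-8 -*-
    # list of language codes
    language_codes = (
        (u"en-US", u"English", u"United States", u"English"),
    )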
diff --git a/utils/google_search.py b/utils/google_search.py
new file mode 100644
index 0000000..cad32ee
--- /dev/null
+++ b/utils/google_search.py
@@ -0,0 +1,35 @@
+from sys import argv, exit
+
+if len(argv) < 2:
+ print('search query required')
+ exit(1)
+
+import requests
+from json import dumps
+from searx.engines import google
+from searx.search import default_request_params
+
+request_params = default_request_params()
+# Possible params
+# request_params['headers']['User-Agent'] = ''
+# request_params['category'] = ''
+request_params['pageno'] = 1
+request_params['language'] = 'en_us'
+request_params['time_range'] = ''
+
+params = google.request(argv[1], request_params)
+
+request_args = dict(
+ headers=request_params['headers'],
+ cookies=request_params['cookies'],
+)
+
+if request_params['method'] == 'GET':
+ req = requests.get
+else:
+ req = requests.post
+ request_args['data'] = request_params['data']
+
+resp = req(request_params['url'], **request_args)
+resp.search_params = request_params
+print(dumps(google.response(resp)))
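
A usage sketch (run from the repository root so the searx package is importable; quoting the query keeps the shell from splitting it):

    python utils/google_search.py 'free software'

The raw engine results are printed as JSON on stdout.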
diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py
new file mode 100755
index 0000000..2231636
--- /dev/null
+++ b/utils/standalone_searx.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2016- by Alexandre Flament, <alex@al-f.net>
+'''
+
+# set path
+from sys import path
+from os.path import realpath, dirname
+path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+# initialization
+from json import dumps
+from searx import settings
+import sys
+import codecs
+import searx.query
+import searx.search
+import searx.engines
+import searx.preferences
+import argparse
+
+searx.engines.initialize_engines(settings['engines'])
+
+# command line parsing
+parser = argparse.ArgumentParser(description='Standalone searx.')
+parser.add_argument('query', type=str,
+ help='Text query')
+parser.add_argument('--category', type=str, nargs='?',
+ choices=searx.engines.categories.keys(),
+ default='general',
+ help='Search category')
+parser.add_argument('--lang', type=str, nargs='?', default='all',
+ help='Search language')
+parser.add_argument('--pageno', type=int, nargs='?', default=1,
+ help='Page number starting from 1')
+parser.add_argument('--safesearch', type=str, nargs='?', choices=['0', '1', '2'], default='0',
+ help='Safe content filter from none to strict')
+parser.add_argument('--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'],
+ help='Filter by time range')
+args = parser.parse_args()
+
+# search results for the query
+form = {
+    "q": args.query,
+    "categories": args.category.decode('utf-8'),
+    "pageno": str(args.pageno),
+    "language": args.lang,
+    "time_range": args.timerange
+}
+preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, [])
+preferences.key_value_settings['safesearch'].parse(args.safesearch)
+
+search_query = searx.search.get_search_query_from_webapp(preferences, form)
+search = searx.search.Search(search_query)
+result_container = search.search()
+
+# output
+from datetime import datetime
+
+def no_parsed_url(results):
+ for result in results:
+ del result['parsed_url']
+ return results
+
+def json_serial(obj):
+ """JSON serializer for objects not serializable by default json code"""
+ if isinstance(obj, datetime):
+ serial = obj.isoformat()
+ return serial
+    raise TypeError("Type not serializable")
+
+result_container_json = {
+ "search": {
+ "q": search_query.query,
+ "pageno": search_query.pageno,
+ "lang": search_query.lang,
+ "safesearch": search_query.safesearch,
+ "timerange": search_query.time_range,
+ "engines": search_query.engines
+ },
+ "results": no_parsed_url(result_container.get_ordered_results()),
+ "infoboxes": result_container.infoboxes,
+ "suggestions": list(result_container.suggestions),
+ "answers": list(result_container.answers),
+ "paging": result_container.paging,
+ "results_number": result_container.results_number()
+}
+sys.stdout = codecs.getwriter("UTF-8")(sys.stdout)
+sys.stdout.write(dumps(result_container_json, sort_keys=True, indent=4, ensure_ascii=False, encoding="utf-8", default=json_serial))
+
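A usage sketch (run from the repository root; engines are initialized from the default settings.yml):

    python utils/standalone_searx.py 'rain' --lang en --timerange month

The result container is printed as pretty-printed JSON on stdout.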
diff --git a/utils/update-translations.sh b/utils/update-translations.sh
new file mode 100755
index 0000000..240387a
--- /dev/null
+++ b/utils/update-translations.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+# script to easily update translation language files
+
+# add new language:
+# pybabel init -i messages.pot -d searx/translations -l en
+
+SEARX_DIR='searx'
+
+pybabel extract -F babel.cfg -o messages.pot "$SEARX_DIR"
+# iterate over the translation directories directly instead of parsing ls output
+for f in "$SEARX_DIR"/translations/*/; do
+    pybabel update -N -i messages.pot -d "$SEARX_DIR"'/translations/' -l "$(basename "$f")"
+done
+
+echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/'
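
After reviewing the updated .po files, the catalogs still need to be compiled, as the script's closing message says:

    pybabel compile -d searx/translations/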