summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
authorJohannes 'josch' Schauer <josch@mister-muffin.de>2017-06-16 15:18:31 +0200
committerJohannes 'josch' Schauer <josch@mister-muffin.de>2017-06-16 15:18:31 +0200
commit7fe1a5ea5ff4aeecbbc2af673cbdc88fbbea18d5 (patch)
treefecfa8408befea37218807ea487e1f954afb356c /utils
New upstream version 0.12.0+dfsg1
Diffstat (limited to 'utils')
-rw-r--r--utils/fabfile.py117
-rw-r--r--utils/fetch_currencies.py161
-rw-r--r--utils/fetch_languages.py189
-rw-r--r--utils/google_search.py35
-rwxr-xr-xutils/standalone_searx.py101
-rwxr-xr-xutils/update-translations.sh15
6 files changed, 618 insertions, 0 deletions
diff --git a/utils/fabfile.py b/utils/fabfile.py
new file mode 100644
index 0000000..559e2ab
--- /dev/null
+++ b/utils/fabfile.py
@@ -0,0 +1,117 @@
# Fabric (1.x, Python 2) deployment tasks for searx.
from fabric.api import cd, run, sudo, put
from cStringIO import StringIO


# Remote layout: the searx checkout and its virtualenv live under /usr/local.
base_dir = '/usr/local'
hostname = 'searx.me'
searx_dir = base_dir + '/searx'
searx_ve_dir = searx_dir + '/searx-ve'
# NOTE(review): executed at import time, before any task runs -- fabric will
# resolve 'whoami' on whatever host/connection is active when this module
# loads; confirm that is the intended behavior.
current_user = run('whoami').stdout.strip()

# uwsgi app configuration, rendered with the deploying user and paths.
uwsgi_file = '''
[uwsgi]
# Who will run the code
uid = {user}
gid = {user}

# Number of workers
workers = 8

# The right granted on the created socket
chmod-socket = 666

# Plugin to use and interpretor config
single-interpreter = true
master = true
plugin = python

# Module to import
module = searx.webapp

# Virtualenv and python path
virtualenv = {searx_ve_dir}
pythonpath = {searx_dir}
chdir = {searx_dir}/searx
'''.format(user=current_user,
           searx_dir=searx_dir,
           searx_ve_dir=searx_ve_dir)

# nginx site forwarding everything to the uwsgi socket.
nginx_config = '''
server {{
    listen 80;
    server_name {hostname};
    server_name www.{hostname};
    root /usr/local/searx;

    location / {{
        include uwsgi_params;
        uwsgi_pass unix:/run/uwsgi/app/searx/socket;
    }}
}}
'''.format(hostname=hostname)
+
+
def stop():
    # Stop the uwsgi service that serves searx.
    sudo('/etc/init.d/uwsgi stop')
+
+
def start():
    # Start the uwsgi service that serves searx.
    sudo('/etc/init.d/uwsgi start')
+
+
def restart():
    # Restart the uwsgi service (reloads the searx application).
    sudo('/etc/init.d/uwsgi restart')
+
+
def init():
    """One-time provisioning: packages, nginx site, uwsgi app, searx checkout.

    No-op when the searx directory already exists on the host.
    """
    # .failed is False when 'test -d' succeeded, i.e. searx is already there
    if not run('test -d ' + searx_dir, warn_only=True).failed:
        return

    sudo('apt-get update')

    # NOTE(review): no '-y' flag, so apt-get may prompt interactively
    sudo('apt-get install git'
         ' build-essential'
         ' libxslt-dev'
         ' python-dev'
         ' python-virtualenv'
         ' python-pybabel'
         ' zlib1g-dev'
         ' uwsgi'
         ' uwsgi-plugin-python'
         ' nginx')

    sudo('mkdir -p ' + base_dir)

    # install the nginx site and make it live
    put(StringIO(nginx_config), '/etc/nginx/sites-enabled/searx', use_sudo=True)
    sudo('/etc/init.d/nginx restart')

    with cd(base_dir):
        sudo('git clone https://github.com/asciimoo/searx')

    # hand the checkout to the deploying user, then wire up uwsgi
    sudo('chown -R {user}:{user} {searx_dir}'.format(user=current_user, searx_dir=searx_dir))
    put(StringIO(uwsgi_file), searx_dir + '/uwsgi.ini')
    sudo('ln -s {0}/uwsgi.ini /etc/uwsgi/apps-enabled/searx.ini'.format(searx_dir))

    run('virtualenv {0}'.format(searx_ve_dir))

    with cd(searx_dir):
        run('source {0}/bin/activate && pip install -r requirements.txt'.format(searx_ve_dir))

    start()
+
+
def deploy():
    """Update the checkout on the host and restart uwsgi."""
    # provision first in case the host is fresh (init() is a no-op otherwise)
    init()

    with cd(searx_dir):
        # stash local changes (e.g. settings tweaks) around the pull;
        # warn_only so an empty stash does not abort the deploy
        run("git stash", warn_only=True)
        run("git pull origin master")
        run("git stash pop", warn_only=True)

    restart()
+
+
def clean():
    """Remove everything init() installed (checkout + uwsgi/nginx config)."""
    # warn_only: already-missing files are not an error
    sudo('rm -rf {searx_dir}'.format(searx_dir=searx_dir), warn_only=True)
    sudo('rm /etc/uwsgi/apps-enabled/searx.ini', warn_only=True)
    sudo('rm /etc/nginx/sites-enabled/searx', warn_only=True)
diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py
new file mode 100644
index 0000000..716b505
--- /dev/null
+++ b/utils/fetch_currencies.py
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
# Build currencies.json for searx's currency-convert answerer by harvesting
# currency entities (labels and aliases) from wikidata. Python 2 script.
import json
import re
import unicodedata
import string  # NOTE(review): unused -- presumably vestigial
from urllib import urlencode
from requests import get

# languages whose labels/aliases are collected
languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'jp'}

url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(languages)
url_wmflabs_template = 'http://wdq.wmflabs.org/api?q='
url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}'

wmflabs_queries = [
    'CLAIM[31:8142]',  # every instance-of currency ("devise")
]

# accumulated output:
#   'names':   normalized currency name -> list of ISO 4217 codes
#   'iso4217': ISO 4217 code -> {language: label}
db = {
    'iso4217': {
    },
    'names': {
    }
}
+
+
def remove_accents(data):
    """NFKD-normalize *data* and lower-case it.

    NOTE(review): despite the name, combining accent marks are kept (only
    decomposed, not stripped). This must stay in sync with the query-side
    normalization in searx's currency engine, so do not strip marks here
    unilaterally.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    return decomposed.lower()
+
+
def normalize_name(name):
    """Canonical form of a currency name: case-folded, NFKD-decomposed,
    hyphens turned into spaces, whitespace runs collapsed."""
    folded = remove_accents(name.lower()).replace('-', ' ')
    return re.sub(' +', ' ', folded)
+
+
+def add_currency_name(name, iso4217):
+ global db
+
+ db_names = db['names']
+
+ if not isinstance(iso4217, basestring):
+ print "problem", name, iso4217
+ return
+
+ name = normalize_name(name)
+
+ if name == '':
+ print "name empty", iso4217
+ return
+
+ iso4217_set = db_names.get(name, None)
+ if iso4217_set is not None and iso4217 not in iso4217_set:
+ db_names[name].append(iso4217)
+ else:
+ db_names[name] = [iso4217]
+
+
def add_currency_label(label, iso4217, language):
    """Store the localized *label* for *iso4217* under db['iso4217']."""
    global db

    # create the per-currency dict on first sight, then record the label
    db['iso4217'].setdefault(iso4217, {})[language] = label
+
+
def get_property_value(data, name):
    """Return the first mainsnak value of claim *name* on a wikidata entity,
    or None when the claim is absent or has no value."""
    claims = data.get('claims', {}).get(name, {})
    if len(claims) == 0:
        return None

    value = claims[0].get('mainsnak', {}).get('datavalue', {}).get('value', '')
    return None if value == '' else value
+
+
def parse_currency(data):
    """Feed one wikidata entity into db: unit symbol, labels and aliases.

    Entities without an ISO 4217 code (claim P498) are ignored.
    """
    iso4217 = get_property_value(data, 'P498')
    if iso4217 is None:
        return

    # currency symbol / unit (claim P558)
    unit = get_property_value(data, 'P558')
    if unit is not None:
        add_currency_name(unit, iso4217)

    labels = data.get('labels', {})
    for language in languages:
        label = labels.get(language, {}).get('value', None)
        if label is not None:
            add_currency_name(label, iso4217)
            add_currency_label(label, iso4217, language)

    # aliases come in every language wikidata has, not just our whitelist
    for language, entries in data.get('aliases', {}).items():
        for entry in entries:
            add_currency_name(entry.get('value', None), iso4217)
+
+
def fetch_data(wikidata_ids):
    """Fetch the given wikidata entities in one wbgetentities call and feed
    each one into the db via parse_currency.

    The wikidata API caps one request at 50 ids; callers must pre-chunk.
    """
    url = url_template.format(query=urlencode({'ids': '|'.join(wikidata_ids)}))
    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    entities = jsonresponse.get('entities', {})

    for pname in entities:
        pvalue = entities.get(pname)
        parse_currency(pvalue)
+
+
def add_q(i):
    """Turn a bare numeric wikidata item id into a 'Q...' id (42 -> 'Q42')."""
    return 'Q{0}'.format(i)
+
+
def fetch_data_batch(wikidata_ids):
    """Fetch entities in chunks of at most 50 ids (the wikidata API limit).

    BUG FIX: the original sliced wikidata_ids[0:49] (49 ids) but then
    advanced by 50 (wikidata_ids[50:]), silently skipping every 50th id.
    """
    for offset in range(0, len(wikidata_ids), 50):
        fetch_data(wikidata_ids[offset:offset + 50])
+
+
def wdq_query(query):
    """Run a WDQ (wdq.wmflabs.org) query and fetch all matching entities."""
    url = url_wmflabs_template + query
    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    # WDQ returns bare integer item ids; turn them into 'Q...' ids
    qlist = map(add_q, jsonresponse.get('items', {}))
    error = jsonresponse.get('status', {}).get('error', None)
    if error is not None and error != 'OK':
        # best effort: report the error but still process whatever came back
        print "error for query '" + query + "' :" + error

    fetch_data_batch(qlist)
+
+
def wd_query(query, offset=0):
    """Run a wikidata full-text search and fetch the matching entities.

    NOTE(review): only a single page (srlimit=50) is fetched; *offset* is
    passed through but nothing loops to collect further pages -- confirm
    that one page is enough for the callers.
    """
    qlist = []

    url = url_wikidata_search_template.format(query=urlencode({'srsearch': query, 'srlimit': 50, 'sroffset': offset}))
    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    for r in jsonresponse.get('query', {}).get('search', {}):
        qlist.append(r.get('title', ''))
    fetch_data_batch(qlist)
+
+
# fetch #
# pull every entity matching the wmflabs queries (all currencies)
for q in wmflabs_queries:
    wdq_query(q)

# static
# hand-maintained aliases that wikidata labels do not cover well
add_currency_name(u"euro", 'EUR')
add_currency_name(u"euros", 'EUR')
add_currency_name(u"dollar", 'USD')
add_currency_name(u"dollars", 'USD')
add_currency_name(u"peso", 'MXN')
add_currency_name(u"pesos", 'MXN')

# write
# Python 2: json.dump accepts an encoding kwarg; file opened in binary mode
f = open("currencies.json", "wb")
json.dump(db, f, indent=4, encoding="utf-8")
f.close()
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py
new file mode 100644
index 0000000..3241370
--- /dev/null
+++ b/utils/fetch_languages.py
@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-

# This script generates languages.py from intersecting each engine's supported languages.
#
# The country names are obtained from http://api.geonames.org which requires registering as a user.
#
# Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong.

from requests import get
from urllib import urlencode
from lxml.html import fromstring  # NOTE(review): unused here -- presumably vestigial
from json import loads, dumps
import io
from sys import path
path.append('../searx') # noqa
from searx import settings
from searx.engines import initialize_engines, engines

# Geonames API for country names.
geonames_user = '' # ADD USER NAME HERE
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'

# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'

# engine name -> that engine's supported-languages structure (dict or list)
engines_languages = {}
+
+
+# To filter out invalid codes and dialects.
# To filter out invalid codes and dialects.
def valid_code(lang_code):
    """Return True for plausible language[-COUNTRY] codes.

    Rejects placeholder 'xx' codes, a short blacklist of known-bad codes
    and country suffixes, and anything is_dialect() classifies as a dialect.
    """
    # sl-SL is technically not invalid, but still a mistake
    if lang_code.startswith('xx'):
        return False
    if lang_code in ('sl-SL', 'wt-WT', 'jw'):
        return False
    if lang_code[-2:] in ('UK', 'XA', 'XL'):
        return False
    return not is_dialect(lang_code)
+
+
+# Language codes with any additional tags other than language and country.
# Language codes with any additional tags other than language and country.
def is_dialect(lang_code):
    """True when the code carries tags beyond a plain language[-COUNTRY] pair."""
    parts = lang_code.split('-')
    # more than two tags, or an over-long language/variant tag
    if len(parts) > 2 or len(parts[0]) > 3:
        return True
    return len(parts) == 2 and len(parts[1]) > 2
+
+
+# Get country name in specified language.
+def get_country_name(locale):
+ if geonames_user is '':
+ return ''
+
+ locale = locale.split('-')
+ if len(locale) != 2:
+ return ''
+
+ url = country_names_url.format(parameters=urlencode({'lang': locale[0],
+ 'country': locale[1],
+ 'username': geonames_user}))
+ response = get(url)
+ json = loads(response.text)
+ content = json.get('geonames', None)
+ if content is None or len(content) != 1:
+ print "No country name found for " + locale[0] + "-" + locale[1]
+ return ''
+
+ return content[0].get('countryName', '')
+
+
+# Fetchs supported languages for each engine and writes json file with those.
# Fetchs supported languages for each engine and writes json file with those.
def fetch_supported_languages():
    """Query every engine that can report its supported languages and dump
    the collected mapping to engines_languages.json."""
    initialize_engines(settings['engines'])
    for engine_name in engines:
        if hasattr(engines[engine_name], 'fetch_supported_languages'):
            try:
                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
            except Exception as e:
                # best effort: a failing engine is skipped, not fatal
                print e

    # write json file (unicode()/encoding kwarg are Python 2 specifics)
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
+
+
+# Join all language lists.
+# Iterate all languages supported by each engine.
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
    """Merge every engine's language list into the global *languages* dict,
    keep only widely supported codes, and fill in missing names/countries."""
    global languages
    # include wikipedia first for more accurate language names
    languages = {code: lang for code, lang
                 in engines_languages['wikipedia'].iteritems()
                 if valid_code(code)}

    for engine_name in engines_languages:
        for locale in engines_languages[engine_name]:
            if valid_code(locale):
                # if language is not on list or if it has no name yet
                if locale not in languages or not languages[locale].get('name'):
                    # NOTE(review): this tests the type of the engine's whole
                    # language container (dict vs plain list of codes), not
                    # of the individual entry -- confirm that is intended
                    if isinstance(engines_languages[engine_name], dict):
                        languages[locale] = engines_languages[engine_name][locale]
                    else:
                        languages[locale] = {}

                # add to counter of engines that support given language
                lang = locale.split('-')[0]
                if lang in languages:
                    if 'counter' not in languages[lang]:
                        languages[lang]['counter'] = [engine_name]
                    elif engine_name not in languages[lang]['counter']:
                        languages[lang]['counter'].append(engine_name)

    # filter list to include only languages supported by most engines
    # (a locale survives if it, or its base language, clears the threshold)
    min_supported_engines = int(0.70 * len(engines_languages))
    languages = {code: lang for code, lang
                 in languages.iteritems()
                 if len(lang.get('counter', [])) >= min_supported_engines or
                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}

    # get locales that have no name or country yet
    # (Python 2: .keys() returns a list copy, so deleting while iterating is
    # safe here; this would need list(...) under Python 3)
    for locale in languages.keys():
        # try to get language names
        if not languages[locale].get('name'):
            name = languages.get(locale.split('-')[0], {}).get('name', None)
            if name:
                languages[locale]['name'] = name
            else:
                # filter out locales with no name
                del languages[locale]
                continue

        # try to get language name in english
        if not languages[locale].get('english_name'):
            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')

        # try to get country name
        if locale.find('-') > 0 and not languages[locale].get('country'):
            languages[locale]['country'] = get_country_name(locale) or ''
+
+
+# Remove countryless language if language is featured in only one country.
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages():
    """Collapse a language that appears with exactly one country variant:
    drop the bare language code and clear the variant's country field.

    BUG FIX: the original only ran the collapse check when the next language
    group started, so the final group in sorted order was never collapsed.
    """
    prev_lang = None
    prev_code = None
    countries = 0
    for code in sorted(languages):
        lang = code.split('-')[0]
        if lang == prev_lang:
            countries += 1
        else:
            if prev_lang is not None and countries == 1:
                del languages[prev_lang]
                languages[prev_code]['country'] = ''
            countries = 0
            prev_lang = lang
        prev_code = code
    # handle the last group, which has no following group to trigger the check;
    # guard on membership in case the bare code was already filtered out
    if prev_lang is not None and countries == 1 and prev_lang in languages:
        del languages[prev_lang]
        languages[prev_code]['country'] = ''
+
+
+# Write languages.py.
# Write languages.py.
def write_languages_file():
    """Render the global *languages* dict as the generated languages.py."""
    header = '# -*- coding: utf-8 -*-\n'\
        + '# list of language codes\n'\
        + '# this file is generated automatically by utils/update_search_languages.py\n'\
        + '\nlanguage_codes = ('
    file_content = header
    for code in sorted(languages):
        entry = languages[code]
        # one 4-tuple per locale: code, native name, country, english name
        file_content += '\n    (u"' + code + '"'\
            + ', u"' + entry['name'].split(' (')[0] + '"'\
            + ', u"' + entry.get('country', '') + '"'\
            + ', u"' + entry.get('english_name', '').split(' (')[0] + '"),'
    # drop the trailing comma left after the last tuple
    file_content = file_content[:-1]
    file_content += '\n)\n'
    out = open(languages_file, 'w')
    out.write(file_content.encode('utf8'))
    out.close()
+
+
+if __name__ == "__main__":
+ fetch_supported_languages()
+ join_language_lists()
+ filter_single_country_languages()
+ write_languages_file()
diff --git a/utils/google_search.py b/utils/google_search.py
new file mode 100644
index 0000000..cad32ee
--- /dev/null
+++ b/utils/google_search.py
@@ -0,0 +1,35 @@
# Debug helper: run one query through the searx google engine and dump the
# parsed results as json. Python 2 script; usage: google_search.py <query>
from sys import argv, exit

if not len(argv) > 1:
    print('search query required')
    exit(1)

# imports deliberately placed after the argv check so a bad invocation
# fails fast without importing searx
import requests
from json import dumps
from searx.engines import google
from searx.search import default_request_params

request_params = default_request_params()
# Possible params
# request_params['headers']['User-Agent'] = ''
# request_params['category'] = ''
request_params['pageno'] = 1
request_params['language'] = 'en_us'
request_params['time_range'] = ''

# google.request fills request_params in place (url, method, data, ...)
params = google.request(argv[1], request_params)

request_args = dict(
    headers=request_params['headers'],
    cookies=request_params['cookies'],
)

# honour the HTTP method the engine asked for
if request_params['method'] == 'GET':
    req = requests.get
else:
    req = requests.post
    request_args['data'] = request_params['data']

resp = req(request_params['url'], **request_args)
# the engine's response parser expects the search params on the response
resp.search_params = request_params
print(dumps(google.response(resp)))
diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py
new file mode 100755
index 0000000..b19df4b
--- /dev/null
+++ b/utils/standalone_searx.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2016- by Alexandre Flament, <alex@al-f.net>
+'''
+
# set path
# make the repository root importable when running from utils/
from sys import path
from os.path import realpath, dirname
path.append(realpath(dirname(realpath(__file__)) + '/../'))

# initialization
from json import dumps
from searx import settings
import searx.query
import searx.search
import searx.engines
import searx.preferences
import argparse

# engines must be initialized before categories/engines are referenced below
searx.engines.initialize_engines(settings['engines'])
+
+# command line parsing
+parser = argparse.ArgumentParser(description='Standalone searx.')
+parser.add_argument('query', type=str,
+ help='Text query')
+parser.add_argument('--category', type=str, nargs='?',
+ choices=searx.engines.categories.keys(),
+ default='general',
+ help='Search category')
+parser.add_argument('--lang', type=str, nargs='?',default='all',
+ help='Search language')
+parser.add_argument('--pageno', type=int, nargs='?', default=1,
+ help='Page number starting from 1')
+parser.add_argument('--safesearch', type=str, nargs='?', choices=['0', '1', '2'], default='0',
+ help='Safe content filter from none to strict')
+parser.add_argument('--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'],
+ help='Filter by time range')
+args = parser.parse_args()
+
+# search results for the query
+form = {
+ "q":args.query,
+ "categories":args.category.decode('utf-8'),
+ "pageno":str(args.pageno),
+ "language":args.lang,
+ "time_range":args.timerange
+}
+preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, [])
+preferences.key_value_settings['safesearch'].parse(args.safesearch)
+
+search_query = searx.search.get_search_query_from_webapp(preferences, form)
+search = searx.search.Search(search_query)
+result_container = search.search()
+
+# output
+from datetime import datetime
+
def no_parsed_url(results):
    """Drop the non-JSON-serializable 'parsed_url' entry from every result.

    Mutates the result dicts in place and returns the same list.
    """
    for entry in results:
        entry.pop('parsed_url')
    return results
+
def json_serial(obj):
    """json.dumps fallback: render datetime objects as ISO-8601 strings."""
    if not isinstance(obj, datetime):
        raise TypeError("Type not serializable")
    return obj.isoformat()
+
# assemble a JSON-friendly snapshot of the query and its results
result_container_json = {
    "search": {
        "q": search_query.query,
        "pageno": search_query.pageno,
        "lang": search_query.lang,
        "safesearch": search_query.safesearch,
        "timerange": search_query.time_range,
        "engines": search_query.engines
    },
    "results": no_parsed_url(result_container.get_ordered_results()),
    "infoboxes": result_container.infoboxes,
    "suggestions": list(result_container.suggestions),
    "answers": list(result_container.answers),
    "paging": result_container.paging,
    "results_number": result_container.results_number()
}

# encoding= is a Python 2 json kwarg; json_serial handles datetime values
print(dumps(result_container_json, sort_keys=True, indent=4, ensure_ascii=False, encoding="utf-8", default=json_serial))
diff --git a/utils/update-translations.sh b/utils/update-translations.sh
new file mode 100755
index 0000000..00e7fb1
--- /dev/null
+++ b/utils/update-translations.sh
@@ -0,0 +1,15 @@
#!/bin/sh

# script to easily update translation language files

# add new language:
# pybabel init -i messages.pot -d searx/translations -l en

SEARX_DIR='searx'

# re-extract translatable strings into the template
pybabel extract -F babel.cfg -o messages.pot "$SEARX_DIR"

# BUG FIX: iterate the translation directories via a glob instead of parsing
# `ls` output, which is unsafe under word splitting (ShellCheck SC2045)
for d in "$SEARX_DIR"/translations/*/; do
    lang=$(basename "$d")
    pybabel update -N -i messages.pot -d "$SEARX_DIR/translations/" -l "$lang"
done

echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/'