From 7fe1a5ea5ff4aeecbbc2af673cbdc88fbbea18d5 Mon Sep 17 00:00:00 2001
From: Johannes 'josch' Schauer
Date: Fri, 16 Jun 2017 15:18:31 +0200
Subject: New upstream version 0.12.0+dfsg1

---
 utils/fabfile.py             | 117 +++++++++++++++++++++++++++
 utils/fetch_currencies.py    | 161 ++++++++++++++++++++++++++++++++++++
 utils/fetch_languages.py     | 189 +++++++++++++++++++++++++++++++++++++++++++
 utils/google_search.py       |  35 ++++++++
 utils/standalone_searx.py    | 101 +++++++++++++++++++++++
 utils/update-translations.sh |  15 ++++
 6 files changed, 618 insertions(+)
 create mode 100644 utils/fabfile.py
 create mode 100644 utils/fetch_currencies.py
 create mode 100644 utils/fetch_languages.py
 create mode 100644 utils/google_search.py
 create mode 100755 utils/standalone_searx.py
 create mode 100755 utils/update-translations.sh

diff --git a/utils/fabfile.py b/utils/fabfile.py
new file mode 100644
index 0000000..559e2ab
--- /dev/null
+++ b/utils/fabfile.py
@@ -0,0 +1,117 @@
+from fabric.api import cd, run, sudo, put
+from cStringIO import StringIO
+
+
+base_dir = '/usr/local'
+hostname = 'searx.me'
+searx_dir = base_dir + '/searx'
+searx_ve_dir = searx_dir + '/searx-ve'
+current_user = run('whoami').stdout.strip()
+
+uwsgi_file = '''
+[uwsgi]
+# Who will run the code
+uid = {user}
+gid = {user}
+
+# Number of workers
+workers = 8
+
+# The rights granted on the created socket
+chmod-socket = 666
+
+# Plugin to use and interpreter config
+single-interpreter = true
+master = true
+plugin = python
+
+# Module to import
+module = searx.webapp
+
+# Virtualenv and python path
+virtualenv = {searx_ve_dir}
+pythonpath = {searx_dir}
+chdir = {searx_dir}/searx
+'''.format(user=current_user,
+           searx_dir=searx_dir,
+           searx_ve_dir=searx_ve_dir)
+
+nginx_config = '''
+server {{
+    listen 80;
+    server_name {hostname};
+    server_name www.{hostname};
+    root /usr/local/searx;
+
+    location / {{
+        include uwsgi_params;
+        uwsgi_pass unix:/run/uwsgi/app/searx/socket;
+    }}
+}}
+'''.format(hostname=hostname)
+
+
+def stop():
+    sudo('/etc/init.d/uwsgi stop')
+
+
+def start():
+    sudo('/etc/init.d/uwsgi start')
+
+
+def restart():
+    sudo('/etc/init.d/uwsgi restart')
+
+
+def init():
+    if not run('test -d ' + searx_dir, warn_only=True).failed:
+        return
+
+    sudo('apt-get update')
+
+    sudo('apt-get install git'
+         ' build-essential'
+         ' libxslt-dev'
+         ' python-dev'
+         ' python-virtualenv'
+         ' python-pybabel'
+         ' zlib1g-dev'
+         ' uwsgi'
+         ' uwsgi-plugin-python'
+         ' nginx')
+
+    sudo('mkdir -p ' + base_dir)
+
+    put(StringIO(nginx_config), '/etc/nginx/sites-enabled/searx', use_sudo=True)
+    sudo('/etc/init.d/nginx restart')
+
+    with cd(base_dir):
+        sudo('git clone https://github.com/asciimoo/searx')
+
+    sudo('chown -R {user}:{user} {searx_dir}'.format(user=current_user, searx_dir=searx_dir))
+    put(StringIO(uwsgi_file), searx_dir + '/uwsgi.ini')
+    sudo('ln -s {0}/uwsgi.ini /etc/uwsgi/apps-enabled/searx.ini'.format(searx_dir))
+
+    run('virtualenv {0}'.format(searx_ve_dir))
+
+    with cd(searx_dir):
+        run('source {0}/bin/activate && pip install -r requirements.txt'.format(searx_ve_dir))
+
+    start()
+
+
+def deploy():
+    init()
+
+    with cd(searx_dir):
+        run("git stash", warn_only=True)
+        run("git pull origin master")
+        run("git stash pop", warn_only=True)
+
+    restart()
+
+
+def clean():
+    sudo('rm -rf {searx_dir}'.format(searx_dir=searx_dir), warn_only=True)
+    sudo('rm /etc/uwsgi/apps-enabled/searx.ini', warn_only=True)
+    sudo('rm /etc/nginx/sites-enabled/searx', warn_only=True)
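
A usage sketch, not part of the patch: the tasks above target the Fabric 1.x
API (fabric.api), so they are normally run from the command line, for example
"fab -H searx.me init" or "fab -H searx.me deploy". Driving them from Python
instead looks roughly like the following; the host name is hypothetical, and
env.host_string has to be set before the import because
current_user = run('whoami') executes at import time:

    # sketch only, assuming Fabric 1.x is installed and the host accepts SSH
    from fabric.api import env, execute

    env.host_string = 'searx.example.org'  # hypothetical host; needed at import time
    env.hosts = ['searx.example.org']      # host list used by execute()
    import fabfile                         # runs 'whoami' on the host at import
    execute(fabfile.deploy)                # init() on a fresh host, then git pull + restart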
diff --git a/utils/fetch_currencies.py b/utils/fetch_currencies.py
new file mode 100644
index 0000000..716b505
--- /dev/null
+++ b/utils/fetch_currencies.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+import json
+import re
+import unicodedata
+import string
+from urllib import urlencode
+from requests import get
+
+languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'jp'}
+
+url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(languages)
+url_wmflabs_template = 'http://wdq.wmflabs.org/api?q='
+url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}'
+
+wmflabs_queries = [
+    'CLAIM[31:8142]',  # all currencies
+]
+
+db = {
+    'iso4217': {
+    },
+    'names': {
+    }
+}
+
+
+def remove_accents(data):
+    return unicodedata.normalize('NFKD', data).lower()
+
+
+def normalize_name(name):
+    return re.sub(' +', ' ', remove_accents(name.lower()).replace('-', ' '))
+
+
+def add_currency_name(name, iso4217):
+    global db
+
+    db_names = db['names']
+
+    if not isinstance(iso4217, basestring):
+        print "problem", name, iso4217
+        return
+
+    name = normalize_name(name)
+
+    if name == '':
+        print "name empty", iso4217
+        return
+
+    iso4217_set = db_names.get(name, None)
+    if iso4217_set is None:
+        db_names[name] = [iso4217]
+    elif iso4217 not in iso4217_set:
+        db_names[name].append(iso4217)
+
+
+def add_currency_label(label, iso4217, language):
+    global db
+
+    db['iso4217'][iso4217] = db['iso4217'].get(iso4217, {})
+    db['iso4217'][iso4217][language] = label
+
+
+def get_property_value(data, name):
+    prop = data.get('claims', {}).get(name, {})
+    if len(prop) == 0:
+        return None
+
+    value = prop[0].get('mainsnak', {}).get('datavalue', {}).get('value', '')
+    if value == '':
+        return None
+
+    return value
+
+
+def parse_currency(data):
+    iso4217 = get_property_value(data, 'P498')
+
+    if iso4217 is not None:
+        unit = get_property_value(data, 'P558')
+        if unit is not None:
+            add_currency_name(unit, iso4217)
+
+        labels = data.get('labels', {})
+        for language in languages:
+            name = labels.get(language, {}).get('value', None)
+            if name is not None:
+                add_currency_name(name, iso4217)
+                add_currency_label(name, iso4217, language)
+
+        aliases = data.get('aliases', {})
+        for language in aliases:
+            for i in range(0, len(aliases[language])):
+                alias = aliases[language][i].get('value', None)
+                add_currency_name(alias, iso4217)
+
+
+def fetch_data(wikidata_ids):
+    url = url_template.format(query=urlencode({'ids': '|'.join(wikidata_ids)}))
+    htmlresponse = get(url)
+    jsonresponse = json.loads(htmlresponse.content)
+    entities = jsonresponse.get('entities', {})
+
+    for pname in entities:
+        pvalue = entities.get(pname)
+        parse_currency(pvalue)
+
+
+def add_q(i):
+    return "Q" + str(i)
+
+
+def fetch_data_batch(wikidata_ids):
+    while len(wikidata_ids) > 0:
+        if len(wikidata_ids) > 50:
+            fetch_data(wikidata_ids[0:50])
+            wikidata_ids = wikidata_ids[50:]
+        else:
+            fetch_data(wikidata_ids)
+            wikidata_ids = []
+
+
+def wdq_query(query):
+    url = url_wmflabs_template + query
+    htmlresponse = get(url)
+    jsonresponse = json.loads(htmlresponse.content)
+    qlist = map(add_q, jsonresponse.get('items', {}))
+    error = jsonresponse.get('status', {}).get('error', None)
+    if error is not None and error != 'OK':
+        print "error for query '" + query + "' :" + error
+
+    fetch_data_batch(qlist)
+
+
+def wd_query(query, offset=0):
+    qlist = []
+
+    url = url_wikidata_search_template.format(query=urlencode({'srsearch': query,
+                                                               'srlimit': 50,
+                                                               'sroffset': offset}))
+    htmlresponse = get(url)
+    jsonresponse = json.loads(htmlresponse.content)
+    for r in jsonresponse.get('query', {}).get('search', {}):
+        qlist.append(r.get('title', ''))
+    fetch_data_batch(qlist)
+
+
+# fetch #
+for q in wmflabs_queries:
+    wdq_query(q)
+
+# static
+add_currency_name(u"euro", 'EUR')
+add_currency_name(u"euros", 'EUR')
+add_currency_name(u"dollar", 'USD')
+add_currency_name(u"dollars", 'USD')
+add_currency_name(u"peso", 'MXN')
+add_currency_name(u"pesos", 'MXN')
+
+# write
+f = open("currencies.json", "wb")
+json.dump(db, f, indent=4, encoding="utf-8")
+f.close()
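
For orientation, not part of the patch: the resulting currencies.json holds two
maps, 'names' from a normalized currency name to the list of matching ISO 4217
codes, and 'iso4217' from a code to its label in each configured language. A
sketch of the shape, with hypothetical label values:

    # hypothetical excerpt of currencies.json (shown as a Python literal)
    {
        "names": {
            "euro": ["EUR"],
            "dollar": ["USD"],
            "peso": ["MXN"]
        },
        "iso4217": {
            "EUR": {"en": "Euro", "de": "Euro", "fr": "Euro"},
            "USD": {"en": "United States dollar"}
        }
    }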
diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py
new file mode 100644
index 0000000..3241370
--- /dev/null
+++ b/utils/fetch_languages.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+
+# This script generates languages.py from intersecting each engine's supported languages.
+#
+# The country names are obtained from http://api.geonames.org, which requires registering as a user.
+#
+# Output files (engines_languages.json and languages.py)
+# are written to the current directory to avoid overwriting anything in case something goes wrong.
+
+from requests import get
+from urllib import urlencode
+from lxml.html import fromstring
+from json import loads, dumps
+import io
+from sys import path
+path.append('../searx')  # noqa
+from searx import settings
+from searx.engines import initialize_engines, engines
+
+# Geonames API for country names.
+geonames_user = ''  # ADD USER NAME HERE
+country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
+
+# Output files.
+engines_languages_file = 'engines_languages.json'
+languages_file = 'languages.py'
+
+engines_languages = {}
+
+
+# To filter out invalid codes and dialects.
+def valid_code(lang_code):
+    # filter invalid codes
+    # sl-SL is technically not invalid, but still a mistake
+    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
+    invalid_countries = ['UK', 'XA', 'XL']
+    if lang_code[:2] == 'xx'\
+       or lang_code in invalid_codes\
+       or lang_code[-2:] in invalid_countries\
+       or is_dialect(lang_code):
+        return False
+
+    return True
+
+
+# Detect language codes carrying any tag other than language and country.
+def is_dialect(lang_code):
+    lang_code = lang_code.split('-')
+    if len(lang_code) > 2 or len(lang_code[0]) > 3:
+        return True
+    if len(lang_code) == 2 and len(lang_code[1]) > 2:
+        return True
+
+    return False
+
+
+# Get country name in specified language.
+def get_country_name(locale):
+    if geonames_user == '':
+        return ''
+
+    locale = locale.split('-')
+    if len(locale) != 2:
+        return ''
+
+    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
+                                                         'country': locale[1],
+                                                         'username': geonames_user}))
+    response = get(url)
+    json = loads(response.text)
+    content = json.get('geonames', None)
+    if content is None or len(content) != 1:
+        print "No country name found for " + locale[0] + "-" + locale[1]
+        return ''
+
+    return content[0].get('countryName', '')
+
+
+# Fetches the supported languages for each engine and writes them to a json file.
+def fetch_supported_languages():
+    initialize_engines(settings['engines'])
+    for engine_name in engines:
+        if hasattr(engines[engine_name], 'fetch_supported_languages'):
+            try:
+                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
+            except Exception as e:
+                print e
+
+    # write json file
+    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
+        f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
+
+
+# Join all language lists.
+# Iterate all languages supported by each engine.
+def join_language_lists():
+    global languages
+    # include wikipedia first for more accurate language names
+    languages = {code: lang for code, lang
+                 in engines_languages['wikipedia'].iteritems()
+                 if valid_code(code)}
+
+    for engine_name in engines_languages:
+        for locale in engines_languages[engine_name]:
+            if valid_code(locale):
+                # if language is not on list or if it has no name yet
+                if locale not in languages or not languages[locale].get('name'):
+                    if isinstance(engines_languages[engine_name], dict):
+                        languages[locale] = engines_languages[engine_name][locale]
+                    else:
+                        languages[locale] = {}
+
+                # add to counter of engines that support given language
+                lang = locale.split('-')[0]
+                if lang in languages:
+                    if 'counter' not in languages[lang]:
+                        languages[lang]['counter'] = [engine_name]
+                    elif engine_name not in languages[lang]['counter']:
+                        languages[lang]['counter'].append(engine_name)
+
+    # filter list to include only languages supported by most engines
+    min_supported_engines = int(0.70 * len(engines_languages))
+    languages = {code: lang for code, lang
+                 in languages.iteritems()
+                 if len(lang.get('counter', [])) >= min_supported_engines or
+                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
+
+    # get locales that have no name or country yet
+    for locale in languages.keys():
+        # try to get language names
+        if not languages[locale].get('name'):
+            name = languages.get(locale.split('-')[0], {}).get('name', None)
+            if name:
+                languages[locale]['name'] = name
+            else:
+                # filter out locales with no name
+                del languages[locale]
+                continue
+
+        # try to get language name in english
+        if not languages[locale].get('english_name'):
+            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
+
+        # try to get country name
+        if locale.find('-') > 0 and not languages[locale].get('country'):
+            languages[locale]['country'] = get_country_name(locale) or ''
+
+
+# Remove the countryless language if the language is featured in only one country.
+def filter_single_country_languages():
+    prev_lang = None
+    prev_code = None
+    for code in sorted(languages):
+        lang = code.split('-')[0]
+        if lang == prev_lang:
+            countries += 1
+        else:
+            if prev_lang is not None and countries == 1:
+                del languages[prev_lang]
+                languages[prev_code]['country'] = ''
+            countries = 0
+            prev_lang = lang
+        prev_code = code
+
+
+# Write languages.py.
+def write_languages_file():
+    new_file = open(languages_file, 'w')
+    file_content = '# -*- coding: utf-8 -*-\n'\
+                   + '# list of language codes\n'\
+                   + '# this file is generated automatically by utils/fetch_languages.py\n'\
+                   + '\nlanguage_codes = ('
+    for code in sorted(languages):
+        file_content += '\n    (u"' + code + '"'\
+                        + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
+                        + ', u"' + languages[code].get('country', '') + '"'\
+                        + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
+    # remove last comma
+    file_content = file_content[:-1]
+    file_content += '\n)\n'
+    new_file.write(file_content.encode('utf8'))
+    new_file.close()
+
+
+if __name__ == "__main__":
+    fetch_supported_languages()
+    join_language_lists()
+    filter_single_country_languages()
+    write_languages_file()
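
For reference, not part of the patch: write_languages_file() above serializes
each surviving locale as a 4-tuple of code, native name, country and English
name. A hypothetical excerpt of the generated languages.py (the header lines
are the ones the script emits; the entries are invented):

    # -*- coding: utf-8 -*-
    # list of language codes
    # this file is generated automatically by utils/fetch_languages.py

    language_codes = (
        (u"de", u"Deutsch", u"", u"German"),
        (u"en-US", u"English", u"United States", u"English"),
        (u"fr-FR", u"Français", u"France", u"French")
    )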
diff --git a/utils/google_search.py b/utils/google_search.py
new file mode 100644
index 0000000..cad32ee
--- /dev/null
+++ b/utils/google_search.py
@@ -0,0 +1,35 @@
+from sys import argv, exit
+
+if not len(argv) > 1:
+    print('search query required')
+    exit(1)
+
+import requests
+from json import dumps
+from searx.engines import google
+from searx.search import default_request_params
+
+request_params = default_request_params()
+# Possible params
+# request_params['headers']['User-Agent'] = ''
+# request_params['category'] = ''
+request_params['pageno'] = 1
+request_params['language'] = 'en_us'
+request_params['time_range'] = ''
+
+params = google.request(argv[1], request_params)
+
+request_args = dict(
+    headers=request_params['headers'],
+    cookies=request_params['cookies'],
+)
+
+if request_params['method'] == 'GET':
+    req = requests.get
+else:
+    req = requests.post
+    request_args['data'] = request_params['data']
+
+resp = req(request_params['url'], **request_args)
+resp.search_params = request_params
+print(dumps(google.response(resp)))
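
Worth noting, not part of the patch: nothing in this script is google-specific
except the imported module; searx engine modules share the request()/response()
interface, so the same driver can exercise another engine. A sketch, assuming
the bing module reads the same request_params keys (engine-specific keys such
as 'language' may be interpreted differently):

    # sketch: point the driver above at a different engine module
    from searx.engines import bing
    from searx.search import default_request_params

    request_params = default_request_params()
    request_params['pageno'] = 1
    request_params['language'] = 'en-US'
    request_params['time_range'] = ''
    params = bing.request('paris', request_params)  # fills url, method, headers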
diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py
new file mode 100755
index 0000000..b19df4b
--- /dev/null
+++ b/utils/standalone_searx.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2016- by Alexandre Flament,
+'''
+
+# set path
+from sys import path
+from os.path import realpath, dirname
+path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+# initialization
+from json import dumps
+from searx import settings
+import searx.query
+import searx.search
+import searx.engines
+import searx.preferences
+import argparse
+
+searx.engines.initialize_engines(settings['engines'])
+
+# command line parsing
+parser = argparse.ArgumentParser(description='Standalone searx.')
+parser.add_argument('query', type=str,
+                    help='Text query')
+parser.add_argument('--category', type=str, nargs='?',
+                    choices=searx.engines.categories.keys(),
+                    default='general',
+                    help='Search category')
+parser.add_argument('--lang', type=str, nargs='?', default='all',
+                    help='Search language')
+parser.add_argument('--pageno', type=int, nargs='?', default=1,
+                    help='Page number starting from 1')
+parser.add_argument('--safesearch', type=str, nargs='?', choices=['0', '1', '2'], default='0',
+                    help='Safe content filter from none to strict')
+parser.add_argument('--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'],
+                    help='Filter by time range')
+args = parser.parse_args()
+
+# search results for the query
+form = {
+    "q": args.query,
+    "categories": args.category.decode('utf-8'),
+    "pageno": str(args.pageno),
+    "language": args.lang,
+    "time_range": args.timerange
+}
+preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, [])
+preferences.key_value_settings['safesearch'].parse(args.safesearch)
+
+search_query = searx.search.get_search_query_from_webapp(preferences, form)
+search = searx.search.Search(search_query)
+result_container = search.search()
+
+# output
+from datetime import datetime
+
+def no_parsed_url(results):
+    for result in results:
+        del result['parsed_url']
+    return results
+
+def json_serial(obj):
+    """JSON serializer for objects not serializable by default json code"""
+    if isinstance(obj, datetime):
+        serial = obj.isoformat()
+        return serial
+    raise TypeError("Type not serializable")
+
+result_container_json = {
+    "search": {
+        "q": search_query.query,
+        "pageno": search_query.pageno,
+        "lang": search_query.lang,
+        "safesearch": search_query.safesearch,
+        "timerange": search_query.time_range,
+        "engines": search_query.engines
+    },
+    "results": no_parsed_url(result_container.get_ordered_results()),
+    "infoboxes": result_container.infoboxes,
+    "suggestions": list(result_container.suggestions),
+    "answers": list(result_container.answers),
+    "paging": result_container.paging,
+    "results_number": result_container.results_number()
+}
+
+print(dumps(result_container_json, sort_keys=True, indent=4, ensure_ascii=False, encoding="utf-8", default=json_serial))
diff --git a/utils/update-translations.sh b/utils/update-translations.sh
new file mode 100755
index 0000000..00e7fb1
--- /dev/null
+++ b/utils/update-translations.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+# script to easily update translation language files
+
+# add new language:
+# pybabel init -i messages.pot -d searx/translations -l en
+
+SEARX_DIR='searx'
+
+pybabel extract -F babel.cfg -o messages.pot $SEARX_DIR
+for f in `ls $SEARX_DIR'/translations/'`; do
+    pybabel update -N -i messages.pot -d $SEARX_DIR'/translations/' -l $f
+done
+
+echo '[!] update done, edit .po files if required and run pybabel compile -d searx/translations/'
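
A closing note, not part of the patch: standalone_searx.py above is meant to be
invoked directly (for example python utils/standalone_searx.py "rain" --lang en
--pageno 1) and prints a single JSON document on stdout. The top-level keys are
fixed by the script; the values below are invented for illustration:

    # hypothetical output shape (keys as emitted by the script, values invented)
    {
        "search": {"q": "rain", "pageno": 1, "lang": "en", "safesearch": 0,
                   "timerange": None, "engines": ["..."]},
        "results": [{"title": "...", "url": "...", "content": "..."}],
        "infoboxes": [],
        "suggestions": [],
        "answers": [],
        "paging": True,
        "results_number": 0
    }

For update-translations.sh, note that the refreshed strings only reach the web
interface after the compile step its final message points to, i.e. running
pybabel compile -d searx/translations/ once the .po files have been edited.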