diff options
Diffstat (limited to 'searx/engines/deviantart.py')
-rw-r--r-- | searx/engines/deviantart.py | 104 |
1 files changed, 50 insertions, 54 deletions
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index a0e27e6..7840495 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -1,81 +1,77 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Deviantart (Images) - - @website https://www.deviantart.com/ - @provide-api yes (https://www.deviantart.com/developers/) (RSS) - - @using-api no (TODO, rewrite to api) - @results HTML - @stable no (HTML can change) - @parse url, title, thumbnail_src, img_src - - @todo rewrite to api """ +# pylint: disable=missing-function-docstring +from urllib.parse import urlencode from lxml import html -import re -from searx.engines.xpath import extract_text -from searx.url_utils import urlencode + +# about +about = { + "website": 'https://www.deviantart.com/', + "wikidata_id": 'Q46523', + "official_api_documentation": 'https://www.deviantart.com/developers/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} # engine dependent config categories = ['images'] paging = True time_range_support = True -# search-url -base_url = 'https://www.deviantart.com/' -search_url = base_url + 'search?page={page}&{query}' -time_range_url = '&order={range}' - -time_range_dict = {'day': 11, - 'week': 14, - 'month': 15} +time_range_dict = { + 'day': 'popular-24-hours', + 'week': 'popular-1-week', + 'month': 'popular-1-month', + 'year': 'most-recent', +} +# search-url +base_url = 'https://www.deviantart.com' -# do search-request def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: - return params - params['url'] = search_url.format(page=params['pageno'], - query=urlencode({'q': query})) + # https://www.deviantart.com/search/deviations?page=5&q=foo + + query = { + 'page' : params['pageno'], + 'q' : query, + } if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) + query['order'] = time_range_dict[params['time_range']] - return params + params['url'] = base_url + '/search/deviations?' + urlencode(query) + return params -# get response from search-request def response(resp): - results = [] - # return empty array if a redirection code is returned - if resp.status_code == 302: - return [] + results = [] dom = html.fromstring(resp.text) - # parse results for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): for result in row.xpath('./div'): - link = result.xpath('.//a[@data-hook="deviation_link"]')[0] - url = link.attrib.get('href') - title = link.attrib.get('title') - thumbnail_src = result.xpath('.//img')[0].attrib.get('src') - img_src = thumbnail_src - - # http to https, remove domain sharding - thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) - thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) - - url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) - - # return results + + a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0] + noscript_tag = a_tag.xpath('.//noscript') + + if noscript_tag: + img_tag = noscript_tag[0].xpath('.//img') + else: + img_tag = a_tag.xpath('.//img') + if not img_tag: + continue + img_tag = img_tag[0] + + results.append({ + 'template': 'images.html', + 'url': a_tag.attrib.get('href'), + 'img_src': img_tag.attrib.get('src'), + 'title': img_tag.attrib.get('alt'), + }) + return results |