summaryrefslogtreecommitdiff
path: root/bridges/GQMagazineBridge.php
blob: 961b3a09a36001154c0e534888682a8c5563d3c3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
<?php

/**
 * An extension of the previous SexactuBridge to cover the whole GQMagazine.
 * This one takes a page (for example sexe/news or journaliste/maia-mazaurette), which is to be configured,
 * reads all the articles visible on that page, and makes a stream out of them.
 * @author nicolas-delsaux
 *
 */
class GQMagazineBridge extends BridgeAbstract
{
	const MAINTAINER = 'Riduidel';

	const NAME = 'GQMagazine';

	// Default site root only: the effective feed URI is built by getURI()
	// from the configured domain and page.
	const URI = 'https://www.gqmagazine.fr';

	const CACHE_TIMEOUT = 7200; // 2h
	const DESCRIPTION = 'GQMagazine section extractor bridge. This bridge allows you get only a specific section.';

	const DEFAULT_DOMAIN = 'www.gqmagazine.fr';

	const PARAMETERS = array( array(
		'domain' => array(
			'name' => 'Domain to use',
			'required' => true,
			'defaultValue' => self::DEFAULT_DOMAIN
		),
		'page' => array(
			'name' => 'Initial page to load',
			'required' => true,
			'exampleValue' => 'sexe/news'
		),
	));

	// Attribute to look for => attribute to write back when a relative URI is
	// made absolute in the article body (see replaceUriInHtmlElement()).
	const REPLACED_ATTRIBUTES = array(
		'href' => 'href',
		'src' => 'src',
		'data-original' => 'src'
	);

	/**
	 * Returns the base URL of the configured site.
	 *
	 * Falls back to DEFAULT_DOMAIN when the 'domain' input is empty, prepends
	 * 'https://' when no scheme is given and strips trailing slashes so that
	 * callers can always append '/' . $path without producing '//'.
	 *
	 * @return string Base URL, with scheme and without trailing slash
	 */
	private function getDomain() {
		$domain = trim((string)$this->getInput('domain'));
		if (empty($domain))
			$domain = self::DEFAULT_DOMAIN;
		if (strpos($domain, '://') === false)
			$domain = 'https://' . $domain;
		return rtrim($domain, '/');
	}

	/**
	 * Returns the URI of the listing page this bridge reads.
	 *
	 * @return string The configured domain followed by the configured page
	 */
	public function getURI()
	{
		// A leading slash in the page input must not yield 'domain//page'
		return $this->getDomain() . '/' . ltrim((string)$this->getInput('page'), '/');
	}

	/**
	 * Reads the configured listing page and turns every article link found in
	 * its <main> element into a feed item (title, uri, content and, when
	 * present in the listing, author and timestamp).
	 */
	public function collectData()
	{
		$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());

		// Since GQ doesn't want simple class scraping, let's do it the hard way and ... discover content!
		$main = $html->find('main', 0);
		if (!$main) {
			returnServerError('Could not find the main element in ' . $this->getURI());
			// Defensive: never go on with a missing container
			return;
		}

		foreach ($main->find('a') as $link) {
			$uri = $link->href;
			$title = $link->find('h2', 0);
			// Every link of <main> is visited; those without a target or
			// without a headline cannot be turned into an item.
			if (empty($uri) || !$title) {
				continue;
			}

			$item = array();
			$item['title'] = $title->plaintext;
			$item['uri'] = $this->toAbsoluteUri($uri);

			// Author and date are optional: only set them when the listing shows them
			$author = $link->find('span[itemprop=name]', 0);
			if ($author) {
				$item['author'] = $author->plaintext;
			}

			$article = $this->loadFullArticle($item['uri']);
			if($article) {
				$item['content'] = $this->replaceUriInHtmlElement($article);
			} else {
				$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
			}

			$date = $link->find('time', 0);
			if ($date && $date->datetime) {
				$timestamp = strtotime($date->datetime);
				if ($timestamp !== false) {
					$item['timestamp'] = $timestamp;
				}
			}

			$this->items[] = $item;
		}
	}

	/**
	 * Turns a link found on the listing page into an absolute URI.
	 *
	 * @param string $uri An absolute, protocol-relative, domain-relative or relative URI
	 * @return string The absolute URI
	 */
	private function toAbsoluteUri($uri) {
		// Already absolute: starts with a scheme. Testing only the first
		// letter would mistake relative paths such as 'horoscope/...' for
		// absolute ones.
		if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $uri)) {
			return $uri;
		}
		// Protocol-relative: reuse the scheme of the configured domain
		if (substr($uri, 0, 2) === '//') {
			return parse_url($this->getDomain(), PHP_URL_SCHEME) . ':' . $uri;
		}
		// Domain-relative
		if (substr($uri, 0, 1) === '/') {
			return $this->getDomain() . $uri;
		}
		return $this->getDomain() . '/' . $uri;
	}

	/**
	 * Loads the full article and returns the element holding its body.
	 *
	 * @param string $uri The article URI
	 * @return mixed The first div whose class contains 'ArticleBodySection',
	 * or null when the page could not be loaded or has no such div
	 */
	private function loadFullArticle($uri){
		$html = getSimpleHTMLDOMCached($uri);
		if (!$html) {
			return null;
		}
		// Once again, that generated CSS classes madness is an obstacle ... which can be worked around easily
		foreach($html->find('div') as $div) {
			// The class can't be looked up directly, since GQ seems to
			// generate random names like "ArticleBodySection-fkggUW".
			// The cast keeps the check safe for divs without a class attribute.
			if(strpos((string)$div->class, 'ArticleBodySection') !== false) {
				return $div;
			}
		}
		return null;
	}

	/**
	 * Replaces all domain-relative URIs with absolute ones, based on the
	 * configured domain.
	 *
	 * Protocol-relative URIs (starting with '//') are left untouched, since
	 * they already name their host.
	 *
	 * @param object $element A simplehtmldom element
	 * @return string The $element->innertext with all URIs replaced
	 */
	private function replaceUriInHtmlElement($element){
		$returned = $element->innertext;
		$domain = $this->getDomain();
		foreach (self::REPLACED_ATTRIBUTES as $initial => $final) {
			$replacement = $final . '="' . $domain . '/';
			// A callback is used so that '$' or '\' in the configured domain
			// are never interpreted as back-references.
			$replaced = preg_replace_callback(
				'#' . preg_quote($initial, '#') . '="/(?!/)#',
				function () use ($replacement) {
					return $replacement;
				},
				$returned
			);
			// preg_replace_callback() yields null on error: keep the previous text then
			if ($replaced !== null) {
				$returned = $replaced;
			}
		}
		return $returned;
	}
}