summaryrefslogtreecommitdiff
path: root/bridges/CNETBridge.php
blob: 564b817a94203a1c98a4c3b320f515601ca984df (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
<?php
/**
 * Bridge producing a feed of recent CNET news articles, optionally
 * restricted to one of the topics listed in PARAMETERS.
 */
class CNETBridge extends BridgeAbstract {

	const MAINTAINER = 'ORelio';
	const NAME = 'CNET News';
	const URI = 'https://www.cnet.com/';
	const CACHE_TIMEOUT = 3600; // 1h
	const DESCRIPTION = 'Returns the newest articles.';
	// Topic values use the form "<section>-<slug>": collectData() turns the
	// first hyphen into a path separator to build the listing URL.
	const PARAMETERS = array(
		array(
			'topic' => array(
				'name' => 'Topic',
				'type' => 'list',
				'values' => array(
					'All articles' => '',
					'Apple' => 'apple',
					'Google' => 'google',
					'Microsoft' => 'tags-microsoft',
					'Computers' => 'topics-computers',
					'Mobile' => 'topics-mobile',
					'Sci-Tech' => 'topics-sci-tech',
					'Security' => 'topics-security',
					'Internet' => 'topics-internet',
					'Tech Industry' => 'topics-tech-industry'
				)
			)
		)
	);

	/**
	 * Trims and sanitizes the HTML of an article body.
	 *
	 * Everything before the first '<p>' or '<figure' (whichever comes first)
	 * is dropped, site-relative links are made absolute, and a number of
	 * wrapper/widget fragments are removed.
	 *
	 * @param string $article_html Raw HTML fragment of the article
	 * @return string Cleaned HTML fragment
	 */
	private function cleanArticle($article_html) {
		// strpos() returns false when a marker is missing. Comparing false with
		// an integer is done as booleans, so each offset is checked explicitly;
		// otherwise a missing marker would disable the trimming altogether.
		$offset_p = strpos($article_html, '<p>');
		$offset_figure = strpos($article_html, '<figure');
		if ($offset_p === false) {
			$offset = $offset_figure;
		} elseif ($offset_figure === false) {
			$offset = $offset_p;
		} else {
			$offset = min($offset_p, $offset_figure);
		}
		if ($offset !== false) {
			$article_html = substr($article_html, $offset);
		}

		// Make site-relative links absolute
		$article_html = str_replace('href="/', 'href="' . self::URI, $article_html);
		$article_html = str_replace(' height="0"', '', $article_html);
		// Unwrap <noscript> content, keeping what is inside the tags
		$article_html = str_replace('<noscript>', '', $article_html);
		$article_html = str_replace('</noscript>', '', $article_html);
		// NOTE(review): stripWithDelimiters() is defined elsewhere; presumably it
		// removes every section enclosed by the given delimiters - confirm.
		$article_html = stripWithDelimiters($article_html, '<a class="clickToEnlarge', '</a>');
		$article_html = stripWithDelimiters($article_html, '<span class="nowPlaying', '</span>');
		$article_html = stripWithDelimiters($article_html, '<span class="duration', '</span>');
		$article_html = stripWithDelimiters($article_html, '<script', '</script>');
		$article_html = stripWithDelimiters($article_html, '<svg', '</svg>');
		return $article_html;
	}

	/**
	 * Fetches the listing page for the requested topic and appends up to
	 * 10 articles to $this->items. Only links located under "news/" are
	 * kept; for each of them the article page is requested to complete
	 * the thumbnail and the content.
	 */
	public function collectData() {

		// Retrieve and check user input.
		// Only the first hyphen separates the section from the slug, so that
		// slugs which themselves contain hyphens (e.g. 'topics-sci-tech')
		// are not split into extra path segments and rejected below.
		$topic = implode('/', explode('-', (string)$this->getInput('topic'), 2));
		if (!empty($topic)
		&& (substr_count($topic, '/') > 1 || !ctype_alpha(str_replace(array('/', '-'), '', $topic))))
			returnClientError('Invalid topic: ' . $topic);

		// Retrieve webpage
		$pageUrl = self::URI . (empty($topic) ? 'news/' : $topic . '/');
		$html = getSimpleHTMLDOM($pageUrl)
		or returnServerError('Could not request CNET: ' . $pageUrl);

		// Process articles
		foreach($html->find('div.assetBody, div.riverPost') as $element) {

			if(count($this->items) >= 10) {
				break;
			}

			// An entry without headline or link cannot produce an item
			$title_node = $element->find('h2, h3', 0);
			$link_node = $element->find('a', 0);
			if (!is_object($title_node) || !is_object($link_node))
				continue;

			$article_title = trim($title_node->plaintext);

			// Accept both absolute and site-relative links
			$article_href = (string)$link_node->href;
			if (strpos($article_href, self::URI) === 0) {
				$article_uri = $article_href;
			} else {
				$article_uri = self::URI . ltrim($article_href, '/');
			}

			// Thumbnail: first image found in the parent node, if any
			$article_thumbnail = null;
			$parent_node = $element->parent();
			if (is_object($parent_node)) {
				$image_node = $parent_node->find('img[src]', 0);
				if (is_object($image_node))
					$article_thumbnail = $image_node->src;
			}
			if (is_null($article_thumbnail))
				$article_thumbnail = extractFromDelimiters($element->innertext, '<img src="', '"');

			$time_node = $element->find('time.assetTime, div.timeAgo', 0);
			$article_timestamp = is_object($time_node) ? strtotime($time_node->plaintext) : false;

			$author_node = $element->find('a[rel=author], a.name', 0);
			$article_author = is_object($author_node) ? trim($author_node->plaintext) : '';

			// The summary ("dek") is optional: emit nothing rather than an empty paragraph
			$dek_node = $element->find('p.dek', 0);
			$article_content = is_object($dek_node)
				? '<p><b>' . trim($dek_node->plaintext) . '</b></p>'
				: '';

			if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, self::URI . 'news/') !== false) {

				// A failed request only means the item is published without full content
				$article_html = getSimpleHTMLDOMCached($article_uri);

				if ($article_html) {

					// Fall back to the images of the article page
					if (empty($article_thumbnail))
						$article_thumbnail = $article_html->find('div.originalImage', 0);
					if (empty($article_thumbnail))
						$article_thumbnail = $article_html->find('span.imageContainer', 0);
					if (is_object($article_thumbnail)) {
						$image_node = $article_thumbnail->find('img', 0);
						$article_thumbnail = is_object($image_node) ? $image_node->src : null;
					}

					// Only clean and append the body when the delimiters were actually found
					$article_body = extractFromDelimiters($article_html, '<article', '<footer');
					if (is_string($article_body))
						$article_content .= trim($this->cleanArticle($article_body));
				}

				$item = array();
				$item['uri'] = $article_uri;
				$item['title'] = $article_title;
				$item['author'] = $article_author;
				$item['timestamp'] = $article_timestamp;
				// Do not expose a null/false entry when no thumbnail was found
				$item['enclosures'] = empty($article_thumbnail) ? array() : array($article_thumbnail);
				$item['content'] = $article_content;
				$this->items[] = $item;
			}
		}
	}
}