summaryrefslogtreecommitdiff
path: root/bridges/TheGuardianBridge.php
blob: e655f0ef54a45b5248ab2186ef7d77c4734e0d57 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<?php
/**
 * Bridge that expands The Guardian's RSS feeds into full-content items.
 *
 * The selected topic feed is fetched from feeds.theguardian.com, then every
 * feed entry is enriched by downloading the linked article page and using
 * its lead figure plus article body as the item content.
 */
class TheGuardianBridge extends FeedExpander {
	const MAINTAINER = 'IceWreck';
	const NAME = 'The Guardian Bridge';
	const URI = 'https://www.theguardian.com/';
	const CACHE_TIMEOUT = 600; // This is a news site, so don't cache for more than 10 mins
	const DESCRIPTION = 'RSS feed for The Guardian';

	/*
	 * Topic-wise links
	 *
	 * You can find the base feed for any topic by appending /rss to its URL.
	 *
	 * Examples:
	 *
	 *   https://feeds.theguardian.com/theguardian/uk-news/rss
	 *   https://feeds.theguardian.com/theguardian/us-news/rss
	 *
	 * Or simply:
	 *
	 *   https://www.theguardian.com/world/rss
	 *
	 * To support another topic, add it as a value in the list below.
	 *
	 * The values are kept exactly as originally published (with or without a
	 * leading slash) so that previously generated bridge URLs stay valid;
	 * collectData() normalizes them before building the feed URL.
	 */
	const PARAMETERS = array( array(
		'feed' => array(
			'name' => 'Feed',
			'type' => 'list',
			'values' => array(
				'World News' => 'world/rss',
				'US News' => '/us-news/rss',
				'UK News' => '/uk-news/rss',
				'Europe News' => '/world/europe-news/rss',
				'Asia News' => '/world/asia/rss',
				'Tech' => '/uk/technology/rss',
				'Business News' => '/uk/business/rss',
				'Opinion' => '/uk/commentisfree/rss',
				'Lifestyle' => '/uk/lifeandstyle/rss',
				'Culture' => '/uk/culture/rss',
				'Sports' => '/uk/sport/rss'
			)
		)
	));

	/**
	 * Builds the feed URL for the selected topic and collects up to 10 items.
	 *
	 * @return void
	 */
	public function collectData(){
		// Most option values start with '/', while the base URL already ends
		// with one. Strip leading slashes so the result never contains '//'.
		$feed = ltrim((string)$this->getInput('feed'), '/');
		$feedURL = 'https://feeds.theguardian.com/theguardian/' . $feed;
		$this->collectExpandableDatas($feedURL, 10);
	}

	/**
	 * Parses one feed entry and replaces its content with the full article.
	 *
	 * @param object $newsItem Feed entry handed over by the parent class; its
	 *                         `link` member is used as the article URL.
	 * @return array The parsed item. If the article page cannot be loaded, the
	 *               item is returned as produced by the parent parser.
	 */
	protected function parseItem($newsItem){
		$item = parent::parseItem($newsItem);

		// --- Recovering the article ---

		// $articlePage holds the DOM of the whole article page
		$articlePage = getSimpleHTMLDOM($newsItem->link);

		// A falsy result means there is no DOM to query; keep the item as the
		// feed delivered it instead of failing on a method call on non-object.
		if(!$articlePage) {
			return $item;
		}

		// The first <figure> contains the main article image. The cast yields
		// an empty string when the page has no figure at all.
		$article = (string)$articlePage->find('figure', 0);

		// .content__article-body holds the actual article text
		foreach($articlePage->find('.content__article-body') as $element) {
			$article .= $element;
		}

		// --- Cleaning up the markup ---

		// Replace each image-viewer link with the bare image it wraps
		// (or with nothing when the link contains no image).
		foreach($articlePage->find('a.article__img-container') as $imgContainer) {
			$mainImg = $imgContainer->find('img', 0);
			$article = str_replace((string)$imgContainer, (string)$mainImg, $article);
		}

		// Selectors of elements that should not end up in the feed content.
		// NOTE(review): 'youtube-media-atom' has no '.' or '#' prefix, so it is
		// matched as a tag name - confirm that this is intended.
		$uselessElements = array(
			'#show-caption',
			'.element-atom',
			'.submeta',
			'youtube-media-atom',
			'svg'
		);

		// Strip the markup of every matching element from the article
		foreach($uselessElements as $selector) {
			foreach($articlePage->find($selector) as $uselessElement) {
				$article = str_replace((string)$uselessElement, '', $article);
			}
		}

		$item['content'] = $article;

		return $item;
	}
}