diff options
author | Johannes Schauer <josch@debian.org> | 2017-08-04 22:06:01 +0200 |
---|---|---|
committer | Johannes Schauer <josch@debian.org> | 2017-08-04 22:06:01 +0200 |
commit | b005331cd910c0cc7dee2ddf82491b8248f431cf (patch) | |
tree | ff8b5cbfe81d570b878cb8d60ee51d07c3b1d059 /bridges/SexactuBridge.php |
Import rss-bridge_2017-08-03.orig.tar.gz
[dgit import orig rss-bridge_2017-08-03.orig.tar.gz]
Diffstat (limited to 'bridges/SexactuBridge.php')
-rw-r--r-- | bridges/SexactuBridge.php | 88 |
1 files changed, 88 insertions, 0 deletions
diff --git a/bridges/SexactuBridge.php b/bridges/SexactuBridge.php new file mode 100644 index 0000000..5bc552a --- /dev/null +++ b/bridges/SexactuBridge.php @@ -0,0 +1,88 @@ +<?php +class SexactuBridge extends BridgeAbstract { + + const MAINTAINER = 'Riduidel'; + const NAME = 'Sexactu'; + const AUTHOR = 'Maïa Mazaurette'; + const URI = 'http://www.gqmagazine.fr'; + const CACHE_TIMEOUT = 7200; // 2h + const DESCRIPTION = 'Sexactu via rss-bridge'; + + const REPLACED_ATTRIBUTES = array( + 'href' => 'href', + 'src' => 'src', + 'data-original' => 'src' + ); + + public function getURI(){ + return self::URI . '/sexactu'; + } + + public function collectData(){ + $html = getSimpleHTMLDOM($this->getURI()) + or returnServerError('Could not request ' . $this->getURI()); + + $sexactu = $html->find('.container_sexactu', 0); + $rowList = $sexactu->find('.row'); + foreach($rowList as $row) { + // only use first list as second one only contains pages numbers + + $title = $row->find('.title', 0); + if($title) { + $item = array(); + $item['author'] = self::AUTHOR; + $item['title'] = $title->plaintext; + $urlAttribute = "data-href"; + $uri = $title->$urlAttribute; + if($uri === false) + continue; + if(substr($uri, 0, 1) === 'h') { // absolute uri + $item['uri'] = $uri; + } else if(substr($uri, 0, 1) === '/') { // domain relative url + $item['uri'] = self::URI . $uri; + } else { + $item['uri'] = $this->getURI() . $uri; + } + $article = $this->loadFullArticle($item['uri']); + $item['content'] = $this->replaceUriInHtmlElement($article->find('.article_content', 0)); + + $publicationDate = $article->find('time[itemprop=datePublished]', 0); + $short_date = $publicationDate->datetime; + $item['timestamp'] = strtotime($short_date); + } else { + // Sometimes we get rubbish, ignore. + continue; + } + $this->items[] = $item; + } + } + + /** + * Loads the full article and returns the contents + * @param $uri The article URI + * @return The article content + */ + private function loadFullArticle($uri){ + $html = getSimpleHTMLDOMCached($uri); + + $content = $html->find('#article', 0); + if($content) { + return $content; + } + + return null; + } + + /** + * Replaces all relative URIs with absolute ones + * @param $element A simplehtmldom element + * @return The $element->innertext with all URIs replaced + */ + private function replaceUriInHtmlElement($element){ + $returned = $element->innertext; + foreach (self::REPLACED_ATTRIBUTES as $initial => $final) { + $returned = str_replace($initial . '="/', $final . '="' . self::URI . '/', $returned); + } + return $returned; + } +} |