diff options
author | Johannes 'josch' Schauer <josch@debian.org> | 2019-09-24 22:51:24 +0200 |
---|---|---|
committer | Johannes 'josch' Schauer <josch@debian.org> | 2019-09-24 22:51:24 +0200 |
commit | 8702184834089fd80a0caedd34297f834e716f52 (patch) | |
tree | 462095b3c147cb2a56ac5f1f42ce3d4e8b6839e2 /bridges/LWNprevBridge.php |
Import Upstream version 2019-01-13
Diffstat (limited to 'bridges/LWNprevBridge.php')
-rw-r--r-- | bridges/LWNprevBridge.php | 265 |
1 files changed, 265 insertions, 0 deletions
<?php
/**
 * Bridge exposing the LWN "Free Weekly Edition" big page as feed items.
 *
 * The page is downloaded once, split on the "<b>Page editor</b>" marker and
 * every fragment is parsed with DOMDocument. Depending on which category
 * heading classes ('Cat1HL', 'Cat3HL') occur in a fragment, its items are
 * extracted as feature articles, brief items or announcements.
 */
class LWNprevBridge extends BridgeAbstract{
    const MAINTAINER = 'Pierre Mazière';
    const NAME = 'LWN Free Weekly Edition';
    const URI = 'https://lwn.net/';
    const CACHE_TIMEOUT = 604800; // 1 week
    const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

    // Timestamp parsed from the first <h1> found so far; copied into every item.
    private $editionTimeStamp;

    /**
     * @return string URL of the page that collectData() downloads.
     */
    public function getURI(){
        return self::URI . 'free/bigpage';
    }

    /**
     * Moves $node forward past text nodes to the next sibling.
     *
     * Only text nodes are skipped. When the siblings run out, $node is left
     * on the last text node, so callers must not assume an element.
     *
     * @param DOMNode|null $node Updated in place.
     */
    private function jumpToNextTag(&$node){
        while($node && $node->nodeType === XML_TEXT_NODE) {
            $nextNode = $node->nextSibling;
            if(!$nextNode) {
                break;
            }
            $node = $nextNode;
        }
    }

    /**
     * Moves $node backward past text nodes to the previous sibling.
     *
     * Mirror image of jumpToNextTag(): $node may still be a text node (or
     * null) afterwards.
     *
     * @param DOMNode|null $node Updated in place.
     */
    private function jumpToPreviousTag(&$node){
        while($node && $node->nodeType === XML_TEXT_NODE) {
            $previousNode = $node->previousSibling;
            if(!$previousNode) {
                break;
            }
            $node = $previousNode;
        }
    }

    /**
     * Returns the closest previous sibling of $node that is not a text node.
     *
     * @param DOMNode|null $node
     * @return DOMNode|null Null when $node is null or has no previous sibling;
     *                      may be a text node when only text precedes $node.
     */
    private function getPreviousTag($node){
        if(!$node) {
            return null;
        }

        $previous = $node->previousSibling;
        $this->jumpToPreviousTag($previous);
        return $previous;
    }

    /**
     * Reads the class attribute of a node without assuming it is an element.
     *
     * Calling getAttribute() on null, a text node or a comment is a fatal
     * error, so every class lookup on a sibling goes through this helper.
     *
     * @param DOMNode|null $node
     * @return string The class attribute, or '' for non-elements and
     *                elements without one.
     */
    private function getNodeClass($node){
        if($node instanceof DOMElement) {
            return $node->getAttribute('class');
        }

        return '';
    }

    /**
     * Concatenates the canonical XML of the siblings following $start.
     *
     * Collection stops at the end of the sibling list, at the first node
     * whose class attribute equals one of $stopClasses, or (optionally) at
     * the first <h2>.
     *
     * @param DOMNode $start         Node after which collection begins.
     * @param array   $stopClasses   Class values that end the content.
     * @param bool    $stopAtHeading Whether an <h2> also ends the content.
     * @return string
     */
    private function collectContentUntil($start, array $stopClasses, $stopAtHeading = false){
        $content = '';

        for($node = $start->nextSibling; $node !== null; $node = $node->nextSibling) {
            if($stopAtHeading && $node->nodeName === 'h2') {
                break;
            }

            if(in_array($this->getNodeClass($node), $stopClasses, true)) {
                break;
            }

            $content .= $node->C14N();
        }

        return $content;
    }

    /**
     * Refreshes the edition timestamp from the first <h1> of a fragment.
     *
     * The date is taken as the text following the first 'for ' in the
     * heading. The stored timestamp is left untouched when there is no <h1>,
     * no 'for ' marker, or the remaining text is not parseable by strtotime().
     *
     * @param DOMDocument $html
     */
    private function updateEditionTimeStamp($html){
        $headings = $html->getElementsByTagName('h1');
        if($headings->length === 0) {
            return;
        }

        $marker = 'for ';
        $text = $headings->item(0)->textContent;
        $position = strpos($text, $marker);
        if($position === false) {
            return;
        }

        $timestamp = strtotime(substr($text, $position + strlen($marker)));
        if($timestamp !== false) {
            $this->editionTimeStamp = $timestamp;
        }
    }

    /**
     * Downloads the big page and appends all extracted items to $this->items.
     */
    public function collectData(){
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $page = getContents($this->getURI())
            or returnServerError('No results for LWNprev');

        // Collect libxml parse warnings internally instead of emitting them;
        // the caller's setting is restored once all fragments are parsed.
        $previousErrorMode = libxml_use_internal_errors(true);

        foreach(explode('<b>Page editor</b>', $page) as $fragment) {
            // Turn every fragment into a complete document: fragments without
            // an <html> tag are wrapped, the one carrying it is only closed.
            if(strpos($fragment, '<html>') === false) {
                $fragment = <<<EOD
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><title>LWN</title></head><body>{$fragment}</body></html>
EOD;
            } else {
                $fragment .= '</body></html>';
            }

            $html = new DOMDocument();
            $html->loadHTML($fragment);
            libxml_clear_errors();

            $this->updateEditionTimeStamp($html);

            // The category heading classes present decide the fragment type.
            if(strpos($fragment, 'Cat1HL') === false) {
                $items = $this->getFeatureContents($html);
            } elseif(strpos($fragment, 'Cat3HL') === false) {
                $items = $this->getBriefItems($html);
            } else {
                $items = $this->getAnnouncements($html);
            }

            $this->items = array_merge($this->items, $items);
        }

        libxml_use_internal_errors($previousErrorMode);
    }

    /**
     * Builds the uri, timestamp and content entries for an article heading.
     *
     * The URI is the site root plus the href of the heading's first tag when
     * that tag is a link. The content is every sibling after the heading up
     * to the next <h2> or the next 'Cat1HL'/'Cat2HL' node.
     *
     * @param DOMElement $title The article's <h2> heading.
     * @return array Keys 'uri', 'timestamp' and 'content'.
     */
    private function getArticleContent(&$title){
        $item = array();

        $link = $title->firstChild;
        $this->jumpToNextTag($link);
        $item['uri'] = self::URI;
        if($link instanceof DOMElement && $link->nodeName === 'a') {
            $item['uri'] .= $link->getAttribute('href');
        }

        $item['timestamp'] = $this->editionTimeStamp;
        $item['content'] = $this->collectContentUntil($title, array('Cat1HL', 'Cat2HL'), true);

        return $item;
    }

    /**
     * Extracts feature articles: 'SummaryHL' headings that are directly
     * followed by a 'FeatureByline' tag. Headings without a byline are skipped.
     *
     * @param DOMDocument $html
     * @return array List of items.
     */
    private function getFeatureContents(&$html){
        $items = array();

        foreach($html->getElementsByTagName('h2') as $title) {
            if($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $byline = $title->nextSibling;
            $this->jumpToNextTag($byline);
            if($this->getNodeClass($byline) !== 'FeatureByline') {
                continue;
            }

            // The author name is the first <b> inside the byline, if any.
            $authorNode = $byline->getElementsByTagName('b')->item(0);

            $item = array(
                'author' => $authorNode !== null ? $authorNode->textContent : null,
                'title' => $title->textContent
            );

            $items[] = array_merge($item, $this->getArticleContent($title));
        }

        return $items;
    }

    /**
     * Builds the "[Cat1/Cat2] " title prefix for an item.
     *
     * Starting at $cat, the method walks backwards through the category
     * headings (Cat3HL, then Cat2HL, then Cat1HL) and records the ones it
     * finds in $cats. $cats persists between calls, so an item without its
     * own headings inherits the categories of the preceding items. Setting a
     * higher-level category resets the lower-level ones not seen in this call.
     *
     * NOTE: the returned prefix already ends with a space and callers append
     * another one before the title; this is kept as-is so that existing item
     * titles do not change.
     *
     * @param DOMNode|null $cat  Node to start from; moved backwards in place.
     * @param array        $cats Current [Cat1, Cat2, Cat3] texts; updated in place.
     * @return string The prefix, or '' when no top-level category is known.
     */
    private function getItemPrefix(&$cat, &$cats){
        $cat1 = '';
        $cat2 = '';
        $cat3 = '';

        switch($this->getNodeClass($cat)) {
            case 'Cat3HL':
                $cat3 = $cat->textContent;
                $cat = $cat->previousSibling;
                $this->jumpToPreviousTag($cat);
                $cats[2] = $cat3;
                if($this->getNodeClass($cat) !== 'Cat2HL') {
                    break;
                }
                // no break: the preceding tag is the enclosing Cat2HL heading
            case 'Cat2HL':
                $cat2 = $cat->textContent;
                $cat = $cat->previousSibling;
                $this->jumpToPreviousTag($cat);
                $cats[1] = $cat2;
                if(empty($cat3)) {
                    $cats[2] = '';
                }
                if($this->getNodeClass($cat) !== 'Cat1HL') {
                    break;
                }
                // no break: the preceding tag is the enclosing Cat1HL heading
            case 'Cat1HL':
                $cat1 = $cat->textContent;
                $cats[0] = $cat1;
                if(empty($cat3)) {
                    $cats[2] = '';
                }
                if(empty($cat2)) {
                    $cats[1] = '';
                }
                break;
            default:
                break;
        }

        $prefix = '';
        if(!empty($cats[0])) {
            $prefix .= '[' . $cats[0] . ($cats[1] ? '/' . $cats[1] : '') . '] ';
        }

        return $prefix;
    }

    /**
     * Extracts every 'SummaryHL' heading as an item with a category prefix.
     *
     * The category lookup starts two tags before the heading.
     *
     * @param DOMDocument $html
     * @param array       $cats Category state shared with getItemPrefix().
     * @return array List of items.
     */
    private function getCategorizedArticles(&$html, &$cats){
        $items = array();

        foreach($html->getElementsByTagName('h2') as $title) {
            if($title->getAttribute('class') !== 'SummaryHL') {
                continue;
            }

            $cat = $this->getPreviousTag($this->getPreviousTag($title));
            $prefix = $this->getItemPrefix($cat, $cats);

            $item = array('title' => $prefix . ' ' . $title->textContent);
            $items[] = array_merge($item, $this->getArticleContent($title));
        }

        return $items;
    }

    /**
     * Extracts the announcements fragment.
     *
     * Every <p class="Cat3HL"> becomes one item authored by 'LWN' whose
     * content runs until the next category heading of any level. The
     * 'SummaryHL' articles of the fragment follow, sharing the category
     * state left by the first pass.
     *
     * @param DOMDocument $html
     * @return array List of items.
     */
    private function getAnnouncements(&$html){
        $items = array();
        $cats = array('','','');

        foreach($html->getElementsByTagName('p') as $newsletters) {
            if($newsletters->getAttribute('class') !== 'Cat3HL') {
                continue;
            }

            $item = array();

            // These items have no link of their own; the running index keeps
            // their URIs distinct.
            $item['uri'] = self::URI . '#' . count($items);
            $item['timestamp'] = $this->editionTimeStamp;
            $item['author'] = 'LWN';

            $cat = $this->getPreviousTag($newsletters);
            $prefix = $this->getItemPrefix($cat, $cats);
            $item['title'] = $prefix . ' ' . $newsletters->textContent;

            $item['content'] = $this->collectContentUntil(
                $newsletters,
                array('Cat1HL', 'Cat2HL', 'Cat3HL')
            );

            $items[] = $item;
        }

        return array_merge($items, $this->getCategorizedArticles($html, $cats));
    }

    /**
     * Extracts the brief items fragment: all 'SummaryHL' articles, each
     * prefixed with its category.
     *
     * @param DOMDocument $html
     * @return array List of items.
     */
    private function getBriefItems(&$html){
        $cats = array('','','');

        return $this->getCategorizedArticles($html, $cats);
    }
}
?>