summary | refs | log | tree | commit | diff
path: root/lib/html.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/html.php')
-rw-r--r-- lib/html.php 33
1 files changed, 6 insertions, 27 deletions
diff --git a/lib/html.php b/lib/html.php
index e49ca7a..13db97a 100644
--- a/lib/html.php
+++ b/lib/html.php
@@ -26,23 +26,13 @@
* already removes some of the tags (search for `remove_noise` in simple_html_dom.php).
*/
function sanitize($html,
-$tags_to_remove = array('script', 'iframe', 'input', 'form'),
-$attributes_to_keep = array('title', 'href', 'src'),
-$text_to_keep = array()){
+ $tags_to_remove = array('script', 'iframe', 'input', 'form'),
+ $attributes_to_keep = array('title', 'href', 'src'),
+ $text_to_keep = array()){
+
$htmlContent = str_get_html($html);
- /*
- * Notice: simple_html_dom currently doesn't support "->find(*)", which is a
- * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
- *
- * A solution to this is to find all nodes WITHOUT a specific attribute. If
- * the attribute is very unlikely to appear in the DOM, this is essentially
- * returning all nodes.
- *
- * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
- * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
- */
- foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
+ foreach($htmlContent->find('*') as $element) {
if(in_array($element->tag, $text_to_keep)) {
$element->outertext = $element->plaintext;
} elseif(in_array($element->tag, $tags_to_remove)) {
@@ -89,18 +79,7 @@ function backgroundToImg($htmlContent) {
$regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
$htmlContent = str_get_html($htmlContent);
- /*
- * Notice: simple_html_dom currently doesn't support "->find(*)", which is a
- * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
- *
- * A solution to this is to find all nodes WITHOUT a specific attribute. If
- * the attribute is very unlikely to appear in the DOM, this is essentially
- * returning all nodes.
- *
- * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
- * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
- */
- foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
+ foreach($htmlContent->find('*') as $element) {
if(preg_match($regex, $element->style, $matches) > 0) {