summaryrefslogtreecommitdiff
path: root/lib/html.php
blob: e49ca7afb6490a1e2fe57c99433196428a09a0a2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
<?php
/**
 * This file is part of RSS-Bridge, a PHP project capable of generating RSS and
 * Atom feeds for websites that don't have one.
 *
 * For the full license information, please view the UNLICENSE file distributed
 * with this source code.
 *
 * @package	Core
 * @license	http://unlicense.org/ UNLICENSE
 * @link	https://github.com/rss-bridge/rss-bridge
 */

/**
 * Strips unwanted tags and attributes from a given HTML text.
 *
 * Every element of the parsed document is handled by the first rule that
 * applies:
 *
 * 1. The tag is listed in `$text_to_keep`: the element is replaced by its
 *    plain text (i.e. `<p>Hello World!</p>` becomes `Hello World!`).
 * 2. The tag is listed in `$tags_to_remove`: the element is removed.
 * 3. Otherwise the element is kept, but every attribute whose name is not
 *    listed in `$attributes_to_keep` is removed from it.
 *
 * @param string $html The HTML text to sanitize.
 * @param array $tags_to_remove A list of tags to remove from the DOM.
 * @param array $attributes_to_keep A list of attributes to keep on tags (other
 * attributes are removed).
 * @param array $text_to_keep A list of tags where the innertext replaces the tag.
 * @return object A simplehtmldom object of the remaining contents.
 *
 * @todo Check if this implementation is still necessary, because simplehtmldom
 * already removes some of the tags (search for `remove_noise` in simple_html_dom.php).
 */
function sanitize($html,
$tags_to_remove = array('script', 'iframe', 'input', 'form'),
$attributes_to_keep = array('title', 'href', 'src'),
$text_to_keep = array()){
	$dom = str_get_html($html);

	/*
	 * simple_html_dom does not support "->find(*)" (known issue:
	 * https://sourceforge.net/p/simplehtmldom/bugs/157/).
	 *
	 * As a workaround all nodes WITHOUT a given attribute are selected. The
	 * attribute "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear
	 * in any DOM, so the selector below essentially returns every node.
	 */
	$all_nodes = $dom->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]');

	foreach($all_nodes as $node) {
		// Rule 1: keep only the text of the element
		if(in_array($node->tag, $text_to_keep)) {
			$node->outertext = $node->plaintext;
			continue;
		}

		// Rule 2: drop the element entirely
		if(in_array($node->tag, $tags_to_remove)) {
			$node->outertext = '';
			continue;
		}

		// Rule 3: keep the element, strip attributes that are not whitelisted
		foreach($node->getAllAttributes() as $name => $unused_value) {
			if(in_array($name, $attributes_to_keep)) {
				continue;
			}

			$node->removeAttribute($name);
		}
	}

	return $dom;
}

/**
 * Replace background by image
 *
 * Replaces tags whose `style` attribute declares a `background-image` by
 * `<img />` tags pointing to that image.
 *
 * For example:
 *
 * ```HTML
 * <html>
 *   <body style="background-image: url('bgimage.jpg');">
 *     <h1>Hello world!</h1>
 *   </body>
 * </html>
 * ```
 *
 * results in this output:
 *
 * ```HTML
 * <html>
 *   <img style="display:block;" src="bgimage.jpg" />
 * </html>
 * ```
 *
 * @param string $htmlContent The HTML content
 * @return object The object returned by `str_get_html()` for the given
 * content, with all occurrences replaced
 */
function backgroundToImg($htmlContent) {

	// Captures the URL given to `url(...)`, with or without surrounding quotes
	$background_pattern = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
	$dom = str_get_html($htmlContent);

	/*
	 * simple_html_dom does not support "->find(*)" (known issue:
	 * https://sourceforge.net/p/simplehtmldom/bugs/157/).
	 *
	 * As a workaround all nodes WITHOUT a given attribute are selected. The
	 * attribute "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear
	 * in any DOM, so the selector below essentially returns every node.
	 */
	$every_node = $dom->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]');

	foreach($every_node as $node) {
		$captures = array();

		// Nodes without a background image are left untouched
		if(preg_match($background_pattern, $node->style, $captures) !== 1) {
			continue;
		}

		// The whole node (including its children) is replaced by the image
		$node->outertext = '<img style="display:block;" src="' . $captures[1] . '" />';
	}

	return $dom;

}

/**
 * Convert relative links in HTML into absolute links
 *
 * The `src` attribute of every `<img>` and the `href` attribute of every `<a>`
 * is passed through `urljoin()` together with `$server`.
 *
 * This function is based on `php-urljoin`.
 *
 * @link https://github.com/plaidfluff/php-urljoin php-urljoin
 *
 * @param string|object $content The HTML content. Supports HTML objects or string objects
 * @param string $server Fully qualified URL to the page containing relative links
 * @return string|object Content with fixed URLs. A string is returned if
 * `$content` was a string, otherwise the (modified) object that was passed in.
 */
function defaultLinkTo($content, $server){
	// Strings are parsed first and converted back to a string before returning
	$was_string = is_string($content);
	$dom = $was_string ? str_get_html($content) : $content;

	// Maps each tag to the attribute holding its URL
	$url_attributes = array(
		'img' => 'src',
		'a' => 'href',
	);

	foreach($url_attributes as $tag => $attribute) {
		foreach($dom->find($tag) as $node) {
			$node->$attribute = urljoin($server, $node->$attribute);
		}
	}

	return $was_string ? $dom->outertext : $dom;
}

/**
 * Extract the first part of a string matching the specified start and end delimiters
 *
 * The end delimiter is searched after the first occurrence of the start
 * delimiter. Neither delimiter is part of the returned value.
 *
 * @param string $string Input string, e.g. `<div>Post author: John Doe</div>`
 * @param string $start Start delimiter, e.g. `author: `
 * @param string $end End delimiter, e.g. `<`
 * @return string|bool Extracted string, e.g. `John Doe`, or false if either
 * the start delimiter or the following end delimiter was not found.
 */
function extractFromDelimiters($string, $start, $end) {
	$start_position = strpos($string, $start);
	if ($start_position === false) {
		return false;
	}

	// Everything following the start delimiter
	$remainder = substr($string, $start_position + strlen($start));

	// Without this check a missing end delimiter would be used as length 0
	// and an empty string would be returned instead of false.
	$end_position = strpos($remainder, $end);
	if ($end_position === false) {
		return false;
	}

	return substr($remainder, 0, $end_position);
}

/**
 * Remove one or more part(s) of a string using start and end delimiters
 *
 * Each removed part begins at a start delimiter and ends with the first end
 * delimiter found from there on (both delimiters are removed as well). A start
 * delimiter that is not followed by an end delimiter is left in place, and
 * the string is returned unchanged if one of the delimiters is empty.
 *
 * @param string $string Input string, e.g. `foo<script>superscript()</script>bar`
 * @param string $start Start delimiter, e.g. `<script`
 * @param string $end End delimiter, e.g. `</script>`
 * @return string Cleaned string, e.g. `foobar`
 */
function stripWithDelimiters($string, $start, $end) {
	// An empty delimiter can never delimit a section and would cause the loop
	// below to never make progress.
	if ($start === '' || $end === '') {
		return $string;
	}

	$end_length = strlen($end);

	while (($start_position = strpos($string, $start)) !== false) {
		$end_position = strpos($string, $end, $start_position);

		// Unterminated section: stop instead of removing an arbitrary chunk
		if ($end_position === false) {
			break;
		}

		$section_to_remove = substr(
			$string,
			$start_position,
			$end_position - $start_position + $end_length
		);

		// Identical sections elsewhere in the string are removed at once
		$string = str_replace($section_to_remove, '', $string);
	}

	return $string;
}

/**
 * Remove HTML sections containing one or more sections using the same HTML tag
 *
 * Starting at each occurrence of `$tag_start`, the section is extended to the
 * next closing tag until it contains at least as many closing tags as opening
 * tags of `$tag_name` (or until 100 closing tags were tried). The resulting
 * section is then removed from the string.
 *
 * The string is returned as is if `$tag_start` does not begin with
 * `<$tag_name`. If a section has no (further) closing tag, processing stops
 * and that section is left in place.
 *
 * @param string $string Input string, e.g. `foo<div class="ads"><div>ads</div>ads</div>bar`
 * @param string $tag_name Name of the HTML tag, e.g. `div`
 * @param string $tag_start Start of the HTML tag to remove, e.g. `<div class="ads">`
 * @return string Cleaned String, e.g. `foobar`
 */
function stripRecursiveHTMLSection($string, $tag_name, $tag_start){
	$open_tag = '<' . $tag_name;
	$close_tag = '</' . $tag_name . '>';
	$close_tag_length = strlen($close_tag);

	// $tag_start must be an opening tag of $tag_name
	if(strpos($tag_start, $open_tag) !== 0) {
		return $string;
	}

	while(($section_start = strpos($string, $tag_start)) !== false) {
		$max_recursion = 100;
		$search_offset = $section_start;

		do {
			$max_recursion--;
			$section_end = strpos($string, $close_tag, $search_offset);

			// Unbalanced HTML: without a closing tag there is no section to
			// remove. Continuing would compute bogus offsets and lengths.
			if($section_end === false) {
				return $string;
			}

			// Continue behind this closing tag if the section is unbalanced
			$search_offset = $section_end + $close_tag_length;
			$section_to_remove = substr($string, $section_start, $search_offset - $section_start);

			$open_tag_count = substr_count($section_to_remove, $open_tag);
			$close_tag_count = substr_count($section_to_remove, $close_tag);
		} while($open_tag_count > $close_tag_count && $max_recursion > 0);

		// Identical sections elsewhere in the string are removed at once
		$string = str_replace($section_to_remove, '', $string);
	}

	return $string;
}

/**
 * Convert Markdown into HTML. Only a subset of the Markdown syntax is implemented.
 *
 * Supported are images, links, bold and italic text (using `*` or `_`),
 * separators (six or more dashes), `&#10;` line breaks as well as plain URLs
 * and site names starting with `www.`. The result is wrapped in a `<div>`.
 *
 * @link https://daringfireball.net/projects/markdown/ Markdown
 * @link https://github.github.com/gfm/ GitHub Flavored Markdown Spec
 *
 * @param string $string Input string in Markdown format
 * @return string output string in HTML format
 */
function markdownToHtml($string) {

	// Each rule is a pattern with its replacement. The order matters: images
	// go before links and bold goes before italic.
	// For more details about how these regex work:
	// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
	$inline_rules = array(
		// Images: https://regex101.com/r/JW9Evr/1
		array('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />'),
		// Links: https://regex101.com/r/eRGVe7/1
		array('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>'),
		// Bold: https://regex101.com/r/2p40Y0/1
		array('/\*\*(.*)\*\*/U', '<b>$1</b>'),
		// Italic: https://regex101.com/r/xJkET9/1
		array('/\*(.*)\*/U', '<i>$1</i>'),
		array('/__(.*)__/U', '<b>$1</b>'),
		array('/_(.*)_/U', '<i>$1</i>'),
		// Separator: https://regex101.com/r/ZBEqFP/1
		array('/[-]{6,99}/', '<hr />'),
	);

	foreach($inline_rules as $rule) {
		list($pattern, $replacement) = $rule;
		$string = preg_replace($pattern, $replacement, $string);
	}

	$string = str_replace('&#10;', '<br />', $string);

	// Both patterns need a character behind the match, hence the appended
	// space (removed again by trim() at the end).
	// Plain URL: https://regex101.com/r/2JHYwb/1
	$string = preg_replace('/([^"])(https?:\/\/[^ "<]+)([^"])/', '$1<a href="$2">$2</a>$3', $string . ' ');
	// Site name: https://regex101.com/r/qIuKYE/1
	$string = preg_replace('/([^"\/])(www\.[^ "<]+)([^"])/', '$1<a href="http://$2">$2</a>$3', $string . ' ');

	// As the regex are not perfect, the italic rule also turns underscores
	// inside of URLs into <i> and </i>. These are turned back into underscores.
	$url_fixups = array(
		// Fixup regex <i>: https://regex101.com/r/NTRPf6/1
		'/ (src|href)="([^"]+)<i>([^"]+)"/U',
		// Fixup regex </i>: https://regex101.com/r/aNklRp/1
		'/ (src|href)="([^"]+)<\/i>([^"]+)"/U',
	);

	foreach($url_fixups as $pattern) {
		// One pass fixes one tag per URL only, so repeat until nothing changes
		do {
			$string = preg_replace($pattern, ' $1="$2_$3"', $string, -1, $count);
		} while($count > 0);
	}

	return '<div>' . trim($string) . '</div>';
}