Purify HTML before trying to parse wild URLs. This way it should at least parse.

2024-11-10 05:02:58 +00:00 · 2011-07-04 23:02:04 -07:00 · 2011-07-04 23:02:04 -07:00 · 24d41e2c6e
commit 24d41e2c6e
parent 92831c9416
1 changed files with 13 additions and 4 deletions
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -1,6 +1,7 @@
 <?php

 require_once('library/HTML5/Parser.php');
+require_once('library/HTMLPurifier.auto.php');

 function parse_url_content(&$a) {

@@ -31,16 +32,25 @@ function parse_url_content(&$a) {
 		killme();
 	}

+	logger('parse_url: data: ' . $s, LOGGER_DATA);

 	if(! $s) {
 		echo sprintf($template,$url,$url,'');
 		killme();
 	}

+	$config = HTMLPurifier_Config::createDefault();
+	$config->set('Cache.DefinitionImpl', null);
+
+	$purifier = new HTMLPurifier($config);
+	$s = $purifier->purify($s);
+
 	$dom = @HTML5_Parser::parse($s);

-	if(! $dom)
-		return $ret;
+	if(! $dom) {
+		echo sprintf($template,$url,$url,'');
+		killme();
+	}

 	$items = $dom->getElementsByTagName('title');

@@ -51,7 +61,6 @@ function parse_url_content(&$a) {
 		}
 	}

-
 	$divs = $dom->getElementsByTagName('div');
 	if($divs) {
 		foreach($divs as $div) {
@@ -94,6 +103,6 @@ function parse_url_content(&$a) {
 		$text = '<br />' . $text;
 	}

-	echo sprintf($template,$url,$title,$text);
+	echo sprintf($template,$url,($title) ? $title : $url,$text);
 	killme();
 }