The "scrape" bugfix led to partly damaged encodings. This is fixed now.

This commit is contained in:
Michael Vogel 2015-01-04 11:56:41 +01:00
parent 3195bacd9e
commit a86c143e24
5 changed files with 19 additions and 11 deletions

View file

@ -20,7 +20,12 @@ class HTML5_Parser
// Cleanup invalid HTML
$doc = new DOMDocument();
@$doc->loadHTML($text);
if (mb_detect_encoding($text, "UTF-8", true) == "UTF-8")
@$doc->loadHTML('<?xml encoding="UTF-8" ?>'.$text);
else
@$doc->loadHTML($text);
$text = $doc->saveHTML();
$tokenizer = new HTML5_Tokenizer($text, $builder);