mirror of
https://github.com/friendica/friendica
synced 2025-04-25 19:10:11 +00:00
New function to convert bbcode to markdown.
This commit is contained in:
parent
7c2d27e275
commit
3968e77f9e
10 changed files with 2939 additions and 19 deletions
618
include/markdownify/parsehtml/parsehtml.php
Normal file
618
include/markdownify/parsehtml/parsehtml.php
Normal file
|
@ -0,0 +1,618 @@
|
|||
<?php
|
||||
/**
|
||||
* parseHTML is a HTML parser which works with PHP 4 and above.
|
||||
* It tries to handle invalid HTML to some degree.
|
||||
*
|
||||
* @version 1.0 beta
|
||||
* @author Milian Wolff (mail@milianw.de, http://milianw.de)
|
||||
* @license LGPL, see LICENSE_LGPL.txt and the summary below
|
||||
* @copyright (C) 2007 Milian Wolff
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
class parseHTML {
    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    var $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );
    /**
     * tags with preformatted text
     * whitespace inside them is left untouched
     *
     * @var array<string>
     */
    var $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );
    /**
     * suppress HTML tags inside <code> (they are escaped and handed out as text)
     *
     * @var bool
     */
    var $noTagsInCode = false;
    /**
     * html to be parsed; nextNode() consumes it from the front
     *
     * @var string
     */
    var $html = '';
    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    var $nodeType = '';
    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    var $node = '';
    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    var $isStartTag = null;
    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    var $isEmptyTag = null;
    /**
     * tag name
     *
     * @var string | null
     */
    var $tagName = '';
    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    var $tagAttributes = null;
    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    var $isBlockElement = null;

    /**
     * keep whitespace: number of currently open preformatted tags,
     * whitespace is only collapsed while this is 0
     *
     * @var int
     */
    var $keepWhitespace = 0;
    /**
     * whether a text node consisting of a single space should be skipped
     * (true after block level tags, comments, doctypes and cdata sections).
     * Kept per instance so that one parser run cannot influence another.
     *
     * @var bool
     */
    var $skipWhitespace = true;
    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    var $openTags = array();
    /**
     * list of known elements; tags that are not listed here are treated as invalid
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?! (currently treated as inline)
     */
    var $blockElements = array (
        # tag name => <bool> is block
        # block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        # set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        # header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        # unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        # inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );
    /**
     * get next node, set $this->html prior!
     *
     * Cuts the next node off the front of $this->html and stores it in
     * $this->node / $this->nodeType (plus the tag specific members for tags).
     *
     * @param void
     * @return bool false once the input is used up, true otherwise
     */
    function nextNode() {
        # compare with '' explicitly: empty() is also true for "0", which
        # would silently drop a trailing "0" from the input
        if ((string) $this->html === '') {
            # we are done with parsing the html string
            return false;
        }
        # the tag returned by the previous call is registered as open only now,
        # so that callers still see the old depth while handling that tag
        if ($this->isStartTag && !$this->isEmptyTag) {
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags)) {
                # dont truncate whitespaces for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] == '<') {
            $token = substr($this->html, 0, 9);
            if (substr($token, 0, 2) == '<?') {
                # xml prolog or other pi's
                /** TODO **/
                #trigger_error('this might need some work', E_USER_NOTICE);
                $this->setNode('pi', $this->offsetAfter('>'));
                return true;
            }
            if (substr($token, 0, 4) == '<!--') {
                # comment
                $pos = strpos($this->html, '-->');
                if ($pos === false) {
                    # could not find a closing -->, use next gt instead
                    # this is firefox' behaviour
                    $pos = $this->offsetAfter('>');
                } else {
                    $pos += 3;
                }
                $this->setNode('comment', $pos);

                $this->skipWhitespace = true;
                return true;
            }
            if ($token == '<!DOCTYPE') {
                # doctype
                $this->setNode('doctype', $this->offsetAfter('>'));

                $this->skipWhitespace = true;
                return true;
            }
            if ($token == '<![CDATA[') {
                # cdata, use text node

                # remove leading <![CDATA[
                $this->html = (string) substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    # unterminated section: everything that is left is its content
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    # remove trailing ]]>
                    $this->node = substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                $this->skipWhitespace = true;
                return true;
            }
            if ($this->parseTag()) {
                # seems to be a tag
                # handle whitespaces: a lone space directly after a block
                # level tag carries no meaning and is skipped later on
                $this->skipWhitespace = $this->isBlockElement ? true : false;
                return true;
            }
            # not a valid tag: parseTag() escaped the '<', treat it as text
        }
        if ($this->keepWhitespace) {
            $this->skipWhitespace = false;
        }
        # when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if ($this->skipWhitespace && $this->node == ' ') {
            return $this->nextNode();
        }
        $this->skipWhitespace = false;
        return true;
    }
    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * On failure the leading '<' of $this->html is escaped (see invalidTag())
     * and nothing else is changed.
     *
     * @param void
     * @return bool true if a valid tag was cut off $this->html
     */
    function parseTag() {
        static $a_ord, $z_ord, $special_ords;
        if (!isset($a_ord)) {
            $a_ord = ord('a');
            $z_ord = ord('z');
            $special_ords = array(
                ord(':'), // for xml:lang
                ord('-'), // for http-equiv
            );
        }

        $tagName = '';

        $pos = 1;
        # substr() instead of an offset read: the input may be a lone '<'
        $isStartTag = substr($this->html, $pos, 1) !== '/';
        if (!$isStartTag) {
            $pos++;
        }
        # get tagName: letters, digits are only allowed after the first letter
        while (isset($this->html[$pos])) {
            $pos_ord = ord(strtolower($this->html[$pos]));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || ($tagName !== '' && is_numeric($this->html[$pos]))) {
                $tagName .= $this->html[$pos];
                $pos++;
            } else {
                # step back onto the last character of the name
                $pos--;
                break;
            }
        }

        $tagName = strtolower($tagName);
        if ($tagName === '' || !isset($this->blockElements[$tagName])) {
            # something went wrong => invalid tag
            $this->invalidTag();
            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
            # we suppress all HTML tags inside code tags
            $this->invalidTag();
            return false;
        }

        # get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        while (isset($this->html[$pos+1])) {
            $pos++;
            $char = $this->html[$pos];
            # current and following character; only one character long at the
            # very end of the input (no out of range offset is read)
            $pair = substr($this->html, $pos, 2);
            # close tag
            if ($char == '>' || $pair == '/>') {
                if ($char == '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
                # attribute name
                $currAttrib .= $char;
            } elseif (in_array($char, array(' ', "\t", "\n", "\r"))) {
                # drop whitespace ("\r" included so that CRLF input works)
            } elseif ($pair == '="' || $pair == "='") {
                # get attribute value
                $pos++;
                $await = $this->html[$pos]; # single or double quote
                $pos++;
                $value = '';
                while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                    $value .= $this->html[$pos];
                    $pos++;
                }
                $attributes[$currAttrib] = $value;
                $currAttrib = '';
            } else {
                $this->invalidTag();
                return false;
            }
        }
        # $pos may point behind the input if the tag was never closed
        if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
            $this->invalidTag();
            return false;
        }

        if ($currAttrib !== '') {
            # html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName != end($this->openTags)) {
                # end tags must not contain any attributes
                # or maybe we did not expect a different tag to be closed
                $this->invalidTag();
                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = substr($this->html, 0, $pos);
        $this->html = substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
        if ($this->isEmptyTag) {
            # might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        return true;
    }
    /**
     * handle invalid tags
     *
     * Escapes the leading '<' of $this->html. The escaped text no longer
     * starts with '<', so nextNode() hands it out as a text node and the
     * parser always makes progress.
     *
     * @param void
     * @return void
     */
    function invalidTag() {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }
    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    function setNode($type, $pos) {
        if ($this->nodeType == 'tag') {
            # set tag specific vars to null
            # $type == tag should not be called here
            # see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;

        }
        $this->nodeType = $type;
        $this->node = substr($this->html, 0, $pos);
        $this->html = substr($this->html, $pos);
    }
    /**
     * position right behind the first occurrence of $needle in $this->html
     *
     * @param string $needle
     * @return int offset behind $needle, or the length of $this->html if
     *             $needle does not occur (unterminated construct)
     */
    function offsetAfter($needle) {
        $pos = strpos($this->html, $needle);
        if ($pos === false) {
            return strlen($this->html);
        }
        return $pos + strlen($needle);
    }
    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    function match($str) {
        return substr($this->html, 0, strlen($str)) == $str;
    }
    /**
     * truncate whitespaces
     *
     * @param void
     * @return void
     */
    function handleWhitespaces() {
        if ($this->keepWhitespace) {
            # <pre> or <code> before...
            return;
        }
        # truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
    /**
     * normalize self::node
     *
     * Rebuilds the markup of the current tag from tagName / tagAttributes:
     * lower case tag name, double quoted attribute values, ' /' for empty tags.
     *
     * @param void
     * @return void
     */
    function normalizeNode() {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/'.$this->tagName.'>';
            return;
        }
        $this->node .= $this->tagName;
        foreach ($this->tagAttributes as $name => $value) {
            # values are always written in double quotes, so embedded ones must be escaped
            $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }
}
|
||||
|
||||
/**
 * indent a HTML string properly
 *
 * The markup is run through parseHTML node by node: every tag is normalized,
 * block level tags are put on a line of their own and prefixed with one
 * $indent per enclosing block element, inline content is appended as is.
 * Nothing is re-indented while the parser reports keepWhitespace.
 *
 * @param string $html markup to indent
 * @param string $indent optional, prefix added per nesting level
 * @param bool $noTagsInCode optional, passed on to parseHTML::$noTagsInCode
 * @return string
 */
function indentHTML($html, $indent = " ", $noTagsInCode = false) {
    $reader = new parseHTML;
    $reader->noTagsInCode = $noTagsInCode;
    $reader->html = $html;

    $output = '';
    # one entry per open block element, joined to build the line prefix
    $levels = array();
    # true while the output ends with a line break written by this function
    $atLineStart = true;

    while ($reader->nextNode()) {
        $isTag = ($reader->nodeType == 'tag');
        if ($isTag) {
            $reader->normalizeNode();
        }

        if ($isTag && $reader->isBlockElement) {
            $isPreOrCode = ($reader->tagName == 'code' || $reader->tagName == 'pre');

            # finish the running inline content before the block tag
            if (!$reader->keepWhitespace && !$atLineStart && !$isPreOrCode) {
                $output = rtrim($output)."\n";
            }

            if (!$reader->isStartTag) {
                # closing tag: leave the level first, then indent like the opening tag
                array_pop($levels);
                if (!$isPreOrCode) {
                    $output .= implode('', $levels);
                }
            } else {
                $output .= implode('', $levels);
                if (!$reader->isEmptyTag) {
                    $levels[] = $indent;
                }
            }

            $output .= $reader->node;

            # no line break right after an opening <pre>/<code>, it would become content
            $opensPreOrCode = $isPreOrCode && $reader->isStartTag;
            if (!$reader->keepWhitespace && !$opensPreOrCode) {
                $output .= "\n";
            }
            $atLineStart = true;
            continue;
        }

        if ($isTag && $reader->tagName == 'br') {
            # a line break in the markup is a line break in the output as well
            $output .= $reader->node."\n";
            $atLineStart = true;
            continue;
        }

        if ($atLineStart && !$reader->keepWhitespace) {
            $output .= implode('', $levels);
            $reader->node = ltrim($reader->node);
        }
        $output .= $reader->node;

        switch ($reader->nodeType) {
            case 'comment':
            case 'pi':
            case 'doctype':
                # these get a line of their own
                $output .= "\n";
                break;
            default:
                $atLineStart = false;
        }
    }

    return $output;
}
|
||||
/*
|
||||
# testcase / example
|
||||
error_reporting(E_ALL);
|
||||
|
||||
$html = '<p>Simple block on one line:</p>
|
||||
|
||||
<div>foo</div>
|
||||
|
||||
<p>And nested without indentation:</p>
|
||||
|
||||
<div>
|
||||
<div>
|
||||
<div>
|
||||
foo
|
||||
</div>
|
||||
<div style=">"/>
|
||||
</div>
|
||||
<div>bar</div>
|
||||
</div>
|
||||
|
||||
<p>And with attributes:</p>
|
||||
|
||||
<div>
|
||||
<div id="foo">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<p>This was broken in 1.0.2b7:</p>
|
||||
|
||||
<div class="inlinepage">
|
||||
<div class="toggleableend">
|
||||
foo
|
||||
</div>
|
||||
</div>';
|
||||
#$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
|
||||
echo indentHTML($html);
|
||||
die();
|
||||
*/
|
Loading…
Add table
Add a link
Reference in a new issue