mirror of
https://github.com/friendica/friendica
synced 2025-05-11 05:44:10 +02:00
Move HTML to Markdown library to Composer
This commit is contained in:
parent
1ab28bbe03
commit
c9dafe3b4e
43 changed files with 2380 additions and 776 deletions
231
vendor/league/html-to-markdown/src/HtmlConverter.php
vendored
Normal file
231
vendor/league/html-to-markdown/src/HtmlConverter.php
vendored
Normal file
|
@ -0,0 +1,231 @@
|
|||
<?php
|
||||
|
||||
namespace League\HTMLToMarkdown;
|
||||
|
||||
/**
|
||||
* Class HtmlConverter
|
||||
*
|
||||
* A helper class to convert HTML to Markdown.
|
||||
*
|
||||
* @author Colin O'Dell <colinodell@gmail.com>
|
||||
* @author Nick Cernis <nick@cern.is>
|
||||
*
|
||||
* @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
|
||||
*
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT
|
||||
*/
|
||||
class HtmlConverter
|
||||
{
|
||||
/**
|
||||
* @var Environment
|
||||
*/
|
||||
protected $environment;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param Environment|array $options Environment object or configuration options
|
||||
*/
|
||||
public function __construct($options = array())
|
||||
{
|
||||
if ($options instanceof Environment) {
|
||||
$this->environment = $options;
|
||||
} elseif (is_array($options)) {
|
||||
$defaults = array(
|
||||
'header_style' => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
|
||||
'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
|
||||
'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
|
||||
'bold_style' => '**', // Set to '__' if you prefer the underlined style
|
||||
'italic_style' => '_', // Set to '*' if you prefer the asterisk style
|
||||
'remove_nodes' => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
|
||||
'hard_break' => false,// Set to true to turn <br> into `\n` instead of ` \n`
|
||||
);
|
||||
|
||||
$this->environment = Environment::createDefaultEnvironment($defaults);
|
||||
|
||||
$this->environment->getConfig()->merge($options);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Environment
|
||||
*/
|
||||
public function getEnvironment()
|
||||
{
|
||||
return $this->environment;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Configuration
|
||||
*/
|
||||
public function getConfig()
|
||||
{
|
||||
return $this->environment->getConfig();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert
|
||||
*
|
||||
* @see HtmlConverter::convert
|
||||
*
|
||||
* @param string $html
|
||||
*
|
||||
* @return string The Markdown version of the html
|
||||
*/
|
||||
public function __invoke($html)
|
||||
{
|
||||
return $this->convert($html);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert
|
||||
*
|
||||
* Loads HTML and passes to getMarkdown()
|
||||
*
|
||||
* @param string $html
|
||||
*
|
||||
* @throws \InvalidArgumentException
|
||||
*
|
||||
* @return string The Markdown version of the html
|
||||
*/
|
||||
public function convert($html)
|
||||
{
|
||||
if (trim($html) === '') {
|
||||
return '';
|
||||
}
|
||||
|
||||
$document = $this->createDOMDocument($html);
|
||||
|
||||
// Work on the entire DOM tree (including head and body)
|
||||
if (!($root = $document->getElementsByTagName('html')->item(0))) {
|
||||
throw new \InvalidArgumentException('Invalid HTML was provided');
|
||||
}
|
||||
|
||||
$rootElement = new Element($root);
|
||||
$this->convertChildren($rootElement);
|
||||
|
||||
// Store the now-modified DOMDocument as a string
|
||||
$markdown = $document->saveHTML();
|
||||
|
||||
return $this->sanitize($markdown);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $html
|
||||
*
|
||||
* @return \DOMDocument
|
||||
*/
|
||||
private function createDOMDocument($html)
|
||||
{
|
||||
$document = new \DOMDocument();
|
||||
|
||||
if ($this->getConfig()->getOption('suppress_errors')) {
|
||||
// Suppress conversion errors (from http://bit.ly/pCCRSX)
|
||||
libxml_use_internal_errors(true);
|
||||
}
|
||||
|
||||
// Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
|
||||
$document->loadHTML('<?xml encoding="UTF-8">' . $html);
|
||||
$document->encoding = 'UTF-8';
|
||||
|
||||
if ($this->getConfig()->getOption('suppress_errors')) {
|
||||
libxml_clear_errors();
|
||||
}
|
||||
|
||||
return $document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Children
|
||||
*
|
||||
* Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
|
||||
*
|
||||
* Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
|
||||
* starting with the innermost element and working up to the outermost element.
|
||||
*
|
||||
* @param ElementInterface $element
|
||||
*/
|
||||
private function convertChildren(ElementInterface $element)
|
||||
{
|
||||
// Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
|
||||
// except if the current node is a code tag, which needs to be converted by the CodeConverter.
|
||||
if ($element->isDescendantOf(array('pre', 'code')) && $element->getTagName() !== 'code') {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the node has children, convert those to Markdown first
|
||||
if ($element->hasChildren()) {
|
||||
foreach ($element->getChildren() as $child) {
|
||||
$this->convertChildren($child);
|
||||
}
|
||||
}
|
||||
|
||||
// Now that child nodes have been converted, convert the original node
|
||||
$markdown = $this->convertToMarkdown($element);
|
||||
|
||||
// Create a DOM text node containing the Markdown equivalent of the original node
|
||||
|
||||
// Replace the old $node e.g. '<h3>Title</h3>' with the new $markdown_node e.g. '### Title'
|
||||
$element->setFinalMarkdown($markdown);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert to Markdown
|
||||
*
|
||||
* Converts an individual node into a #text node containing a string of its Markdown equivalent.
|
||||
*
|
||||
* Example: An <h3> node with text content of 'Title' becomes a text node with content of '### Title'
|
||||
*
|
||||
* @param ElementInterface $element
|
||||
*
|
||||
* @return string The converted HTML as Markdown
|
||||
*/
|
||||
protected function convertToMarkdown(ElementInterface $element)
|
||||
{
|
||||
$tag = $element->getTagName();
|
||||
|
||||
// Strip nodes named in remove_nodes
|
||||
$tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
|
||||
if (in_array($tag, $tags_to_remove)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$converter = $this->environment->getConverterByTag($tag);
|
||||
|
||||
return $converter->convert($element);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $markdown
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
protected function sanitize($markdown)
|
||||
{
|
||||
$markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
|
||||
$markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
|
||||
$markdown = trim($markdown); // Remove blank spaces at the beggining of the html
|
||||
|
||||
/*
|
||||
* Removing unwanted tags. Tags should be added to the array in the order they are expected.
|
||||
* XML, html and body opening tags should be in that order. Same case with closing tags
|
||||
*/
|
||||
$unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '
');
|
||||
|
||||
foreach ($unwanted as $tag) {
|
||||
if (strpos($tag, '/') === false) {
|
||||
// Opening tags
|
||||
if (strpos($markdown, $tag) === 0) {
|
||||
$markdown = substr($markdown, strlen($tag));
|
||||
}
|
||||
} else {
|
||||
// Closing tags
|
||||
if (strpos($markdown, $tag) === strlen($markdown) - strlen($tag)) {
|
||||
$markdown = substr($markdown, 0, -strlen($tag));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return trim($markdown, "\n\r\0\x0B");
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue