refactor language detection

This commit is contained in:
nobody 2022-07-05 11:34:07 +10:00
parent ed0d4048c1
commit 9dc37df489
7 changed files with 93 additions and 96 deletions

View file

@ -3768,8 +3768,11 @@ class Activity
return;
}
$plaintext = prepare_text($item['body'],((isset($item['mimetype'])) ? $item['mimetype'] : 'text/x-multicode'));
$plaintext = html2plain((isset($item['title']) && $item['title']) ? $item['title'] . ' ' . $plaintext : $plaintext);
if ($channel['channel_system']) {
if (!MessageFilter::evaluate($item, get_config('system', 'pubstream_incl'), get_config('system', 'pubstream_excl'))) {
if (!MessageFilter::evaluate($item, get_config('system', 'pubstream_incl'), get_config('system', 'pubstream_excl'), ['plaintext' => $plaintext])) {
logger('post is filtered');
return;
}

View file

@ -0,0 +1,49 @@
<?php
namespace Code\Lib;
require_once('library/text_languagedetect/Text/LanguageDetect.php');
use Text_LanguageDetect;
/**
 * Thin wrapper around the PEAR Text_LanguageDetect library.
 *
 * @see http://pear.php.net/package/Text_LanguageDetect
 *
 * @TODO: The PEAR library is no longer being maintained and has had recent issues loading with composer (2020-06-29).
 * This project: https://github.com/patrickschur/language-detection *may* be useful as a replacement.
 */
class LanguageDetect
{
    // Strings shorter than this many characters are not examined at all.
    const MINLENGTH = 48;

    // Detection results with a confidence below this value are discarded.
    const MINCONFIDENCE = 0.01;

    /**
     * Try to identify the language a string is written in.
     *
     * @param string $string A string to examine
     * @return string Language code in 2-letter ISO 639-1 (en, de, fr) format,
     *                or '' when the string is too short, detection fails, or
     *                the confidence is below self::MINCONFIDENCE.
     */
    public function detect($string)
    {
        // Tolerate null/non-string input instead of tripping mb_strlen().
        $string = (string) $string;

        // Check the length first so short strings never pay for loading
        // the detector's language database.
        if (mb_strlen($string) < self::MINLENGTH) {
            return '';
        }

        $result = null;

        try {
            $detector = new Text_LanguageDetect();
            // return 2-letter ISO 639-1 (en) language code
            $detector->setNameMode(2);
            $result = $detector->detectConfidence($string);
        } catch (\Text_LanguageDetect_Exception $e) {
            // The exception class lives in the global namespace; it must be
            // fully qualified here or the catch would never match.
            // A detection failure is treated as "language unknown".
            return '';
        }

        if (!$result || !isset($result['language'])) {
            return '';
        }

        if (($result['confidence'] ?? 0) < self::MINCONFIDENCE) {
            return '';
        }

        return $result['language'];
    }
}

View file

@ -1666,6 +1666,10 @@ class Libzot
// if any further changes are to be made, change a copy and not the original
$arr = $msg_arr;
$plaintext = prepare_text($arr['body'],((isset($arr['mimetype'])) ? $arr['mimetype'] : 'text/x-multicode'));
$plaintext = html2plain((isset($arr['title']) && $arr['title']) ? $arr['title'] . ' ' . $plaintext : $plaintext);
$DR = new DReport(z_root(), $sender, $d, $arr['mid']);
$channel = Channel::from_hash($d);
@ -1770,7 +1774,7 @@ class Libzot
$local_public = false;
continue;
}
if (!MessageFilter::evaluate($arr, get_config('system', 'pubstream_incl'), get_config('system', 'pubstream_excl'))) {
if (!MessageFilter::evaluate($arr, get_config('system', 'pubstream_incl'), get_config('system', 'pubstream_excl'), ['plaintext' => $plaintext])) {
$local_public = false;
continue;
}

View file

@ -7,21 +7,27 @@ require_once('include/html2plain.php');
class MessageFilter
{
public static function evaluate($item, $incl, $excl)
public static function evaluate($item, $incl, $excl, $opts = [])
{
// Option: plaintext
// Improve language detection by providing a plaintext version of $item['body'] which has no markup constructs/tags.
$text = prepare_text($item['body'],((isset($item['mimetype'])) ? $item['mimetype'] : 'text/x-multicode'));
$text = html2plain((isset($item['title']) && $item['title']) ? $item['title'] . ' ' . $text : $text);
if (array_key_exists('plaintext', $opts)) {
$text = $opts['plaintext'];
}
else {
$text = $item['body'];
}
$lang = null;
// Language matching is a bit tricky, because the language can be ambiguous (detect_language() returns '').
// If the language is ambiguous, the message will be accepted regardless of language rules.
// If the language is ambiguous, the message will pass (be accepted) regardless of language rules.
if ((strpos($incl, 'lang=') !== false) || (strpos($excl, 'lang=') !== false) || (strpos($incl, 'lang!=') !== false) || (strpos($excl, 'lang!=') !== false)) {
$lang = detect_language($text);
$detector = new LanguageDetect();
$lang = $detector->detect($text);
}
$tags = ((isset($item['term']) && is_array($item['term']) && count($item['term'])) ? $item['term'] : false);
@ -68,9 +74,9 @@ class MessageFilter
if (self::test_condition(substr($word, 1), $item)) {
return false;
}
} elseif ((strpos($word, '/') === 0) && preg_match($word, $text)) {
} elseif ((strpos($word, '/') === 0) && preg_match($word, $item['body'])) {
return false;
} elseif (stristr($text, $word) !== false) {
} elseif (stristr($item['body'], $word) !== false) {
return false;
}
}

View file

@ -13,8 +13,10 @@ use Code\Lib\ActivityStreams;
use Code\Lib\Apps;
use Code\Lib\Enotify;
use Code\Lib\Channel;
use Code\Lib\LanguageDetect;
use Code\Lib\MarkdownSoap;
use Code\Lib\MessageFilter;
use Code\Lib\Config;
use Code\Lib\IConfig;
use Code\Lib\PConfig;
use Code\Lib\LibBlock;
@ -1570,8 +1572,11 @@ function item_store($arr, $allow_exec = false, $deliver = true, $linkid = true)
// obsolete, but needed so as not to throw not-null constraints on some database drivers
$arr['item_flags'] = ((x($arr,'item_flags')) ? intval($arr['item_flags']) : 0 );
$languagetext = prepare_text($arr['body'],((isset($arr['mimetype'])) ? $arr['mimetype'] : 'text/x-multicode'));
$languagetext = html2plain((isset($arr['title']) && $arr['title']) ? $arr['title'] . ' ' . $languagetext : $languagetext);
$arr['lang'] = detect_language($arr['body']);
$detector = new LanguageDetect();
$arr['lang'] = $detector->detect($languagetext);
// apply the input filter here
@ -2080,7 +2085,11 @@ function item_store_update($arr, $allow_exec = false, $deliver = true, $linkid =
return $ret;
}
$arr['lang'] = detect_language($arr['body']);
$languagetext = prepare_text($arr['body'],((isset($arr['mimetype'])) ? $arr['mimetype'] : 'text/x-multicode'));
$languagetext = html2plain((isset($arr['title']) && $arr['title']) ? $arr['title'] . ' ' . $languagetext : $languagetext);
$detector = new LanguageDetect();
$arr['lang'] = $detector->detect($languagetext);
// apply the input filter here
@ -3438,10 +3447,13 @@ function post_is_importable($channel_id, $item, $abook) {
return true;
}
$text = prepare_text($item['body'],((isset($item['mimetype'])) ? $item['mimetype'] : 'text/x-multicode'));
$text = html2plain((isset($item['title']) && $item['title']) ? $item['title'] . ' ' . $text : $text);
$incl = PConfig::get($channel_id, 'system', 'message_filter_incl', EMPTY_STR);
$excl = PConfig::get($channel_id, 'system', 'message_filter_excl', EMPTY_STR);
if ($incl || $excl) {
$x = MessageFilter::evaluate($item, $incl, $excl);
$x = MessageFilter::evaluate($item, $incl, $excl, ['plaintext' => $text]);
if (! $x) {
logger('MessageFilter: channel blocked content', LOGGER_DEBUG, LOG_INFO);
return false;
@ -3460,7 +3472,7 @@ function post_is_importable($channel_id, $item, $abook) {
if (! ($ab['abook_incl'] || $ab['abook_excl']) ) {
continue;
}
$evaluator = MessageFilter::evaluate($item, $ab['abook_incl'], $ab['abook_excl']);
$evaluator = MessageFilter::evaluate($item, $ab['abook_incl'], $ab['abook_excl'], ['plaintext' => $text]);
// A negative assessment for any individual connections is an instant fail.
if (! $evaluator) {
return false;

View file

@ -279,70 +279,6 @@ function string_plural_select_default($n)
return ($n != 1);
}
/**
 * @brief Takes a string and tries to identify the language.
 *
 * It uses the pear library Text_LanguageDetect and it can identify 52 human languages.
 * Only the code of the best matching language is returned; the confidence score
 * reported by the library is used to decide whether that match is accepted.
 *
 * Strings need to have a min length config['system']['language_detect_min_length']
 * and you can influence the confidence that must be met before a result will get
 * returned through config['system']['language_detect_min_confidence'].
 *
 * @see http://pear.php.net/package/Text_LanguageDetect
 * @param string $s A string to examine
 * @return string Language code in 2-letter ISO 639-1 (en, de, fr) format, or ''
 *                when the string is too short, detection fails, or the
 *                confidence threshold is not met.
 *
 * @TODO: The PEAR library is no longer being maintained and has had recent issues loading with composer (2020-06-29).
 * This project: https://github.com/patrickschur/language-detection *may* be useful as a replacement.
 *
 */
function detect_language($s)
{
    require_once('library/text_languagedetect/Text/LanguageDetect.php');

    // Fall back to the compiled-in defaults when the site has not configured a value.
    $min_length = get_config('system', 'language_detect_min_length');
    if ($min_length === false) {
        $min_length = LANGUAGE_DETECT_MIN_LENGTH;
    }

    $min_confidence = get_config('system', 'language_detect_min_confidence');
    if ($min_confidence === false) {
        $min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE;
    }

    // embedded apps have long base64 strings which will trip up the detector.
    $naked_body = preg_replace('/\[app\](.*?)\[\/app\]/', '', $s);

    // strip off bbcode
    $naked_body = preg_replace('/\[(.+?)\]/', '', $naked_body);

    if (mb_strlen($naked_body) < intval($min_length)) {
        logger('string length less than ' . intval($min_length), LOGGER_DATA);
        return '';
    }

    $l = new Text_LanguageDetect();

    // Pre-initialise so the checks below never read an undefined variable
    // when the detector throws.
    $lng = null;

    try {
        // return 2-letter ISO 639-1 (en) language code
        $l->setNameMode(2);
        $lng = $l->detectConfidence($naked_body);
        logger('detect language: ' . print_r($lng, true) . $naked_body, LOGGER_DATA);
    } catch (Text_LanguageDetect_Exception $e) {
        logger('detect language exception: ' . $e->getMessage(), LOGGER_DATA);
    }

    if ((! $lng) || (! (x($lng, 'language')))) {
        return '';
    }

    if ($lng['confidence'] < (float) $min_confidence) {
        logger('detect language: confidence less than ' . (float) $min_confidence, LOGGER_DATA);
        return '';
    }

    return($lng['language']);
}
/**
* @brief Returns the display name of a given language code.
*

View file

@ -3,12 +3,8 @@
namespace Code\Tests\Unit\Lib;
use Code\Tests\Unit\UnitTestCase;
use phpmock\phpunit\PHPMock;
use Code\Lib\MessageFilter;
include 'boot.php';
sys_boot();
/**
* @brief Unit Test case for MessageFilter class.
*
@ -16,10 +12,8 @@ sys_boot();
*/
class MessageFilterTest extends UnitTestCase
{
use PHPMock;
/** @test */
public function languageFilterTests()
public function testLanguageFilter()
{
// Check accept language rules
@ -49,12 +43,5 @@ class MessageFilterTest extends UnitTestCase
$x = MessageFilter::evaluate([ 'body' => 'the quick brown fox jumped over the lazy dog. Therefore the world is flat.' ], '', 'lang!=en');
$this->assertTrue($x);
}
}