<?php
/**
 * Name: Compact Language Detector
 * Description: Improved language detection
 * Version: 0.1
 * Author: Michael Vogel <heluecht@pirati.ca>
 */
use Friendica\Core\Hook ;
use Friendica\Core\Logger ;
use Friendica\DI ;
/**
 * Registers cld_detect_languages() from this file as the handler for the
 * 'detect_languages' hook.
 *
 * @return void
 */
function cld_install()
{
	Hook::register('detect_languages', __FILE__, 'cld_detect_languages');
}
/**
 * Hook handler for 'detect_languages': runs the CLD2 detector over the text
 * and, when the result is usable, puts the detected language at the front of
 * the list in $data['detected'].
 *
 * Keys read from $data:
 *  - 'text'     : the text handed to CLD2Detector::detect()
 *  - 'detected' : existing map of language code => probability (may be empty
 *                 or missing); its first key is treated as the "original"
 *                 language
 *  - 'uri-id'   : only used as log context
 *
 * $data['detected'] is the only key that is modified:
 *  - unreliable result: the probability of the original language is raised
 *    when CLD2 agrees with it and reports a higher value, nothing else changes
 *  - reliable and available language: the detected language becomes the first
 *    entry, and the list keeps its previous length
 *
 * @param array $data Hook data, passed by reference
 * @return void
 */
function cld_detect_languages(array &$data)
{
	if (!extension_loaded('cld2')) {
		Logger::warning('CLD2 is not installed.');
		return;
	}

	$cld2 = new \CLD2Detector();
	$cld2->setEncodingHint(\CLD2Encoding::UTF8); // optional, hints about text encoding
	$cld2->setPlainText(true);
	$result = $cld2->detect($data['text']);

	// Work on a normalised copy so that a missing or non-array 'detected' entry
	// cannot trigger warnings or a TypeError in count() further down.
	$previous = (isset($data['detected']) && is_array($data['detected'])) ? $data['detected'] : [];
	$original = $previous ? array_key_first($previous) : '';

	// Codes reported by CLD2 mapped to the codes used in the rest of this function.
	$aliases = [
		'pt'      => 'pt-PT',
		'az'      => 'az-Latn',
		'bs'      => 'bs-Latn',
		'el'      => 'el-monoton',
		'ht'      => 'fr',
		'iw'      => 'he',
		'jw'      => 'jv',
		'ms'      => 'ms-Latn',
		'no'      => 'nb',
		'sr'      => 'sr-Cyrl',
		'zh'      => 'zh-Hans',
		'zh-Hant' => 'zh-hant',
	];

	$detected = $result['language_code'];
	$detected = $aliases[$detected] ?? $detected;

	// languages that aren't supported via the base language detection
	// NOTE(review): 'ht' can never match here because it is mapped to 'fr'
	// above - confirm which of the two behaviours is intended.
	$unsupported = ['ceb', 'hmn', 'ht', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'];
	if (in_array($detected, $unsupported, true)) {
		return;
	}

	// The stored probabilities use the CLD2 value divided by 100
	// (presumably CLD2 reports a percentage - TODO confirm).
	$probability = $result['language_probability'] / 100;

	// Shared context for all debug messages below.
	$context = [
		'uri-id'      => $data['uri-id'] ?? null,
		'original'    => $original,
		'detected'    => $detected,
		'name'        => $result['language_name'],
		'probability' => $result['language_probability'],
		'text'        => $data['text'],
	];

	if (!$result['is_reliable']) {
		Logger::debug('Unreliable detection', $context);

		// Even an unreliable result may confirm the original language with a
		// higher probability; in that case only the probability is raised.
		if (($original === $detected) && isset($previous[$original]) && ($previous[$original] < $probability)) {
			$data['detected'][$original] = $probability;
		}
		return;
	}

	$available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));
	if (!in_array($detected, $available)) {
		Logger::debug('Unsupported language', $context);
		return;
	}

	if ($original != $detected) {
		Logger::debug('Detected different language', $context);
	}

	// Put the detected language first. The slice is limited to "previous
	// length - 1" entries: if the language was already listed, all remaining
	// entries are kept; otherwise the last entry is dropped so that the list
	// does not grow. For an empty list the slice is empty as well.
	$length = count($previous);
	unset($previous[$detected]);
	$data['detected'] = array_merge([$detected => $probability], array_slice($previous, 0, max($length - 1, 0)));
}