Use OCR to fetch text in images

This commit is contained in:
Michael 2024-01-13 19:30:20 +00:00
parent 16b12e1545
commit 3a60229e5c
5 changed files with 93 additions and 7 deletions

View file

@ -208,13 +208,17 @@ class Media
$filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';
if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
$imagedata = Images::getInfoFromURLCached($media['url']);
$imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
if ($imagedata) {
$media['mimetype'] = $imagedata['mime'];
$media['size'] = $imagedata['size'];
$media['width'] = $imagedata[0];
$media['height'] = $imagedata[1];
$media['blurhash'] = $imagedata['blurhash'] ?? null;
if (!empty($imagedata['description']) && empty($media['description'])) {
$media['description'] = $imagedata['description'];
Logger::debug('Detected text for image', $media);
}
} else {
Logger::notice('No image data', ['media' => $media]);
}

View file

@ -22,10 +22,12 @@
namespace Friendica\Util;
use Friendica\Core\Logger;
use Friendica\Core\System;
use Friendica\DI;
use Friendica\Model\Photo;
use Friendica\Network\HTTPClient\Client\HttpClientAccept;
use Friendica\Object\Image;
use thiagoalessio\TesseractOCR\TesseractOCR;
/**
* Image utilities
@ -181,10 +183,11 @@ class Images
* Gets info array from given URL, cached data has priority
*
* @param string $url
* @param bool $ocr
* @return array Info
* @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/
public static function getInfoFromURLCached(string $url): array
public static function getInfoFromURLCached(string $url, bool $ocr = false): array
{
$data = [];
@ -192,12 +195,12 @@ class Images
return $data;
}
$cacheKey = 'getInfoFromURL:' . sha1($url);
$cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);
$data = DI::cache()->get($cacheKey);
if (empty($data) || !is_array($data)) {
$data = self::getInfoFromURL($url);
$data = self::getInfoFromURL($url, $ocr);
DI::cache()->set($cacheKey, $data);
}
@ -209,10 +212,11 @@ class Images
* Gets info from URL uncached
*
* @param string $url
* @param bool $ocr
* @return array Info array
* @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/
public static function getInfoFromURL(string $url): array
public static function getInfoFromURL(string $url, bool $ocr = false): array
{
$data = [];
@ -257,6 +261,17 @@ class Images
if ($image->isValid()) {
$data['blurhash'] = $image->getBlurHash();
if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
$ocr = new TesseractOCR();
try {
$ocr->tempDir(System::getTempPath());
$ocr->imageData($img_str, strlen($img_str));
$data['description'] = $ocr->run();
} catch (\Throwable $th) {
Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
}
}
}
$data['size'] = $filesize;