[ENG-1634 / ENG-1636] Integrate Media Data Extractor and Thumbnail Actor with New Task System (#2423)

* Moving stuff around

* Media data extraction task

* New thumbnailer task

* Taking more metrics on thumbnailer

* First drafts on media processor job

* Better interruption latency on thumbnailer task

* Also fix interrupt latency in media data extraction (interruption-check pattern sketched below)

* Media processor job

* Shallow media processor

* Better prioritization of file identification

* Integrate ffmpeg media data extraction
- Thumbnail quality improvement parameters
Ericson "Fogo" Soares 2024-05-10 09:51:22 -03:00 committed by GitHub
parent 5a73df00a4
commit 69412accae
36 changed files with 3893 additions and 560 deletions
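
The recurring pattern across the new tasks in this commit is a boolean priority flag (shallow tasks jump the queue) plus frequent interruption checks inside the work loop, which is what the interruption-latency bullets in the commit message refer to. Below is a minimal, self-contained sketch of that pattern using stand-in types; it is not the real sd-task-system API (which, per the hunks below, exposes Task, TaskId, Interrupter, ExecStatus and a check_interruption! macro).

use std::sync::atomic::{AtomicBool, Ordering};

// Stand-ins for the task-system types used in the diff below.
struct Interrupter(AtomicBool);

enum ExecStatus {
    Done(u64),   // finished; carries the task output (here: processed count)
    Paused(u64), // interrupted early; carries progress so the task can resume later
}

struct ThumbnailerTask {
    pending: Vec<String>, // file paths still to process
    with_priority: bool,  // shallow/priority tasks are scheduled ahead of deep ones
    processed: u64,
}

impl ThumbnailerTask {
    fn with_priority(&self) -> bool {
        self.with_priority
    }

    fn run(&mut self, interrupter: &Interrupter) -> ExecStatus {
        while let Some(path) = self.pending.pop() {
            // ... generate one thumbnail for `path` here ...
            let _ = path;
            self.processed += 1;

            // Checking between every item (rather than once per batch) keeps the gap
            // between "interrupt requested" and "task yields" small.
            if interrupter.0.load(Ordering::Relaxed) {
                return ExecStatus::Paused(self.processed);
            }
        }
        ExecStatus::Done(self.processed)
    }
}

fn main() {
    let interrupter = Interrupter(AtomicBool::new(false));
    let mut task = ThumbnailerTask {
        pending: vec!["a.png".into(), "b.png".into()],
        with_priority: true,
        processed: 0,
    };
    match task.run(&interrupter) {
        ExecStatus::Done(count) => {
            println!("done: {count} thumbnails (priority task: {})", task.with_priority());
        }
        ExecStatus::Paused(count) => println!("paused after {count} thumbnails"),
    }
}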

Cargo.lock (generated)
View file

@ -3288,12 +3288,13 @@ dependencies = [
[[package]]
name = "futures-concurrency"
version = "7.4.3"
version = "7.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef6712e11cdeed5c8cf21ea0b90fec40fbe64afc9bbf2339356197eeca829fc3"
checksum = "51ee14e256b9143bfafbf2fddeede6f396650bacf95d06fc1b3f2b503df129a0"
dependencies = [
"bitvec",
"futures-core",
"futures-lite 1.13.0",
"pin-project",
"slab",
"smallvec 1.13.1",
@ -9077,8 +9078,10 @@ dependencies = [
"futures",
"futures-concurrency",
"globset",
"image",
"itertools 0.12.0",
"lending-stream",
"once_cell",
"prisma-client-rust",
"rmp-serde",
"rmpv",
@ -9087,7 +9090,10 @@ dependencies = [
"sd-core-indexer-rules",
"sd-core-prisma-helpers",
"sd-core-sync",
"sd-ffmpeg",
"sd-file-ext",
"sd-images",
"sd-media-metadata",
"sd-prisma",
"sd-sync",
"sd-task-system",
@ -9104,6 +9110,7 @@ dependencies = [
"tracing",
"tracing-test",
"uuid",
"webp",
]
[[package]]

View file

@ -52,7 +52,7 @@ blake3 = "1.5.0"
chrono = "0.4.38"
clap = "4.4.7"
futures = "0.3.30"
futures-concurrency = "7.4.3"
futures-concurrency = "7.6.0"
globset = "^0.4.13"
hex = "0.4.3"
http = "0.2.9"
@ -61,7 +61,7 @@ itertools = "0.12.0"
lending-stream = "1.0.0"
libc = "0.2"
normpath = "1.1.1"
once_cell = "1.18.0"
once_cell = "1.19.0"
pin-project-lite = "0.2.13"
rand = "0.8.5"
rand_chacha = "0.3.1"

View file

@ -2,14 +2,15 @@ use std::str::FromStr;
use serde::Deserialize;
use specta::Type;
use strum::{AsRefStr, EnumString};
use tauri::{
menu::{Menu, MenuItemKind},
AppHandle, Manager, Wry,
};
use tracing::error;
#[derive(Debug, Clone, Copy, EnumString, AsRefStr, Type, Deserialize)]
#[derive(
Debug, Clone, Copy, Type, Deserialize, strum::EnumString, strum::AsRefStr, strum::Display,
)]
pub enum MenuEvent {
NewLibrary,
NewFile,
@ -27,12 +28,6 @@ pub enum MenuEvent {
ReloadWebview,
}
impl ToString for MenuEvent {
fn to_string(&self) -> String {
self.as_ref().to_string()
}
}
/// Menu items which require a library to be open to use.
/// They will be disabled/enabled automatically.
const LIBRARY_LOCKED_MENU_IDS: &[MenuEvent] = &[
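
The hunk above swaps the hand-written impl ToString for MenuEvent for strum's Display derive, which formats the variant name and therefore provides to_string() through the blanket ToString impl. A trimmed, standalone illustration (assumes the strum crate with its derive feature, which the project already uses):

#[derive(Debug, Clone, Copy, PartialEq, strum::EnumString, strum::AsRefStr, strum::Display)]
enum MenuEvent {
    NewLibrary,
    NewFile,
}

fn main() {
    // Display (and therefore ToString) now comes from the derive instead of a manual impl.
    assert_eq!(MenuEvent::NewLibrary.to_string(), "NewLibrary");
    // EnumString still provides FromStr, so menu ids parse back into events.
    assert_eq!("NewFile".parse::<MenuEvent>().ok(), Some(MenuEvent::NewFile));
}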

View file

@ -13,7 +13,7 @@ default = []
# This feature allows features to be disabled when the Core is running on mobile.
mobile = []
# This feature controls whether the Spacedrive Core contains functionality which requires FFmpeg.
ffmpeg = ["dep:sd-ffmpeg", "sd-media-metadata/ffmpeg"]
ffmpeg = ["dep:sd-ffmpeg", "sd-core-heavy-lifting/ffmpeg", "sd-media-metadata/ffmpeg"]
heif = ["sd-images/heif"]
ai = ["dep:sd-ai"]
crypto = ["dep:sd-crypto"]

View file

@ -8,6 +8,11 @@ edition = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
default = []
# This feature controls whether the Spacedrive Heavy Lifting contains functionality which requires FFmpeg.
ffmpeg = ["dep:sd-ffmpeg"]
[dependencies]
# Inner Core Sub-crates
sd-core-file-path-helper = { path = "../file-path-helper" }
@ -15,7 +20,10 @@ sd-core-indexer-rules = { path = "../indexer-rules" }
sd-core-prisma-helpers = { path = "../prisma-helpers" }
sd-core-sync = { path = "../sync" }
# Sub-crates
sd-ffmpeg = { path = "../../../crates/ffmpeg", optional = true }
sd-file-ext = { path = "../../../crates/file-ext" }
sd-images = { path = "../../../crates/images" }
sd-media-metadata = { path = "../../../crates/media-metadata" }
sd-prisma = { path = "../../../crates/prisma" }
sd-sync = { path = "../../../crates/sync" }
sd-task-system = { path = "../../../crates/task-system" }
@ -28,8 +36,10 @@ chrono = { workspace = true, features = ["serde"] }
futures = { workspace = true }
futures-concurrency = { workspace = true }
globset = { workspace = true }
image = { workspace = true }
itertools = { workspace = true }
lending-stream = { workspace = true }
once_cell = { workspace = true }
prisma-client-rust = { workspace = true }
rmp-serde = { workspace = true }
rmpv = { workspace = true }
@ -44,6 +54,8 @@ tokio = { workspace = true, features = ["fs", "sync", "parking_lot"] }
tokio-stream = { workspace = true, features = ["fs"] }
tracing = { workspace = true }
uuid = { workspace = true, features = ["v4", "serde"] }
webp = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }
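
The new crate-level ffmpeg feature only toggles the optional sd-ffmpeg dependency; the code using it is presumably gated with cfg(feature = "ffmpeg") in the usual way. A generic sketch of that gating (the function name here is invented for illustration):

#[cfg(feature = "ffmpeg")]
fn extract_media_data(path: &std::path::Path) -> Option<String> {
    // With the feature on, sd-ffmpeg is available as an optional dependency.
    Some(format!("ffmpeg metadata for {}", path.display()))
}

#[cfg(not(feature = "ffmpeg"))]
fn extract_media_data(_path: &std::path::Path) -> Option<String> {
    None // built without FFmpeg support: media data extraction is a no-op
}

fn main() {
    println!("{:?}", extract_media_data(std::path::Path::new("video.mp4")));
}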

View file

@ -1,4 +1,5 @@
use crate::{
file_identifier,
job_system::{
job::{Job, JobReturn, JobTaskDispatcher, ReturnStatus},
report::ReportOutputMetadata,
@ -6,7 +7,7 @@ use crate::{
SerializableJob, SerializedTasks,
},
utils::sub_path::maybe_get_iso_file_path_from_sub_path,
Error, JobContext, JobName, LocationScanState, NonCriticalJobError, ProgressUpdate,
Error, JobName, LocationScanState, NonCriticalError, OuterContext, ProgressUpdate, UpdateEvent,
};
use sd_core_file_path_helper::IsolatedFilePathData;
@ -20,7 +21,7 @@ use sd_task_system::{
use sd_utils::db::maybe_missing;
use std::{
collections::HashMap,
collections::{HashMap, HashSet},
hash::{Hash, Hasher},
mem,
path::PathBuf,
@ -30,35 +31,36 @@ use std::{
use futures::{stream::FuturesUnordered, StreamExt};
use futures_concurrency::future::TryJoin;
use prisma_client_rust::or;
use serde::{Deserialize, Serialize};
use serde_json::json;
use tokio::time::Instant;
use tracing::warn;
use super::{
orphan_path_filters_deep, orphan_path_filters_shallow,
tasks::{
ExtractFileMetadataTask, ExtractFileMetadataTaskOutput, ObjectProcessorTask,
ObjectProcessorTaskMetrics,
extract_file_metadata, object_processor, ExtractFileMetadataTask, ObjectProcessorTask,
},
FileIdentifierError, CHUNK_SIZE,
CHUNK_SIZE,
};
#[derive(Debug)]
pub struct FileIdentifierJob {
pub struct FileIdentifier {
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
sub_path: Option<PathBuf>,
metadata: Metadata,
errors: Vec<NonCriticalJobError>,
priority_tasks_ids: HashSet<TaskId>,
errors: Vec<NonCriticalError>,
pending_tasks_on_resume: Vec<TaskHandle<Error>>,
tasks_for_shutdown: Vec<Box<dyn Task<Error>>>,
}
impl Hash for FileIdentifierJob {
impl Hash for FileIdentifier {
fn hash<H: Hasher>(&self, state: &mut H) {
self.location.id.hash(state);
if let Some(ref sub_path) = self.sub_path {
@ -67,19 +69,19 @@ impl Hash for FileIdentifierJob {
}
}
impl Job for FileIdentifierJob {
impl Job for FileIdentifier {
const NAME: JobName = JobName::FileIdentifier;
async fn resume_tasks(
&mut self,
dispatcher: &JobTaskDispatcher,
ctx: &impl JobContext,
ctx: &impl OuterContext,
SerializedTasks(serialized_tasks): SerializedTasks,
) -> Result<(), Error> {
self.pending_tasks_on_resume = dispatcher
.dispatch_many_boxed(
rmp_serde::from_slice::<Vec<(TaskKind, Vec<u8>)>>(&serialized_tasks)
.map_err(FileIdentifierError::from)?
.map_err(file_identifier::Error::from)?
.into_iter()
.map(|(task_kind, task_bytes)| async move {
match task_kind {
@ -103,17 +105,17 @@ impl Job for FileIdentifierJob {
.collect::<Vec<_>>()
.try_join()
.await
.map_err(FileIdentifierError::from)?,
.map_err(file_identifier::Error::from)?,
)
.await;
Ok(())
}
async fn run(
async fn run<Ctx: OuterContext>(
mut self,
dispatcher: JobTaskDispatcher,
ctx: impl JobContext,
ctx: Ctx,
) -> Result<ReturnStatus, Error> {
let mut pending_running_tasks = FuturesUnordered::new();
@ -160,7 +162,9 @@ impl Job for FileIdentifierJob {
}
if !self.tasks_for_shutdown.is_empty() {
return Ok(ReturnStatus::Shutdown(self.serialize().await));
return Ok(ReturnStatus::Shutdown(
SerializableJob::<Ctx>::serialize(self).await,
));
}
// From this point onward, we are done with the job and it can't be interrupted anymore
@ -181,7 +185,7 @@ impl Job for FileIdentifierJob {
)
.exec()
.await
.map_err(FileIdentifierError::from)?;
.map_err(file_identifier::Error::from)?;
Ok(ReturnStatus::Completed(
JobReturn::builder()
@ -192,11 +196,11 @@ impl Job for FileIdentifierJob {
}
}
impl FileIdentifierJob {
impl FileIdentifier {
pub fn new(
location: location::Data,
sub_path: Option<PathBuf>,
) -> Result<Self, FileIdentifierError> {
) -> Result<Self, file_identifier::Error> {
Ok(Self {
location_path: maybe_missing(&location.path, "location.path")
.map(PathBuf::from)
@ -204,6 +208,7 @@ impl FileIdentifierJob {
location: Arc::new(location),
sub_path,
metadata: Metadata::default(),
priority_tasks_ids: HashSet::new(),
errors: Vec::new(),
pending_tasks_on_resume: Vec::new(),
tasks_for_shutdown: Vec::new(),
@ -213,12 +218,12 @@ impl FileIdentifierJob {
async fn init_or_resume(
&mut self,
pending_running_tasks: &mut FuturesUnordered<TaskHandle<Error>>,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Result<(), FileIdentifierError> {
) -> Result<(), file_identifier::Error> {
// if we don't have any pending task, then this is a fresh job
if self.pending_tasks_on_resume.is_empty() {
let db = job_ctx.db();
let db = ctx.db();
let maybe_sub_iso_file_path = maybe_get_iso_file_path_from_sub_path(
self.location.id,
&self.sub_path,
@ -227,53 +232,43 @@ impl FileIdentifierJob {
)
.await?;
let mut orphans_count = 0;
let mut last_orphan_file_path_id = None;
let start = Instant::now();
loop {
#[allow(clippy::cast_possible_wrap)]
// SAFETY: we know that CHUNK_SIZE is a valid i64
let orphan_paths = db
.file_path()
.find_many(orphan_path_filters(
self.location.id,
last_orphan_file_path_id,
&maybe_sub_iso_file_path,
))
.order_by(file_path::id::order(SortOrder::Asc))
.take(CHUNK_SIZE as i64)
.select(file_path_for_file_identifier::select())
.exec()
.await?;
let location_root_iso_file_path = IsolatedFilePathData::new(
self.location.id,
&*self.location_path,
&*self.location_path,
true,
)
.map_err(file_identifier::Error::from)?;
if orphan_paths.is_empty() {
break;
}
// First we dispatch some shallow priority tasks to quickly identify orphans in the location
// root directory or in the desired sub-path
let file_paths_already_identifying = self
.dispatch_priority_identifier_tasks(
&mut last_orphan_file_path_id,
maybe_sub_iso_file_path
.as_ref()
.unwrap_or(&location_root_iso_file_path),
ctx,
dispatcher,
pending_running_tasks,
)
.await?;
orphans_count += orphan_paths.len() as u64;
last_orphan_file_path_id =
Some(orphan_paths.last().expect("orphan_paths is not empty").id);
job_ctx.progress(vec![
ProgressUpdate::TaskCount(orphans_count),
ProgressUpdate::Message(format!("{orphans_count} files to be identified")),
]);
pending_running_tasks.push(
dispatcher
.dispatch(ExtractFileMetadataTask::new_deep(
Arc::clone(&self.location),
Arc::clone(&self.location_path),
orphan_paths,
))
.await,
);
}
self.dispatch_deep_identifier_tasks(
&mut last_orphan_file_path_id,
&maybe_sub_iso_file_path,
ctx,
dispatcher,
pending_running_tasks,
&file_paths_already_identifying,
)
.await?;
self.metadata.seeking_orphans_time = start.elapsed();
self.metadata.total_found_orphans = orphans_count;
} else {
pending_running_tasks.extend(mem::take(&mut self.pending_tasks_on_resume));
}
@ -290,25 +285,27 @@ impl FileIdentifierJob {
&mut self,
task_id: TaskId,
any_task_output: Box<dyn AnyTaskOutput>,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Option<TaskHandle<Error>> {
if any_task_output.is::<ExtractFileMetadataTaskOutput>() {
if any_task_output.is::<extract_file_metadata::Output>() {
return self
.process_extract_file_metadata_output(
task_id,
*any_task_output
.downcast::<ExtractFileMetadataTaskOutput>()
.downcast::<extract_file_metadata::Output>()
.expect("just checked"),
job_ctx,
ctx,
dispatcher,
)
.await;
} else if any_task_output.is::<ObjectProcessorTaskMetrics>() {
} else if any_task_output.is::<object_processor::Output>() {
self.process_object_processor_output(
task_id,
*any_task_output
.downcast::<ObjectProcessorTaskMetrics>()
.downcast::<object_processor::Output>()
.expect("just checked"),
job_ctx,
ctx,
);
} else {
unreachable!("Unexpected task output type: <id='{task_id}'>");
@ -319,12 +316,13 @@ impl FileIdentifierJob {
async fn process_extract_file_metadata_output(
&mut self,
ExtractFileMetadataTaskOutput {
task_id: TaskId,
extract_file_metadata::Output {
identified_files,
extract_metadata_time,
errors,
}: ExtractFileMetadataTaskOutput,
job_ctx: &impl JobContext,
}: extract_file_metadata::Output,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Option<TaskHandle<Error>> {
self.metadata.extract_metadata_time += extract_metadata_time;
@ -333,37 +331,46 @@ impl FileIdentifierJob {
if identified_files.is_empty() {
self.metadata.completed_tasks += 1;
job_ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
self.metadata.completed_tasks,
)]);
None
} else {
job_ctx.progress_msg(format!("Identified {} files", identified_files.len()));
ctx.progress_msg(format!("Identified {} files", identified_files.len()));
Some(
dispatcher
.dispatch(ObjectProcessorTask::new_deep(
identified_files,
Arc::clone(job_ctx.db()),
Arc::clone(job_ctx.sync()),
))
.await,
)
let with_priority = self.priority_tasks_ids.remove(&task_id);
let task = dispatcher
.dispatch(ObjectProcessorTask::new(
identified_files,
Arc::clone(ctx.db()),
Arc::clone(ctx.sync()),
with_priority,
))
.await;
if with_priority {
self.priority_tasks_ids.insert(task.task_id());
}
Some(task)
}
}
fn process_object_processor_output(
&mut self,
ObjectProcessorTaskMetrics {
task_id: TaskId,
object_processor::Output {
file_path_ids_with_new_object,
assign_cas_ids_time,
fetch_existing_objects_time,
assign_to_existing_object_time,
create_object_time,
created_objects_count,
linked_objects_count,
}: ObjectProcessorTaskMetrics,
job_ctx: &impl JobContext,
}: object_processor::Output,
ctx: &impl OuterContext,
) {
self.metadata.assign_cas_ids_time += assign_cas_ids_time;
self.metadata.fetch_existing_objects_time += fetch_existing_objects_time;
@ -374,7 +381,7 @@ impl FileIdentifierJob {
self.metadata.completed_tasks += 1;
job_ctx.progress(vec![
ctx.progress(vec![
ProgressUpdate::CompletedTaskCount(self.metadata.completed_tasks),
ProgressUpdate::Message(format!(
"Processed {} of {} objects",
@ -382,6 +389,143 @@ impl FileIdentifierJob {
self.metadata.total_found_orphans
)),
]);
if self.priority_tasks_ids.remove(&task_id) {
ctx.report_update(UpdateEvent::NewIdentifiedObjects {
file_path_ids: file_path_ids_with_new_object,
});
}
}
async fn dispatch_priority_identifier_tasks(
&mut self,
last_orphan_file_path_id: &mut Option<i32>,
sub_iso_file_path: &IsolatedFilePathData<'static>,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
pending_running_tasks: &FuturesUnordered<TaskHandle<Error>>,
) -> Result<HashSet<file_path::id::Type>, file_identifier::Error> {
let db = ctx.db();
let mut file_paths_already_identifying = HashSet::new();
loop {
#[allow(clippy::cast_possible_wrap)]
// SAFETY: we know that CHUNK_SIZE is a valid i64
let orphan_paths = db
.file_path()
.find_many(orphan_path_filters_shallow(
self.location.id,
*last_orphan_file_path_id,
sub_iso_file_path,
))
.order_by(file_path::id::order(SortOrder::Asc))
.take(CHUNK_SIZE as i64)
.select(file_path_for_file_identifier::select())
.exec()
.await?;
if orphan_paths.is_empty() {
break;
}
file_paths_already_identifying.extend(orphan_paths.iter().map(|path| path.id));
self.metadata.total_found_orphans += orphan_paths.len() as u64;
*last_orphan_file_path_id =
Some(orphan_paths.last().expect("orphan_paths is not empty").id);
ctx.progress(vec![
ProgressUpdate::TaskCount(self.metadata.total_found_orphans),
ProgressUpdate::Message(format!(
"{} files to be identified",
self.metadata.total_found_orphans
)),
]);
let priority_task = dispatcher
.dispatch(ExtractFileMetadataTask::new(
Arc::clone(&self.location),
Arc::clone(&self.location_path),
orphan_paths,
true,
))
.await;
self.priority_tasks_ids.insert(priority_task.task_id());
pending_running_tasks.push(priority_task);
}
Ok(file_paths_already_identifying)
}
async fn dispatch_deep_identifier_tasks(
&mut self,
last_orphan_file_path_id: &mut Option<file_path::id::Type>,
maybe_sub_iso_file_path: &Option<IsolatedFilePathData<'static>>,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
pending_running_tasks: &FuturesUnordered<TaskHandle<Error>>,
file_paths_already_identifying: &HashSet<file_path::id::Type>,
) -> Result<(), file_identifier::Error> {
let db = ctx.db();
loop {
#[allow(clippy::cast_possible_wrap)]
// SAFETY: we know that CHUNK_SIZE is a valid i64
let mut orphan_paths = db
.file_path()
.find_many(orphan_path_filters_deep(
self.location.id,
*last_orphan_file_path_id,
maybe_sub_iso_file_path,
))
.order_by(file_path::id::order(SortOrder::Asc))
.take(CHUNK_SIZE as i64)
.select(file_path_for_file_identifier::select())
.exec()
.await?;
// No other orphans to identify, we can break the loop
if orphan_paths.is_empty() {
break;
}
// We grab the last id to use as a starting point for the next iteration, in case we skip this one
*last_orphan_file_path_id =
Some(orphan_paths.last().expect("orphan_paths is not empty").id);
orphan_paths.retain(|path| !file_paths_already_identifying.contains(&path.id));
// If we don't have any new orphan paths after filtering out, we can skip this iteration
if orphan_paths.is_empty() {
continue;
}
self.metadata.total_found_orphans += orphan_paths.len() as u64;
ctx.progress(vec![
ProgressUpdate::TaskCount(self.metadata.total_found_orphans),
ProgressUpdate::Message(format!(
"{} files to be identified",
self.metadata.total_found_orphans
)),
]);
pending_running_tasks.push(
dispatcher
.dispatch(ExtractFileMetadataTask::new(
Arc::clone(&self.location),
Arc::clone(&self.location_path),
orphan_paths,
false,
))
.await,
);
}
Ok(())
}
}
@ -399,7 +543,9 @@ struct SaveState {
metadata: Metadata,
errors: Vec<NonCriticalJobError>,
priority_tasks_ids: HashSet<TaskId>,
errors: Vec<NonCriticalError>,
tasks_for_shutdown_bytes: Option<SerializedTasks>,
}
@ -459,13 +605,14 @@ impl From<Metadata> for ReportOutputMetadata {
}
}
impl SerializableJob for FileIdentifierJob {
impl<Ctx: OuterContext> SerializableJob<Ctx> for FileIdentifier {
async fn serialize(self) -> Result<Option<Vec<u8>>, rmp_serde::encode::Error> {
let Self {
location,
location_path,
sub_path,
metadata,
priority_tasks_ids,
errors,
tasks_for_shutdown,
..
@ -476,6 +623,7 @@ impl SerializableJob for FileIdentifierJob {
location_path,
sub_path,
metadata,
priority_tasks_ids,
tasks_for_shutdown_bytes: Some(SerializedTasks(rmp_serde::to_vec_named(
&tasks_for_shutdown
.into_iter()
@ -509,14 +657,14 @@ impl SerializableJob for FileIdentifierJob {
async fn deserialize(
serialized_job: &[u8],
_: &impl JobContext,
_: &Ctx,
) -> Result<Option<(Self, Option<SerializedTasks>)>, rmp_serde::decode::Error> {
let SaveState {
location,
location_path,
sub_path,
metadata,
priority_tasks_ids,
errors,
tasks_for_shutdown_bytes,
} = rmp_serde::from_slice::<SaveState>(serialized_job)?;
@ -527,6 +675,7 @@ impl SerializableJob for FileIdentifierJob {
location_path,
sub_path,
metadata,
priority_tasks_ids,
errors,
pending_tasks_on_resume: Vec::new(),
tasks_for_shutdown: Vec::new(),
@ -535,32 +684,3 @@ impl SerializableJob for FileIdentifierJob {
)))
}
}
fn orphan_path_filters(
location_id: location::id::Type,
file_path_id: Option<file_path::id::Type>,
maybe_sub_iso_file_path: &Option<IsolatedFilePathData<'_>>,
) -> Vec<file_path::WhereParam> {
sd_utils::chain_optional_iter(
[
or!(
file_path::object_id::equals(None),
file_path::cas_id::equals(None)
),
file_path::is_dir::equals(Some(false)),
file_path::location_id::equals(Some(location_id)),
file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())),
],
[
// this is a workaround for the cursor not working properly
file_path_id.map(file_path::id::gte),
maybe_sub_iso_file_path.as_ref().map(|sub_iso_file_path| {
file_path::materialized_path::starts_with(
sub_iso_file_path
.materialized_path_for_children()
.expect("sub path iso_file_path must be a directory"),
)
}),
],
)
}
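
The job now takes an OuterContext instead of the old JobContext. Judging only from the call sites in this commit (ctx.db(), ctx.sync(), ctx.progress(...), ctx.progress_msg(...), ctx.invalidate_query(...), ctx.report_update(...)), the trait's shape is roughly the following; this is an inferred sketch with stand-in types, not the definition from sd-core-heavy-lifting.

#![allow(dead_code)] // shape sketch only

use std::sync::Arc;

// Stand-ins for the real core types (PrismaClient, SyncManager, etc.).
struct Db;
struct SyncManager;

enum ProgressUpdate {
    TaskCount(u64),
    CompletedTaskCount(u64),
    Message(String),
}

enum UpdateEvent {
    NewIdentifiedObjects { file_path_ids: Vec<i32> },
}

// Rough shape of OuterContext as used by the file identifier and indexer jobs.
trait OuterContext {
    fn db(&self) -> &Arc<Db>;
    fn sync(&self) -> &Arc<SyncManager>;
    fn progress(&self, updates: Vec<ProgressUpdate>);
    fn progress_msg(&self, msg: String) {
        self.progress(vec![ProgressUpdate::Message(msg)]);
    }
    fn invalidate_query(&self, query: &'static str);
    fn report_update(&self, event: UpdateEvent);
}

fn main() {}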

View file

@ -1,13 +1,14 @@
use crate::utils::sub_path::SubPathError;
use crate::utils::sub_path;
use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData};
use sd_file_ext::{extensions::Extension, kind::ObjectKind};
use sd_prisma::prisma::{file_path, location};
use sd_utils::{db::MissingFieldError, error::FileIOError};
use std::{fs::Metadata, path::Path};
use prisma_client_rust::QueryError;
use prisma_client_rust::{or, QueryError};
use rspc::ErrorCode;
use serde::{Deserialize, Serialize};
use specta::Type;
@ -15,20 +16,20 @@ use tokio::fs;
use tracing::trace;
mod cas_id;
mod job;
pub mod job;
mod shallow;
mod tasks;
use cas_id::generate_cas_id;
pub use job::FileIdentifierJob;
pub use job::FileIdentifier;
pub use shallow::shallow;
// we break these tasks into chunks of 100 to improve performance
const CHUNK_SIZE: usize = 100;
#[derive(thiserror::Error, Debug)]
pub enum FileIdentifierError {
pub enum Error {
#[error("missing field on database: {0}")]
MissingField(#[from] MissingFieldError),
#[error("failed to deserialized stored tasks for job resume: {0}")]
@ -39,13 +40,13 @@ pub enum FileIdentifierError {
#[error(transparent)]
FilePathError(#[from] FilePathError),
#[error(transparent)]
SubPath(#[from] SubPathError),
SubPath(#[from] sub_path::Error),
}
impl From<FileIdentifierError> for rspc::Error {
fn from(err: FileIdentifierError) -> Self {
impl From<Error> for rspc::Error {
fn from(err: Error) -> Self {
match err {
FileIdentifierError::SubPath(sub_path_err) => sub_path_err.into(),
Error::SubPath(sub_path_err) => sub_path_err.into(),
_ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err),
}
@ -53,7 +54,7 @@ impl From<FileIdentifierError> for rspc::Error {
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalFileIdentifierError {
pub enum NonCriticalError {
#[error("failed to extract file metadata: {0}")]
FailedToExtractFileMetadata(String),
#[cfg(target_os = "windows")]
@ -118,3 +119,56 @@ impl FileMetadata {
})
}
}
fn orphan_path_filters_shallow(
location_id: location::id::Type,
file_path_id: Option<file_path::id::Type>,
sub_iso_file_path: &IsolatedFilePathData<'_>,
) -> Vec<file_path::WhereParam> {
sd_utils::chain_optional_iter(
[
or!(
file_path::object_id::equals(None),
file_path::cas_id::equals(None)
),
file_path::is_dir::equals(Some(false)),
file_path::location_id::equals(Some(location_id)),
file_path::materialized_path::equals(Some(
sub_iso_file_path
.materialized_path_for_children()
.expect("sub path for shallow identifier must be a directory"),
)),
file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())),
],
[file_path_id.map(file_path::id::gte)],
)
}
fn orphan_path_filters_deep(
location_id: location::id::Type,
file_path_id: Option<file_path::id::Type>,
maybe_sub_iso_file_path: &Option<IsolatedFilePathData<'_>>,
) -> Vec<file_path::WhereParam> {
sd_utils::chain_optional_iter(
[
or!(
file_path::object_id::equals(None),
file_path::cas_id::equals(None)
),
file_path::is_dir::equals(Some(false)),
file_path::location_id::equals(Some(location_id)),
file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())),
],
[
// this is a workaround for the cursor not working properly
file_path_id.map(file_path::id::gte),
maybe_sub_iso_file_path.as_ref().map(|sub_iso_file_path| {
file_path::materialized_path::starts_with(
sub_iso_file_path
.materialized_path_for_children()
.expect("sub path iso_file_path must be a directory"),
)
}),
],
)
}
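
Both orphan filter helpers above feed the same loop: page through file_path rows by id (the file_path::id::gte cursor workaround) in CHUNK_SIZE batches until a page comes back empty. A database-free sketch of that loop, with an in-memory slice standing in for the Prisma query:

const CHUNK_SIZE: usize = 100;

// Stand-in for `db.file_path().find_many(filters).order_by(id asc).take(CHUNK_SIZE)`:
// return up to CHUNK_SIZE ids greater than or equal to the cursor.
fn fetch_orphans(all_ids: &[i32], cursor: Option<i32>) -> Vec<i32> {
    all_ids
        .iter()
        .copied()
        .filter(|id| cursor.map_or(true, |c| *id >= c))
        .take(CHUNK_SIZE)
        .collect()
}

fn main() {
    let all_ids: Vec<i32> = (1..=250).collect();
    let mut last_id = None;
    let mut total = 0u64;

    loop {
        let page = fetch_orphans(&all_ids, last_id);
        if page.is_empty() {
            break; // no more orphans to identify
        }
        total += page.len() as u64;
        // Keep the last id as the cursor for the next page (the real code uses `gte`
        // on the last seen id as a cursor workaround; simplified to `+ 1` here).
        last_id = Some(page.last().copied().expect("page is not empty") + 1);
    }

    assert_eq!(total, 250);
    println!("found {total} orphan file paths in batches of {CHUNK_SIZE}");
}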

View file

@ -1,10 +1,12 @@
use crate::{utils::sub_path::maybe_get_iso_file_path_from_sub_path, Error, NonCriticalJobError};
use crate::{
file_identifier, utils::sub_path::maybe_get_iso_file_path_from_sub_path, Error,
NonCriticalError, OuterContext,
};
use sd_core_file_path_helper::IsolatedFilePathData;
use sd_core_prisma_helpers::file_path_for_file_identifier;
use sd_core_sync::Manager as SyncManager;
use sd_prisma::prisma::{file_path, location, PrismaClient, SortOrder};
use sd_prisma::prisma::{file_path, location, SortOrder};
use sd_task_system::{
BaseTaskDispatcher, CancelTaskOnDrop, TaskDispatcher, TaskOutput, TaskStatus,
};
@ -17,39 +19,40 @@ use std::{
use futures_concurrency::future::FutureGroup;
use lending_stream::{LendingStream, StreamExt};
use prisma_client_rust::or;
use tracing::{debug, warn};
use super::{
tasks::{ExtractFileMetadataTask, ExtractFileMetadataTaskOutput, ObjectProcessorTask},
FileIdentifierError, CHUNK_SIZE,
orphan_path_filters_shallow,
tasks::{
extract_file_metadata, object_processor, ExtractFileMetadataTask, ObjectProcessorTask,
},
CHUNK_SIZE,
};
pub async fn shallow(
location: location::Data,
sub_path: impl AsRef<Path> + Send,
dispatcher: BaseTaskDispatcher<Error>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
invalidate_query: impl Fn(&'static str) + Send + Sync,
) -> Result<Vec<NonCriticalJobError>, Error> {
ctx: impl OuterContext,
) -> Result<Vec<NonCriticalError>, Error> {
let sub_path = sub_path.as_ref();
let db = ctx.db();
let location_path = maybe_missing(&location.path, "location.path")
.map(PathBuf::from)
.map(Arc::new)
.map_err(FileIdentifierError::from)?;
.map_err(file_identifier::Error::from)?;
let location = Arc::new(location);
let sub_iso_file_path =
maybe_get_iso_file_path_from_sub_path(location.id, &Some(sub_path), &*location_path, &db)
maybe_get_iso_file_path_from_sub_path(location.id, &Some(sub_path), &*location_path, db)
.await
.map_err(FileIdentifierError::from)?
.map_err(file_identifier::Error::from)?
.map_or_else(
|| {
IsolatedFilePathData::new(location.id, &*location_path, &*location_path, true)
.map_err(FileIdentifierError::from)
.map_err(file_identifier::Error::from)
},
Ok,
)?;
@ -64,7 +67,7 @@ pub async fn shallow(
// SAFETY: we know that CHUNK_SIZE is a valid i64
let orphan_paths = db
.file_path()
.find_many(orphan_path_filters(
.find_many(orphan_path_filters_shallow(
location.id,
last_orphan_file_path_id,
&sub_iso_file_path,
@ -74,7 +77,7 @@ pub async fn shallow(
.select(file_path_for_file_identifier::select())
.exec()
.await
.map_err(FileIdentifierError::from)?;
.map_err(file_identifier::Error::from)?;
let Some(last_orphan) = orphan_paths.last() else {
// No orphans here!
@ -86,10 +89,11 @@ pub async fn shallow(
pending_running_tasks.insert(CancelTaskOnDrop(
dispatcher
.dispatch(ExtractFileMetadataTask::new_shallow(
.dispatch(ExtractFileMetadataTask::new(
Arc::clone(&location),
Arc::clone(&location_path),
orphan_paths,
true,
))
.await,
));
@ -104,10 +108,7 @@ pub async fn shallow(
return Ok(vec![]);
}
let errors = process_tasks(pending_running_tasks, dispatcher, db, sync).await?;
invalidate_query("search.paths");
invalidate_query("search.objects");
let errors = process_tasks(pending_running_tasks, dispatcher, ctx).await?;
Ok(errors)
}
@ -115,11 +116,13 @@ pub async fn shallow(
async fn process_tasks(
pending_running_tasks: FutureGroup<CancelTaskOnDrop<Error>>,
dispatcher: BaseTaskDispatcher<Error>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
) -> Result<Vec<NonCriticalJobError>, Error> {
ctx: impl OuterContext,
) -> Result<Vec<NonCriticalError>, Error> {
let mut pending_running_tasks = pending_running_tasks.lend_mut();
let db = ctx.db();
let sync = ctx.sync();
let mut errors = vec![];
while let Some((pending_running_tasks, task_result)) = pending_running_tasks.next().await {
@ -128,28 +131,36 @@ async fn process_tasks(
// We only care about ExtractFileMetadataTaskOutput because we need to dispatch further tasks
// and the ObjectProcessorTask only gives back some metrics that aren't very important for the
// shallow file identifier
if any_task_output.is::<ExtractFileMetadataTaskOutput>() {
let ExtractFileMetadataTaskOutput {
if any_task_output.is::<extract_file_metadata::Output>() {
let extract_file_metadata::Output {
identified_files,
errors: more_errors,
..
} = *any_task_output
.downcast::<ExtractFileMetadataTaskOutput>()
.expect("just checked");
} = *any_task_output.downcast().expect("just checked");
errors.extend(more_errors);
if !identified_files.is_empty() {
pending_running_tasks.insert(CancelTaskOnDrop(
dispatcher
.dispatch(ObjectProcessorTask::new_shallow(
.dispatch(ObjectProcessorTask::new(
identified_files,
Arc::clone(&db),
Arc::clone(&sync),
Arc::clone(db),
Arc::clone(sync),
true,
))
.await,
));
}
} else {
let object_processor::Output {
file_path_ids_with_new_object,
..
} = *any_task_output.downcast().expect("just checked");
ctx.report_update(crate::UpdateEvent::NewIdentifiedObjects {
file_path_ids: file_path_ids_with_new_object,
});
}
}
@ -181,27 +192,3 @@ async fn process_tasks(
Ok(errors)
}
fn orphan_path_filters(
location_id: location::id::Type,
file_path_id: Option<file_path::id::Type>,
sub_iso_file_path: &IsolatedFilePathData<'_>,
) -> Vec<file_path::WhereParam> {
sd_utils::chain_optional_iter(
[
or!(
file_path::object_id::equals(None),
file_path::cas_id::equals(None)
),
file_path::is_dir::equals(Some(false)),
file_path::location_id::equals(Some(location_id)),
file_path::materialized_path::equals(Some(
sub_iso_file_path
.materialized_path_for_children()
.expect("sub path for shallow identifier must be a directory"),
)),
file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())),
],
[file_path_id.map(file_path::id::gte)],
)
}

View file

@ -1,6 +1,6 @@
use crate::{
file_identifier::{FileMetadata, NonCriticalFileIdentifierError},
Error, NonCriticalJobError,
file_identifier::{self, FileMetadata},
Error, NonCriticalError,
};
use sd_core_file_path_helper::IsolatedFilePathData;
@ -34,23 +34,24 @@ pub struct ExtractFileMetadataTask {
file_paths_by_id: HashMap<Uuid, file_path_for_file_identifier::Data>,
identified_files: HashMap<Uuid, IdentifiedFile>,
extract_metadata_time: Duration,
errors: Vec<NonCriticalJobError>,
is_shallow: bool,
errors: Vec<NonCriticalError>,
with_priority: bool,
}
#[derive(Debug)]
pub struct ExtractFileMetadataTaskOutput {
pub struct Output {
pub identified_files: HashMap<Uuid, IdentifiedFile>,
pub extract_metadata_time: Duration,
pub errors: Vec<NonCriticalJobError>,
pub errors: Vec<NonCriticalError>,
}
impl ExtractFileMetadataTask {
fn new(
#[must_use]
pub fn new(
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
file_paths: Vec<file_path_for_file_identifier::Data>,
is_shallow: bool,
with_priority: bool,
) -> Self {
Self {
id: TaskId::new_v4(),
@ -69,27 +70,9 @@ impl ExtractFileMetadataTask {
.collect(),
extract_metadata_time: Duration::ZERO,
errors: Vec::new(),
is_shallow,
with_priority,
}
}
#[must_use]
pub fn new_deep(
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
file_paths: Vec<file_path_for_file_identifier::Data>,
) -> Self {
Self::new(location, location_path, file_paths, false)
}
#[must_use]
pub fn new_shallow(
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
file_paths: Vec<file_path_for_file_identifier::Data>,
) -> Self {
Self::new(location, location_path, file_paths, true)
}
}
#[async_trait::async_trait]
@ -99,7 +82,7 @@ impl Task<Error> for ExtractFileMetadataTask {
}
fn with_priority(&self) -> bool {
self.is_shallow
self.with_priority
}
async fn run(&mut self, interrupter: &Interrupter) -> Result<ExecStatus, Error> {
@ -196,7 +179,7 @@ impl Task<Error> for ExtractFileMetadataTask {
}
Ok(ExecStatus::Done(
ExtractFileMetadataTaskOutput {
Output {
identified_files: mem::take(identified_files),
extract_metadata_time: *extract_metadata_time + start_time.elapsed(),
errors: mem::take(errors),
@ -210,7 +193,7 @@ fn handle_non_critical_errors(
location_id: location::id::Type,
file_path_pub_id: Uuid,
e: &FileIOError,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) {
error!("Failed to extract file metadata <location_id={location_id}, file_path_pub_id='{file_path_pub_id}'>: {e:#?}");
@ -221,14 +204,15 @@ fn handle_non_critical_errors(
// Handle case where file is on-demand (NTFS only)
if e.source.raw_os_error().map_or(false, |code| code == 362) {
errors.push(
NonCriticalFileIdentifierError::FailedToExtractMetadataFromOnDemandFile(
file_identifier::NonCriticalError::FailedToExtractMetadataFromOnDemandFile(
formatted_error,
)
.into(),
);
} else {
errors.push(
NonCriticalFileIdentifierError::FailedToExtractFileMetadata(formatted_error).into(),
file_identifier::NonCriticalError::FailedToExtractFileMetadata(formatted_error)
.into(),
);
}
}
@ -236,7 +220,7 @@ fn handle_non_critical_errors(
#[cfg(not(target_os = "windows"))]
{
errors.push(
NonCriticalFileIdentifierError::FailedToExtractFileMetadata(formatted_error).into(),
file_identifier::NonCriticalError::FailedToExtractFileMetadata(formatted_error).into(),
);
}
}
@ -246,7 +230,7 @@ fn try_iso_file_path_extraction(
file_path_pub_id: Uuid,
file_path: &file_path_for_file_identifier::Data,
location_path: Arc<PathBuf>,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> Option<(Uuid, IsolatedFilePathData<'static>, Arc<PathBuf>)> {
IsolatedFilePathData::try_from((location_id, file_path))
.map(IsolatedFilePathData::to_owned)
@ -254,7 +238,7 @@ fn try_iso_file_path_extraction(
.map_err(|e| {
error!("Failed to extract isolated file path data: {e:#?}");
errors.push(
NonCriticalFileIdentifierError::FailedToExtractIsolatedFilePathData(format!(
file_identifier::NonCriticalError::FailedToExtractIsolatedFilePathData(format!(
"<file_path_pub_id='{file_path_pub_id}', error={e}>"
))
.into(),

View file

@ -4,11 +4,11 @@ use sd_file_ext::kind::ObjectKind;
use serde::{Deserialize, Serialize};
mod extract_file_metadata;
mod object_processor;
pub mod extract_file_metadata;
pub mod object_processor;
pub use extract_file_metadata::{ExtractFileMetadataTask, ExtractFileMetadataTaskOutput};
pub use object_processor::{ObjectProcessorTask, ObjectProcessorTaskMetrics};
pub use extract_file_metadata::ExtractFileMetadataTask;
pub use object_processor::ObjectProcessorTask;
#[derive(Debug, Serialize, Deserialize)]
pub(super) struct IdentifiedFile {

View file

@ -1,4 +1,4 @@
use crate::{file_identifier::FileIdentifierError, Error};
use crate::{file_identifier, Error};
use sd_core_prisma_helpers::{
file_path_for_file_identifier, file_path_pub_id, object_for_file_identifier,
@ -36,22 +36,23 @@ pub struct ObjectProcessorTask {
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
identified_files: HashMap<Uuid, IdentifiedFile>,
metrics: ObjectProcessorTaskMetrics,
output: Output,
stage: Stage,
is_shallow: bool,
with_priority: bool,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct SaveState {
id: TaskId,
identified_files: HashMap<Uuid, IdentifiedFile>,
metrics: ObjectProcessorTaskMetrics,
output: Output,
stage: Stage,
is_shallow: bool,
with_priority: bool,
}
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct ObjectProcessorTaskMetrics {
pub struct Output {
pub file_path_ids_with_new_object: Vec<file_path::id::Type>,
pub assign_cas_ids_time: Duration,
pub fetch_existing_objects_time: Duration,
pub assign_to_existing_object_time: Duration,
@ -71,11 +72,12 @@ enum Stage {
}
impl ObjectProcessorTask {
fn new(
#[must_use]
pub fn new(
identified_files: HashMap<Uuid, IdentifiedFile>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
is_shallow: bool,
with_priority: bool,
) -> Self {
Self {
id: TaskId::new_v4(),
@ -83,26 +85,10 @@ impl ObjectProcessorTask {
sync,
identified_files,
stage: Stage::Starting,
metrics: ObjectProcessorTaskMetrics::default(),
is_shallow,
output: Output::default(),
with_priority,
}
}
pub fn new_deep(
identified_files: HashMap<Uuid, IdentifiedFile>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
) -> Self {
Self::new(identified_files, db, sync, false)
}
pub fn new_shallow(
identified_files: HashMap<Uuid, IdentifiedFile>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
) -> Self {
Self::new(identified_files, db, sync, true)
}
}
#[async_trait::async_trait]
@ -112,7 +98,7 @@ impl Task<Error> for ObjectProcessorTask {
}
fn with_priority(&self) -> bool {
self.is_shallow
self.with_priority
}
async fn run(&mut self, interrupter: &Interrupter) -> Result<ExecStatus, Error> {
@ -121,8 +107,9 @@ impl Task<Error> for ObjectProcessorTask {
sync,
identified_files,
stage,
metrics:
ObjectProcessorTaskMetrics {
output:
Output {
file_path_ids_with_new_object,
assign_cas_ids_time,
fetch_existing_objects_time,
assign_to_existing_object_time,
@ -193,6 +180,11 @@ impl Task<Error> for ObjectProcessorTask {
*created_objects_count = create_objects(identified_files, db, sync).await?;
*create_object_time = start.elapsed();
*file_path_ids_with_new_object = identified_files
.values()
.map(|IdentifiedFile { file_path, .. }| file_path.id)
.collect();
break;
}
}
@ -200,7 +192,7 @@ impl Task<Error> for ObjectProcessorTask {
check_interruption!(interrupter);
}
Ok(ExecStatus::Done(mem::take(&mut self.metrics).into_output()))
Ok(ExecStatus::Done(mem::take(&mut self.output).into_output()))
}
}
@ -208,7 +200,7 @@ async fn assign_cas_id_to_file_paths(
identified_files: &HashMap<Uuid, IdentifiedFile>,
db: &PrismaClient,
sync: &SyncManager,
) -> Result<(), FileIdentifierError> {
) -> Result<(), file_identifier::Error> {
// Assign cas_id to each file path
sync.write_ops(
db,
@ -243,7 +235,7 @@ async fn assign_cas_id_to_file_paths(
async fn fetch_existing_objects_by_cas_id(
identified_files: &HashMap<Uuid, IdentifiedFile>,
db: &PrismaClient,
) -> Result<HashMap<String, object_for_file_identifier::Data>, FileIdentifierError> {
) -> Result<HashMap<String, object_for_file_identifier::Data>, file_identifier::Error> {
// Retrieves objects that are already connected to file paths with the same id
db.object()
.find_many(vec![object::file_paths::some(vec![
@ -280,7 +272,7 @@ async fn assign_existing_objects_to_file_paths(
objects_by_cas_id: &HashMap<String, object_for_file_identifier::Data>,
db: &PrismaClient,
sync: &SyncManager,
) -> Result<Vec<file_path_pub_id::Data>, FileIdentifierError> {
) -> Result<Vec<file_path_pub_id::Data>, file_identifier::Error> {
// Attempt to associate each file path with an object that has been
// connected to file paths with the same cas_id
sync.write_ops(
@ -341,7 +333,7 @@ async fn create_objects(
identified_files: &HashMap<Uuid, IdentifiedFile>,
db: &PrismaClient,
sync: &SyncManager,
) -> Result<u64, FileIdentifierError> {
) -> Result<u64, file_identifier::Error> {
trace!("Creating {} new Objects", identified_files.len(),);
let (object_create_args, file_path_update_args) = identified_files
@ -433,18 +425,18 @@ impl SerializableTask<Error> for ObjectProcessorTask {
let Self {
id,
identified_files,
metrics,
output,
stage,
is_shallow,
with_priority,
..
} = self;
rmp_serde::to_vec_named(&SaveState {
id,
identified_files,
metrics,
output,
stage,
is_shallow,
with_priority,
})
}
@ -456,17 +448,17 @@ impl SerializableTask<Error> for ObjectProcessorTask {
|SaveState {
id,
identified_files,
metrics,
output,
stage,
is_shallow,
with_priority,
}| Self {
id,
db,
sync,
identified_files,
metrics,
output,
stage,
is_shallow,
with_priority,
},
)
}
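
The object processor now accumulates its metrics directly into a Default-able Output struct and hands it back with mem::take when the task finishes, instead of keeping a separate ObjectProcessorTaskMetrics. The core of that pattern in isolation:

use std::mem;

#[derive(Debug, Default)]
struct Output {
    created_objects_count: u64,
    linked_objects_count: u64,
}

struct ObjectProcessorTask {
    output: Output,
}

impl ObjectProcessorTask {
    fn run(&mut self) -> Output {
        // ... work happens here, updating counters in place ...
        self.output.created_objects_count += 1;

        // Move the accumulated output out without cloning; a fresh Default is left
        // behind, which is what `mem::take(&mut self.output)` does in the diff above.
        mem::take(&mut self.output)
    }
}

fn main() {
    let mut task = ObjectProcessorTask { output: Output::default() };
    let out = task.run();
    println!("created: {}, linked: {}", out.created_objects_count, out.linked_objects_count);
}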

View file

@ -1,15 +1,15 @@
use crate::{
indexer::BATCH_SIZE,
indexer,
job_system::{
job::{
Job, JobContext, JobName, JobReturn, JobTaskDispatcher, ProgressUpdate, ReturnStatus,
Job, JobName, JobReturn, JobTaskDispatcher, OuterContext, ProgressUpdate, ReturnStatus,
},
report::ReportOutputMetadata,
utils::cancel_pending_tasks,
SerializableJob, SerializedTasks,
},
utils::sub_path::get_full_path_from_sub_path,
Error, LocationScanState, NonCriticalJobError,
Error, LocationScanState, NonCriticalError,
};
use sd_core_file_path_helper::IsolatedFilePathData;
@ -47,11 +47,11 @@ use super::{
updater::{UpdateTask, UpdateTaskOutput},
walker::{WalkDirTask, WalkTaskOutput, WalkedEntry},
},
update_directory_sizes, update_location_size, IndexerError, IsoFilePathFactory, WalkerDBProxy,
update_directory_sizes, update_location_size, IsoFilePathFactory, WalkerDBProxy, BATCH_SIZE,
};
#[derive(Debug)]
pub struct IndexerJob {
pub struct Indexer {
location: location_with_indexer_rules::Data,
sub_path: Option<PathBuf>,
metadata: Metadata,
@ -63,19 +63,19 @@ pub struct IndexerJob {
ancestors_already_indexed: HashSet<IsolatedFilePathData<'static>>,
iso_paths_and_sizes: HashMap<IsolatedFilePathData<'static>, u64>,
errors: Vec<NonCriticalJobError>,
errors: Vec<NonCriticalError>,
pending_tasks_on_resume: Vec<TaskHandle<Error>>,
tasks_for_shutdown: Vec<Box<dyn Task<Error>>>,
}
impl Job for IndexerJob {
impl Job for Indexer {
const NAME: JobName = JobName::Indexer;
async fn resume_tasks(
&mut self,
dispatcher: &JobTaskDispatcher,
ctx: &impl JobContext,
ctx: &impl OuterContext,
SerializedTasks(serialized_tasks): SerializedTasks,
) -> Result<(), Error> {
let location_id = self.location.id;
@ -83,7 +83,7 @@ impl Job for IndexerJob {
self.pending_tasks_on_resume = dispatcher
.dispatch_many_boxed(
rmp_serde::from_slice::<Vec<(TaskKind, Vec<u8>)>>(&serialized_tasks)
.map_err(IndexerError::from)?
.map_err(indexer::Error::from)?
.into_iter()
.map(|(task_kind, task_bytes)| {
let indexer_ruler = self.indexer_ruler.clone();
@ -123,17 +123,17 @@ impl Job for IndexerJob {
.collect::<Vec<_>>()
.try_join()
.await
.map_err(IndexerError::from)?,
.map_err(indexer::Error::from)?,
)
.await;
Ok(())
}
async fn run(
async fn run<Ctx: OuterContext>(
mut self,
dispatcher: JobTaskDispatcher,
ctx: impl JobContext,
ctx: Ctx,
) -> Result<ReturnStatus, Error> {
let mut pending_running_tasks = FuturesUnordered::new();
@ -148,7 +148,9 @@ impl Job for IndexerJob {
}
if !self.tasks_for_shutdown.is_empty() {
return Ok(ReturnStatus::Shutdown(self.serialize().await));
return Ok(ReturnStatus::Shutdown(
SerializableJob::<Ctx>::serialize(self).await,
));
}
if !self.ancestors_needing_indexing.is_empty() {
@ -182,7 +184,9 @@ impl Job for IndexerJob {
}
if !self.tasks_for_shutdown.is_empty() {
return Ok(ReturnStatus::Shutdown(self.serialize().await));
return Ok(ReturnStatus::Shutdown(
SerializableJob::<Ctx>::serialize(self).await,
));
}
}
@ -217,7 +221,7 @@ impl Job for IndexerJob {
.await?;
}
update_location_size(location.id, ctx.db(), &ctx.query_invalidator()).await?;
update_location_size(location.id, ctx.db(), &ctx).await?;
metadata.db_write_time += start_size_update_time.elapsed();
}
@ -243,7 +247,7 @@ impl Job for IndexerJob {
)
.exec()
.await
.map_err(IndexerError::from)?;
.map_err(indexer::Error::from)?;
Ok(ReturnStatus::Completed(
JobReturn::builder()
@ -254,11 +258,11 @@ impl Job for IndexerJob {
}
}
impl IndexerJob {
impl Indexer {
pub fn new(
location: location_with_indexer_rules::Data,
sub_path: Option<PathBuf>,
) -> Result<Self, IndexerError> {
) -> Result<Self, indexer::Error> {
Ok(Self {
indexer_ruler: location
.indexer_rules
@ -295,12 +299,12 @@ impl IndexerJob {
&mut self,
task_id: TaskId,
any_task_output: Box<dyn AnyTaskOutput>,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Result<Vec<TaskHandle<Error>>, IndexerError> {
) -> Result<Vec<TaskHandle<Error>>, indexer::Error> {
self.metadata.completed_tasks += 1;
job_ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
self.metadata.completed_tasks,
)]);
@ -310,7 +314,7 @@ impl IndexerJob {
*any_task_output
.downcast::<WalkTaskOutput>()
.expect("just checked"),
job_ctx,
ctx,
dispatcher,
)
.await;
@ -319,14 +323,14 @@ impl IndexerJob {
*any_task_output
.downcast::<SaveTaskOutput>()
.expect("just checked"),
job_ctx,
ctx,
);
} else if any_task_output.is::<UpdateTaskOutput>() {
self.process_update_output(
*any_task_output
.downcast::<UpdateTaskOutput>()
.expect("just checked"),
job_ctx,
ctx,
);
} else {
unreachable!("Unexpected task output type: <id='{task_id}'>");
@ -348,9 +352,9 @@ impl IndexerJob {
mut handles,
scan_time,
}: WalkTaskOutput,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Result<Vec<TaskHandle<Error>>, IndexerError> {
) -> Result<Vec<TaskHandle<Error>>, indexer::Error> {
self.metadata.scan_read_time += scan_time;
let (to_create_count, to_update_count) = (to_create.len(), to_update.len());
@ -398,7 +402,7 @@ impl IndexerJob {
let db_delete_time = Instant::now();
self.metadata.removed_count +=
remove_non_existing_file_paths(to_remove, job_ctx.db(), job_ctx.sync()).await?;
remove_non_existing_file_paths(to_remove, ctx.db(), ctx.sync()).await?;
self.metadata.db_write_time += db_delete_time.elapsed();
let save_tasks = to_create
@ -414,8 +418,8 @@ impl IndexerJob {
self.location.id,
self.location.pub_id.clone(),
chunked_saves,
Arc::clone(job_ctx.db()),
Arc::clone(job_ctx.sync()),
Arc::clone(ctx.db()),
Arc::clone(ctx.sync()),
)
})
.collect::<Vec<_>>();
@ -431,8 +435,8 @@ impl IndexerJob {
UpdateTask::new_deep(
chunked_updates,
Arc::clone(job_ctx.db()),
Arc::clone(job_ctx.sync()),
Arc::clone(ctx.db()),
Arc::clone(ctx.sync()),
)
})
.collect::<Vec<_>>();
@ -442,7 +446,7 @@ impl IndexerJob {
self.metadata.total_tasks += handles.len() as u64;
job_ctx.progress(vec![
ctx.progress(vec![
ProgressUpdate::TaskCount(handles.len() as u64),
ProgressUpdate::message(format!(
"Found {to_create_count} new files and {to_update_count} to update"
@ -458,12 +462,12 @@ impl IndexerJob {
saved_count,
save_duration,
}: SaveTaskOutput,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
) {
self.metadata.indexed_count += saved_count;
self.metadata.db_write_time += save_duration;
job_ctx.progress_msg(format!("Saved {saved_count} files"));
ctx.progress_msg(format!("Saved {saved_count} files"));
}
fn process_update_output(
@ -472,25 +476,25 @@ impl IndexerJob {
updated_count,
update_duration,
}: UpdateTaskOutput,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
) {
self.metadata.updated_count += updated_count;
self.metadata.db_write_time += update_duration;
job_ctx.progress_msg(format!("Updated {updated_count} files"));
ctx.progress_msg(format!("Updated {updated_count} files"));
}
async fn process_handles(
&mut self,
pending_running_tasks: &mut FuturesUnordered<TaskHandle<Error>>,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Option<Result<ReturnStatus, Error>> {
while let Some(task) = pending_running_tasks.next().await {
match task {
Ok(TaskStatus::Done((task_id, TaskOutput::Out(out)))) => {
let more_handles = match self
.process_task_output(task_id, out, job_ctx, dispatcher)
.process_task_output(task_id, out, ctx, dispatcher)
.await
{
Ok(more_handles) => more_handles,
@ -538,9 +542,9 @@ impl IndexerJob {
async fn init_or_resume(
&mut self,
pending_running_tasks: &mut FuturesUnordered<TaskHandle<Error>>,
job_ctx: &impl JobContext,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Result<(), IndexerError> {
) -> Result<(), indexer::Error> {
// if we don't have any pending task, then this is a fresh job
if self.pending_tasks_on_resume.is_empty() {
let walker_root_path = Arc::new(
@ -548,7 +552,7 @@ impl IndexerJob {
self.location.id,
&self.sub_path,
&*self.iso_file_path_factory.location_path,
job_ctx.db(),
ctx.db(),
)
.await?,
);
@ -562,7 +566,7 @@ impl IndexerJob {
self.iso_file_path_factory.clone(),
WalkerDBProxy {
location_id: self.location.id,
db: Arc::clone(job_ctx.db()),
db: Arc::clone(ctx.db()),
},
dispatcher.clone(),
)?)
@ -633,12 +637,12 @@ struct SaveState {
ancestors_already_indexed: HashSet<IsolatedFilePathData<'static>>,
paths_and_sizes: HashMap<IsolatedFilePathData<'static>, u64>,
errors: Vec<NonCriticalJobError>,
errors: Vec<NonCriticalError>,
tasks_for_shutdown_bytes: Option<SerializedTasks>,
}
impl SerializableJob for IndexerJob {
impl<Ctx: OuterContext> SerializableJob<Ctx> for Indexer {
async fn serialize(self) -> Result<Option<Vec<u8>>, rmp_serde::encode::Error> {
let Self {
location,
@ -706,7 +710,7 @@ impl SerializableJob for IndexerJob {
async fn deserialize(
serialized_job: &[u8],
_: &impl JobContext,
_: &Ctx,
) -> Result<Option<(Self, Option<SerializedTasks>)>, rmp_serde::decode::Error> {
let SaveState {
location,
@ -744,7 +748,7 @@ impl SerializableJob for IndexerJob {
}
}
impl Hash for IndexerJob {
impl Hash for Indexer {
fn hash<H: Hasher>(&self, state: &mut H) {
self.location.id.hash(state);
if let Some(ref sub_path) = self.sub_path {

View file

@ -1,4 +1,4 @@
use crate::{utils::sub_path::SubPathError, NonCriticalJobError};
use crate::{utils::sub_path, OuterContext};
use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData};
use sd_core_indexer_rules::IndexerRuleError;
@ -8,7 +8,7 @@ use sd_core_prisma_helpers::{
use sd_core_sync::Manager as SyncManager;
use sd_prisma::{
prisma::{file_path, location, PrismaClient, SortOrder},
prisma::{file_path, indexer_rule, location, PrismaClient, SortOrder},
prisma_sync,
};
use sd_sync::OperationFactory;
@ -33,11 +33,10 @@ use serde::{Deserialize, Serialize};
use specta::Type;
use tracing::warn;
mod job;
pub mod job;
mod shallow;
mod tasks;
pub use job::IndexerJob;
pub use shallow::shallow;
use tasks::walker;
@ -46,12 +45,12 @@ use tasks::walker;
const BATCH_SIZE: usize = 1000;
#[derive(thiserror::Error, Debug)]
pub enum IndexerError {
pub enum Error {
// Not Found errors
#[error("indexer rule not found: <id='{0}'>")]
IndexerRuleNotFound(i32),
IndexerRuleNotFound(indexer_rule::id::Type),
#[error(transparent)]
SubPath(#[from] SubPathError),
SubPath(#[from] sub_path::Error),
// Internal Errors
#[error("database Error: {0}")]
@ -72,16 +71,16 @@ pub enum IndexerError {
Rules(#[from] IndexerRuleError),
}
impl From<IndexerError> for rspc::Error {
fn from(err: IndexerError) -> Self {
impl From<Error> for rspc::Error {
fn from(err: Error) -> Self {
match err {
IndexerError::IndexerRuleNotFound(_) => {
Error::IndexerRuleNotFound(_) => {
Self::with_cause(ErrorCode::NotFound, err.to_string(), err)
}
IndexerError::SubPath(sub_path_err) => sub_path_err.into(),
Error::SubPath(sub_path_err) => sub_path_err.into(),
IndexerError::Rules(rule_err) => rule_err.into(),
Error::Rules(rule_err) => rule_err.into(),
_ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err),
}
@ -89,7 +88,7 @@ impl From<IndexerError> for rspc::Error {
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalIndexerError {
pub enum NonCriticalError {
#[error("failed to read directory entry: {0}")]
FailedDirectoryEntry(String),
#[error("failed to fetch metadata: {0}")]
@ -134,7 +133,7 @@ async fn update_directory_sizes(
iso_paths_and_sizes: HashMap<IsolatedFilePathData<'_>, u64, impl BuildHasher + Send>,
db: &PrismaClient,
sync: &SyncManager,
) -> Result<(), IndexerError> {
) -> Result<(), Error> {
let to_sync_and_update = db
._batch(chunk_db_queries(iso_paths_and_sizes.keys(), db))
.await?
@ -160,7 +159,7 @@ async fn update_directory_sizes(
),
))
})
.collect::<Result<Vec<_>, IndexerError>>()?
.collect::<Result<Vec<_>, Error>>()?
.into_iter()
.unzip::<_, _, Vec<_>, Vec<_>>();
@ -169,11 +168,11 @@ async fn update_directory_sizes(
Ok(())
}
async fn update_location_size<InvalidateQuery: Fn(&'static str) + Send + Sync>(
async fn update_location_size(
location_id: location::id::Type,
db: &PrismaClient,
invalidate_query: &InvalidateQuery,
) -> Result<(), IndexerError> {
ctx: &impl OuterContext,
) -> Result<(), Error> {
let total_size = db
.file_path()
.find_many(vec![
@ -201,8 +200,8 @@ async fn update_location_size<InvalidateQuery: Fn(&'static str) + Send + Sync>(
.exec()
.await?;
invalidate_query("locations.list");
invalidate_query("locations.get");
ctx.invalidate_query("locations.list");
ctx.invalidate_query("locations.get");
Ok(())
}
@ -211,7 +210,7 @@ async fn remove_non_existing_file_paths(
to_remove: Vec<file_path_pub_and_cas_ids::Data>,
db: &PrismaClient,
sync: &sd_core_sync::Manager,
) -> Result<u64, IndexerError> {
) -> Result<u64, Error> {
#[allow(clippy::cast_sign_loss)]
let (sync_params, db_params): (Vec<_>, Vec<_>) = to_remove
.into_iter()
@ -248,8 +247,8 @@ async fn reverse_update_directories_sizes(
location_path: impl AsRef<Path> + Send,
db: &PrismaClient,
sync: &SyncManager,
errors: &mut Vec<NonCriticalJobError>,
) -> Result<(), IndexerError> {
errors: &mut Vec<crate::NonCriticalError>,
) -> Result<(), Error> {
let location_path = location_path.as_ref();
let ancestors = base_path
@ -279,7 +278,7 @@ async fn reverse_update_directories_sizes(
IsolatedFilePathData::try_from(file_path)
.map_err(|e| {
errors.push(
NonCriticalIndexerError::MissingFilePathData(format!(
NonCriticalError::MissingFilePathData(format!(
"Found a file_path missing data: <pub_id='{:#?}'>, error: {e:#?}",
from_bytes_to_uuid(&pub_id)
))
@ -345,8 +344,8 @@ async fn compute_sizes(
materialized_paths: Vec<String>,
pub_id_by_ancestor_materialized_path: &mut HashMap<String, (file_path::pub_id::Type, u64)>,
db: &PrismaClient,
errors: &mut Vec<NonCriticalJobError>,
) -> Result<(), IndexerError> {
errors: &mut Vec<crate::NonCriticalError>,
) -> Result<(), Error> {
db.file_path()
.find_many(vec![
file_path::location_id::equals(Some(location_id)),
@ -371,7 +370,7 @@ async fn compute_sizes(
}
} else {
errors.push(
NonCriticalIndexerError::MissingFilePathData(format!(
NonCriticalError::MissingFilePathData(format!(
"Corrupt database possessing a file_path entry without materialized_path: <pub_id='{:#?}'>",
from_bytes_to_uuid(&file_path.pub_id)
))
@ -409,7 +408,7 @@ impl walker::WalkerDBProxy for WalkerDBProxy {
async fn fetch_file_paths(
&self,
found_paths: Vec<file_path::WhereParam>,
) -> Result<Vec<file_path_walker::Data>, IndexerError> {
) -> Result<Vec<file_path_walker::Data>, Error> {
// Each found path is an AND with 4 terms, and SQLite has an expression tree limit of 1000 terms
// so we will use chunks of 200 just to be safe
self.db
@ -435,7 +434,7 @@ impl walker::WalkerDBProxy for WalkerDBProxy {
&self,
parent_iso_file_path: &IsolatedFilePathData<'_>,
unique_location_id_materialized_path_name_extension_params: Vec<file_path::WhereParam>,
) -> Result<Vec<file_path_pub_and_cas_ids::Data>, NonCriticalIndexerError> {
) -> Result<Vec<file_path_pub_and_cas_ids::Data>, NonCriticalError> {
// NOTE: This batch size can be increased if we wish to trade memory for more performance
const BATCH_SIZE: i64 = 1000;
@ -461,7 +460,7 @@ impl walker::WalkerDBProxy for WalkerDBProxy {
.flat_map(|file_paths| file_paths.into_iter().map(|file_path| file_path.id))
.collect::<HashSet<_>>()
})
.map_err(|e| NonCriticalIndexerError::FetchAlreadyExistingFilePathIds(e.to_string()))?;
.map_err(|e| NonCriticalError::FetchAlreadyExistingFilePathIds(e.to_string()))?;
let mut to_remove = vec![];
let mut cursor = 1;
@ -484,7 +483,7 @@ impl walker::WalkerDBProxy for WalkerDBProxy {
.select(file_path_pub_and_cas_ids::select())
.exec()
.await
.map_err(|e| NonCriticalIndexerError::FetchFilePathsToRemove(e.to_string()))?;
.map_err(|e| NonCriticalError::FetchFilePathsToRemove(e.to_string()))?;
#[allow(clippy::cast_possible_truncation)] // Safe because we are using a constant
let should_stop = found.len() < BATCH_SIZE as usize;

View file

@ -1,4 +1,6 @@
use crate::{utils::sub_path::get_full_path_from_sub_path, Error, NonCriticalJobError};
use crate::{
indexer, utils::sub_path::get_full_path_from_sub_path, Error, NonCriticalError, OuterContext,
};
use sd_core_indexer_rules::{IndexerRule, IndexerRuler};
use sd_core_prisma_helpers::location_with_indexer_rules;
@ -25,29 +27,28 @@ use super::{
updater::{UpdateTask, UpdateTaskOutput},
walker::{ToWalkEntry, WalkDirTask, WalkTaskOutput, WalkedEntry},
},
update_directory_sizes, update_location_size, IndexerError, IsoFilePathFactory, WalkerDBProxy,
BATCH_SIZE,
update_directory_sizes, update_location_size, IsoFilePathFactory, WalkerDBProxy, BATCH_SIZE,
};
pub async fn shallow(
location: location_with_indexer_rules::Data,
sub_path: impl AsRef<Path> + Send,
dispatcher: BaseTaskDispatcher<Error>,
db: Arc<PrismaClient>,
sync: Arc<SyncManager>,
invalidate_query: impl Fn(&'static str) + Send + Sync,
) -> Result<Vec<NonCriticalJobError>, Error> {
ctx: impl OuterContext,
) -> Result<Vec<NonCriticalError>, Error> {
let sub_path = sub_path.as_ref();
let db = ctx.db();
let sync = ctx.sync();
let location_path = maybe_missing(&location.path, "location.path")
.map(PathBuf::from)
.map(Arc::new)
.map_err(IndexerError::from)?;
.map_err(indexer::Error::from)?;
let to_walk_path = Arc::new(
get_full_path_from_sub_path(location.id, &Some(sub_path), &*location_path, &db)
get_full_path_from_sub_path(location.id, &Some(sub_path), &*location_path, db)
.await
.map_err(IndexerError::from)?,
.map_err(indexer::Error::from)?,
);
let Some(WalkTaskOutput {
@ -62,7 +63,7 @@ pub async fn shallow(
&location,
Arc::clone(&location_path),
Arc::clone(&to_walk_path),
Arc::clone(&db),
Arc::clone(db),
&dispatcher,
)
.await?
@ -70,7 +71,7 @@ pub async fn shallow(
return Ok(vec![]);
};
let removed_count = remove_non_existing_file_paths(to_remove, &db, &sync).await?;
let removed_count = remove_non_existing_file_paths(to_remove, db, sync).await?;
let Some(Metadata {
indexed_count,
@ -79,8 +80,8 @@ pub async fn shallow(
&location,
to_create,
to_update,
Arc::clone(&db),
Arc::clone(&sync),
Arc::clone(db),
Arc::clone(sync),
&dispatcher,
)
.await?
@ -91,8 +92,8 @@ pub async fn shallow(
if indexed_count > 0 || removed_count > 0 || updated_count > 0 {
update_directory_sizes(
HashMap::from([(directory_iso_file_path, total_size)]),
&db,
&sync,
db,
sync,
)
.await?;
@ -101,18 +102,18 @@ pub async fn shallow(
&*to_walk_path,
location.id,
&*location_path,
&db,
&sync,
db,
sync,
&mut errors,
)
.await?;
}
update_location_size(location.id, &db, &invalidate_query).await?;
update_location_size(location.id, db, &ctx).await?;
}
if indexed_count > 0 || removed_count > 0 {
invalidate_query("search.paths");
ctx.invalidate_query("search.paths");
}
Ok(errors)
@ -135,7 +136,7 @@ async fn walk(
.map(|rule| IndexerRule::try_from(&rule.indexer_rule))
.collect::<Result<Vec<_>, _>>()
.map(IndexerRuler::new)
.map_err(IndexerError::from)?,
.map_err(indexer::Error::from)?,
IsoFilePathFactory {
location_id: location.id,
location_path,

View file

@ -1,4 +1,4 @@
use crate::{indexer::IndexerError, Error};
use crate::{indexer, Error};
use sd_core_file_path_helper::IsolatedFilePathDataParts;
use sd_core_sync::Manager as SyncManager;
@ -234,7 +234,7 @@ impl Task<Error> for SaveTask {
),
)
.await
.map_err(IndexerError::from)? as u64;
.map_err(indexer::Error::from)? as u64;
trace!("Inserted {saved_count} records");

View file

@ -1,4 +1,4 @@
use crate::{indexer::IndexerError, Error};
use crate::{indexer, Error};
use sd_core_file_path_helper::IsolatedFilePathDataParts;
use sd_core_sync::Manager as SyncManager;
@ -222,7 +222,7 @@ impl Task<Error> for UpdateTask {
(sync_stuff.into_iter().flatten().collect(), paths_to_update),
)
.await
.map_err(IndexerError::from)?;
.map_err(indexer::Error::from)?;
trace!("Updated {updated:?} records");
@ -240,7 +240,7 @@ async fn fetch_objects_ids_to_unlink(
walked_entries: &[WalkedEntry],
object_ids_that_should_be_unlinked: &mut HashSet<object::id::Type>,
db: &PrismaClient,
) -> Result<(), IndexerError> {
) -> Result<(), indexer::Error> {
if object_ids_that_should_be_unlinked.is_empty() {
// First we consult which file paths we should unlink
let object_ids = walked_entries

View file

@ -1,7 +1,4 @@
use crate::{
indexer::{IndexerError, NonCriticalIndexerError},
Error, NonCriticalJobError,
};
use crate::{indexer, Error, NonCriticalError};
use sd_core_file_path_helper::{FilePathError, FilePathMetadata, IsolatedFilePathData};
use sd_core_indexer_rules::{IndexerRuler, MetadataForIndexerRules, RuleKind};
@ -111,13 +108,14 @@ pub trait WalkerDBProxy: Clone + Send + Sync + fmt::Debug + 'static {
fn fetch_file_paths(
&self,
found_paths: Vec<file_path::WhereParam>,
) -> impl Future<Output = Result<Vec<file_path_walker::Data>, IndexerError>> + Send;
) -> impl Future<Output = Result<Vec<file_path_walker::Data>, indexer::Error>> + Send;
fn fetch_file_paths_to_remove(
&self,
parent_iso_file_path: &IsolatedFilePathData<'_>,
unique_location_id_materialized_path_name_extension_params: Vec<file_path::WhereParam>,
) -> impl Future<Output = Result<Vec<file_path_pub_and_cas_ids::Data>, NonCriticalIndexerError>> + Send;
) -> impl Future<Output = Result<Vec<file_path_pub_and_cas_ids::Data>, indexer::NonCriticalError>>
+ Send;
}
#[derive(Debug, Serialize, Deserialize)]
@ -141,7 +139,7 @@ pub struct WalkTaskOutput {
pub to_update: Vec<WalkedEntry>,
pub to_remove: Vec<file_path_pub_and_cas_ids::Data>,
pub accepted_ancestors: HashSet<WalkedEntry>,
pub errors: Vec<NonCriticalJobError>,
pub errors: Vec<NonCriticalError>,
pub directory_iso_file_path: IsolatedFilePathData<'static>,
pub total_size: u64,
pub handles: Vec<TaskHandle<Error>>,
@ -160,7 +158,7 @@ struct InnerMetadata {
}
impl InnerMetadata {
fn new(path: impl AsRef<Path>, metadata: &Metadata) -> Result<Self, NonCriticalIndexerError> {
fn new(path: impl AsRef<Path>, metadata: &Metadata) -> Result<Self, indexer::NonCriticalError> {
let FilePathMetadata {
inode,
size_in_bytes,
@ -168,7 +166,7 @@ impl InnerMetadata {
modified_at,
hidden,
} = FilePathMetadata::from_path(path, metadata)
.map_err(|e| NonCriticalIndexerError::FilePathMetadata(e.to_string()))?;
.map_err(|e| indexer::NonCriticalError::FilePathMetadata(e.to_string()))?;
Ok(Self {
is_dir: metadata.is_dir(),
@ -237,7 +235,7 @@ struct WalkDirSaveState {
root: Arc<PathBuf>,
entry_iso_file_path: IsolatedFilePathData<'static>,
stage: WalkerStageSaveState,
errors: Vec<NonCriticalJobError>,
errors: Vec<NonCriticalError>,
scan_time: Duration,
is_shallow: bool,
}
@ -367,7 +365,7 @@ where
db_proxy: DBProxy,
stage: WalkerStage,
maybe_dispatcher: Option<Dispatcher>,
errors: Vec<NonCriticalJobError>,
errors: Vec<NonCriticalError>,
scan_time: Duration,
is_shallow: bool,
}
@ -385,7 +383,7 @@ where
iso_file_path_factory: IsoPathFactory,
db_proxy: DBProxy,
dispatcher: Dispatcher,
) -> Result<Self, IndexerError> {
) -> Result<Self, indexer::Error> {
let entry = entry.into();
Ok(Self {
id: TaskId::new_v4(),
@ -415,7 +413,7 @@ where
indexer_ruler: IndexerRuler,
iso_file_path_factory: IsoPathFactory,
db_proxy: DBProxy,
) -> Result<Self, IndexerError> {
) -> Result<Self, indexer::Error> {
let entry = entry.into();
Ok(Self {
id: TaskId::new_v4(),
@ -545,7 +543,7 @@ where
*stage = WalkerStage::Walking {
read_dir_stream: ReadDirStream::new(fs::read_dir(&path).await.map_err(
|e| {
IndexerError::FileIO(
indexer::Error::FileIO(
(&path, e, "Failed to open directory to read its entries")
.into(),
)
@ -565,8 +563,8 @@ where
found_paths.push(dir_entry.path());
}
Err(e) => {
errors.push(NonCriticalJobError::Indexer(
NonCriticalIndexerError::FailedDirectoryEntry(
errors.push(NonCriticalError::Indexer(
indexer::NonCriticalError::FailedDirectoryEntry(
FileIOError::from((&path, e)).to_string(),
),
));
@ -709,7 +707,7 @@ where
async fn segregate_creates_and_updates(
walking_entries: &mut Vec<WalkingEntry>,
db_proxy: &impl WalkerDBProxy,
) -> Result<(Vec<WalkedEntry>, Vec<WalkedEntry>, u64), IndexerError> {
) -> Result<(Vec<WalkedEntry>, Vec<WalkedEntry>, u64), Error> {
if walking_entries.is_empty() {
Ok((vec![], vec![], 0))
} else {
@ -791,7 +789,7 @@ async fn keep_walking(
db_proxy: &impl WalkerDBProxy,
maybe_to_keep_walking: &mut Option<Vec<ToWalkEntry>>,
dispatcher: &Option<impl TaskDispatcher<Error>>,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> Vec<TaskHandle<Error>> {
if let (Some(dispatcher), Some(to_keep_walking)) = (dispatcher, maybe_to_keep_walking) {
dispatcher
@ -807,7 +805,7 @@ async fn keep_walking(
db_proxy.clone(),
dispatcher.clone(),
)
.map_err(|e| NonCriticalIndexerError::DispatchKeepWalking(e.to_string()))
.map_err(|e| indexer::NonCriticalError::DispatchKeepWalking(e.to_string()))
})
.filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()),
)
@ -819,7 +817,7 @@ async fn keep_walking(
async fn collect_metadata(
found_paths: &mut Vec<PathBuf>,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> HashMap<PathBuf, InnerMetadata> {
found_paths
.drain(..)
@ -827,7 +825,7 @@ async fn collect_metadata(
fs::metadata(&current_path)
.await
.map_err(|e| {
NonCriticalIndexerError::Metadata(
indexer::NonCriticalError::Metadata(
FileIOError::from((&current_path, e)).to_string(),
)
})
@ -847,7 +845,7 @@ async fn collect_metadata(
async fn apply_indexer_rules(
paths_and_metadatas: &mut HashMap<PathBuf, InnerMetadata>,
indexer_ruler: &IndexerRuler,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> HashMap<PathBuf, (InnerMetadata, HashMap<RuleKind, Vec<bool>>)> {
paths_and_metadatas
.drain()
@ -860,7 +858,7 @@ async fn apply_indexer_rules(
.map(|acceptance_per_rule_kind| {
(current_path, (metadata, acceptance_per_rule_kind))
})
.map_err(|e| NonCriticalIndexerError::IndexerRule(e.to_string()))
.map_err(|e| indexer::NonCriticalError::IndexerRule(e.to_string()))
})
.collect::<Vec<_>>()
.join()
@ -879,7 +877,7 @@ async fn process_rules_results(
(InnerMetadata, HashMap<RuleKind, Vec<bool>>),
>,
maybe_to_keep_walking: &mut Option<Vec<ToWalkEntry>>,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> (HashMap<PathBuf, InnerMetadata>, HashSet<WalkedEntry>) {
let root = root.as_ref();
@ -951,7 +949,7 @@ async fn process_rules_results(
fs::metadata(&ancestor_path)
.await
.map_err(|e| {
NonCriticalIndexerError::Metadata(
indexer::NonCriticalError::Metadata(
FileIOError::from((&ancestor_path, e)).to_string(),
)
})
@ -964,7 +962,7 @@ async fn process_rules_results(
}
.into()
})
.map_err(|e| NonCriticalIndexerError::FilePathMetadata(e.to_string()))
.map_err(|e| indexer::NonCriticalError::FilePathMetadata(e.to_string()))
})
})
.collect::<Vec<_>>()
@ -1023,7 +1021,7 @@ fn accept_ancestors(
accepted: &mut HashMap<PathBuf, InnerMetadata>,
iso_file_path_factory: &impl IsoFilePathFactory,
accepted_ancestors: &mut HashMap<IsolatedFilePathData<'static>, PathBuf>,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) {
// If the ancestor directories weren't indexed before, we index them now
for ancestor in current_path
@ -1033,7 +1031,7 @@ fn accept_ancestors(
{
if let Ok(iso_file_path) = iso_file_path_factory
.build(ancestor, true)
.map_err(|e| errors.push(NonCriticalIndexerError::IsoFilePath(e.to_string()).into()))
.map_err(|e| errors.push(indexer::NonCriticalError::IsoFilePath(e.to_string()).into()))
{
match accepted_ancestors.entry(iso_file_path) {
Entry::Occupied(_) => {
@ -1083,7 +1081,7 @@ async fn gather_file_paths_to_remove(
entry_iso_file_path: &IsolatedFilePathData<'_>,
iso_file_path_factory: &impl IsoFilePathFactory,
db_proxy: &impl WalkerDBProxy,
errors: &mut Vec<NonCriticalJobError>,
errors: &mut Vec<NonCriticalError>,
) -> (Vec<WalkingEntry>, Vec<file_path_pub_and_cas_ids::Data>) {
let (walking, to_delete_params) = accepted_paths
.drain()
@ -1102,7 +1100,7 @@ async fn gather_file_paths_to_remove(
)
})
.map_err(|e| {
errors.push(NonCriticalIndexerError::IsoFilePath(e.to_string()).into());
errors.push(indexer::NonCriticalError::IsoFilePath(e.to_string()).into());
})
.ok()
})
@ -1158,7 +1156,7 @@ mod tests {
async fn fetch_file_paths(
&self,
_: Vec<file_path::WhereParam>,
) -> Result<Vec<file_path_walker::Data>, IndexerError> {
) -> Result<Vec<file_path_walker::Data>, indexer::Error> {
Ok(vec![])
}
@ -1166,7 +1164,7 @@ mod tests {
&self,
_: &IsolatedFilePathData<'_>,
_: Vec<file_path::WhereParam>,
) -> Result<Vec<file_path_pub_and_cas_ids::Data>, NonCriticalIndexerError> {
) -> Result<Vec<file_path_pub_and_cas_ids::Data>, indexer::NonCriticalError> {
Ok(vec![])
}
}

View file

@ -1,4 +1,4 @@
use crate::{Error, NonCriticalJobError};
use crate::{Error, NonCriticalError, UpdateEvent};
use sd_core_sync::Manager as SyncManager;
@ -11,6 +11,7 @@ use std::{
collections::{hash_map::DefaultHasher, VecDeque},
hash::{Hash, Hasher},
marker::PhantomData,
path::Path,
pin::pin,
sync::Arc,
};
@ -46,6 +47,7 @@ use super::{
pub enum JobName {
Indexer,
FileIdentifier,
MediaProcessor,
// TODO: Add more job names as needed
}
@ -72,7 +74,7 @@ impl ProgressUpdate {
}
}
pub trait JobContext: Send + Sync + Clone + 'static {
pub trait OuterContext: Send + Sync + Clone + 'static {
fn id(&self) -> Uuid;
fn db(&self) -> &Arc<PrismaClient>;
fn sync(&self) -> &Arc<SyncManager>;
@ -82,6 +84,8 @@ pub trait JobContext: Send + Sync + Clone + 'static {
fn progress_msg(&self, msg: impl Into<String>) {
self.progress(vec![ProgressUpdate::Message(msg.into())]);
}
fn report_update(&self, update: UpdateEvent);
fn get_data_directory(&self) -> &Path;
}
pub trait Job: Send + Sync + Hash + 'static {
@ -91,31 +95,31 @@ pub trait Job: Send + Sync + Hash + 'static {
fn resume_tasks(
&mut self,
dispatcher: &JobTaskDispatcher,
ctx: &impl JobContext,
ctx: &impl OuterContext,
serialized_tasks: SerializedTasks,
) -> impl Future<Output = Result<(), Error>> + Send {
async move { Ok(()) }
}
fn run(
fn run<Ctx: OuterContext>(
self,
dispatcher: JobTaskDispatcher,
ctx: impl JobContext,
ctx: Ctx,
) -> impl Future<Output = Result<ReturnStatus, Error>> + Send;
}
pub trait IntoJob<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
fn into_job(self) -> Box<dyn DynJob<Ctx>>;
}
impl<J, Ctx> IntoJob<J, Ctx> for J
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
fn into_job(self) -> Box<dyn DynJob<Ctx>> {
let id = JobId::new_v4();
@ -132,8 +136,8 @@ where
impl<J, Ctx> IntoJob<J, Ctx> for JobBuilder<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
fn into_job(self) -> Box<dyn DynJob<Ctx>> {
self.build()
@ -144,7 +148,7 @@ where
pub struct JobReturn {
data: JobOutputData,
metadata: Option<ReportOutputMetadata>,
non_critical_errors: Vec<NonCriticalJobError>,
non_critical_errors: Vec<NonCriticalError>,
}
impl JobReturn {
@ -185,7 +189,7 @@ impl JobReturnBuilder {
}
#[must_use]
pub fn with_non_critical_errors(mut self, errors: Vec<NonCriticalJobError>) -> Self {
pub fn with_non_critical_errors(mut self, errors: Vec<NonCriticalError>) -> Self {
if self.job_return.non_critical_errors.is_empty() {
self.job_return.non_critical_errors = errors;
} else {
@ -207,7 +211,7 @@ pub struct JobOutput {
job_name: JobName,
data: JobOutputData,
metadata: Vec<ReportMetadata>,
non_critical_errors: Vec<NonCriticalJobError>,
non_critical_errors: Vec<NonCriticalError>,
}
impl JobOutput {
@ -260,8 +264,8 @@ pub enum JobOutputData {
pub struct JobBuilder<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
id: JobId,
job: J,
@ -272,8 +276,8 @@ where
impl<J, Ctx> JobBuilder<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
pub fn build(self) -> Box<JobHolder<J, Ctx>> {
Box::new(JobHolder {
@ -315,7 +319,7 @@ where
}
#[must_use]
pub fn enqueue_next(mut self, next: impl Job + SerializableJob) -> Self {
pub fn enqueue_next(mut self, next: impl Job + SerializableJob<Ctx>) -> Self {
let next_job_order = self.next_jobs.len() + 1;
let mut child_job_builder = JobBuilder::new(next).with_parent_id(self.id);
@ -333,8 +337,8 @@ where
pub struct JobHolder<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
pub(super) id: JobId,
pub(super) job: J,
@ -343,14 +347,14 @@ where
pub(super) _ctx: PhantomData<Ctx>,
}
pub struct JobHandle<Ctx: JobContext> {
pub struct JobHandle<Ctx: OuterContext> {
pub(crate) next_jobs: VecDeque<Box<dyn DynJob<Ctx>>>,
pub(crate) job_ctx: Ctx,
pub(crate) ctx: Ctx,
pub(crate) report: Report,
pub(crate) commands_tx: chan::Sender<Command>,
}
impl<Ctx: JobContext> JobHandle<Ctx> {
impl<Ctx: OuterContext> JobHandle<Ctx> {
pub async fn send_command(&mut self, command: Command) -> Result<(), JobSystemError> {
if self.commands_tx.send(command).await.is_err() {
warn!("Tried to send a {command:?} to a job that was already completed");
@ -375,7 +379,7 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
next_job_report.status = new_status;
next_job_report.completed_at = completed_at;
next_job_report.update(self.job_ctx.db()).await
next_job_report.update(self.ctx.db()).await
})
.collect::<Vec<_>>()
.try_join()
@ -391,7 +395,7 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
let Self {
next_jobs,
report,
job_ctx,
ctx,
..
} = self;
@ -400,7 +404,7 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
report.started_at = Some(start_time);
}
let db = job_ctx.db();
let db = ctx.db();
// If the report doesn't have a created_at date, it's a new report
if report.created_at.is_none() {
@ -432,21 +436,17 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
&mut self,
job_return: JobReturn,
) -> Result<JobOutput, JobSystemError> {
let Self {
report, job_ctx, ..
} = self;
let Self { report, ctx, .. } = self;
let output = JobOutput::prepare_output_and_report(job_return, report);
report.update(job_ctx.db()).await?;
report.update(ctx.db()).await?;
Ok(output)
}
pub async fn failed_job(&mut self, e: &Error) -> Result<(), JobSystemError> {
let Self {
report, job_ctx, ..
} = self;
let Self { report, ctx, .. } = self;
error!(
"Job<id='{}', name='{}'> failed with a critical error: {e:#?};",
report.id, report.name
@ -456,15 +456,13 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
report.critical_error = Some(e.to_string());
report.completed_at = Some(Utc::now());
report.update(job_ctx.db()).await?;
report.update(ctx.db()).await?;
self.command_children(Command::Cancel).await
}
pub async fn shutdown_pause_job(&mut self) -> Result<(), JobSystemError> {
let Self {
report, job_ctx, ..
} = self;
let Self { report, ctx, .. } = self;
info!(
"Job<id='{}', name='{}'> paused due to system shutdown, we will pause all children jobs",
report.id, report.name
@ -472,15 +470,13 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
report.status = Status::Paused;
report.update(job_ctx.db()).await?;
report.update(ctx.db()).await?;
self.command_children(Command::Pause).await
}
pub async fn cancel_job(&mut self) -> Result<(), JobSystemError> {
let Self {
report, job_ctx, ..
} = self;
let Self { report, ctx, .. } = self;
info!(
"Job<id='{}', name='{}'> canceled, we will cancel all children jobs",
report.id, report.name
@ -489,14 +485,14 @@ impl<Ctx: JobContext> JobHandle<Ctx> {
report.status = Status::Canceled;
report.completed_at = Some(Utc::now());
report.update(job_ctx.db()).await?;
report.update(ctx.db()).await?;
self.command_children(Command::Cancel).await
}
}
#[async_trait::async_trait]
pub trait DynJob<Ctx: JobContext>: Send + Sync + 'static {
pub trait DynJob<Ctx: OuterContext>: Send + Sync + 'static {
fn id(&self) -> JobId;
fn job_name(&self) -> JobName;
@ -514,14 +510,14 @@ pub trait DynJob<Ctx: JobContext>: Send + Sync + 'static {
fn dispatch(
self: Box<Self>,
base_dispatcher: BaseTaskDispatcher<Error>,
job_ctx: Ctx,
ctx: Ctx,
done_tx: chan::Sender<(JobId, Result<ReturnStatus, Error>)>,
) -> JobHandle<Ctx>;
fn resume(
self: Box<Self>,
base_dispatcher: BaseTaskDispatcher<Error>,
job_ctx: Ctx,
ctx: Ctx,
serialized_tasks: Option<SerializedTasks>,
done_tx: chan::Sender<(JobId, Result<ReturnStatus, Error>)>,
) -> JobHandle<Ctx>;
@ -530,8 +526,8 @@ pub trait DynJob<Ctx: JobContext>: Send + Sync + 'static {
#[async_trait::async_trait]
impl<J, Ctx> DynJob<Ctx> for JobHolder<J, Ctx>
where
J: Job + SerializableJob,
Ctx: JobContext,
J: Job + SerializableJob<Ctx>,
Ctx: OuterContext,
{
fn id(&self) -> JobId {
self.id
@ -567,7 +563,7 @@ where
fn dispatch(
self: Box<Self>,
base_dispatcher: BaseTaskDispatcher<Error>,
job_ctx: Ctx,
ctx: Ctx,
done_tx: chan::Sender<(JobId, Result<ReturnStatus, Error>)>,
) -> JobHandle<Ctx> {
let (commands_tx, commands_rx) = chan::bounded(8);
@ -575,7 +571,7 @@ where
spawn(to_spawn_job(
self.id,
self.job,
job_ctx.clone(),
ctx.clone(),
None,
base_dispatcher,
commands_rx,
@ -584,7 +580,7 @@ where
JobHandle {
next_jobs: self.next_jobs,
job_ctx,
ctx,
report: self.report,
commands_tx,
}
@ -593,7 +589,7 @@ where
fn resume(
self: Box<Self>,
base_dispatcher: BaseTaskDispatcher<Error>,
job_ctx: Ctx,
ctx: Ctx,
serialized_tasks: Option<SerializedTasks>,
done_tx: chan::Sender<(JobId, Result<ReturnStatus, Error>)>,
) -> JobHandle<Ctx> {
@ -602,7 +598,7 @@ where
spawn(to_spawn_job(
self.id,
self.job,
job_ctx.clone(),
ctx.clone(),
serialized_tasks,
base_dispatcher,
commands_rx,
@ -611,17 +607,17 @@ where
JobHandle {
next_jobs: self.next_jobs,
job_ctx,
ctx,
report: self.report,
commands_tx,
}
}
}
async fn to_spawn_job<Ctx: JobContext>(
async fn to_spawn_job<Ctx: OuterContext>(
id: JobId,
mut job: impl Job,
job_ctx: Ctx,
ctx: Ctx,
existing_tasks: Option<SerializedTasks>,
base_dispatcher: BaseTaskDispatcher<Error>,
commands_rx: chan::Receiver<Command>,
@ -641,10 +637,7 @@ async fn to_spawn_job<Ctx: JobContext>(
JobTaskDispatcher::new(base_dispatcher, running_state_rx);
if let Some(existing_tasks) = existing_tasks {
if let Err(e) = job
.resume_tasks(&dispatcher, &job_ctx, existing_tasks)
.await
{
if let Err(e) = job.resume_tasks(&dispatcher, &ctx, existing_tasks).await {
done_tx
.send((id, Err(e)))
.await
@ -657,7 +650,7 @@ async fn to_spawn_job<Ctx: JobContext>(
let mut msgs_stream = pin!((
commands_rx.map(StreamMessage::Commands),
remote_controllers_rx.map(StreamMessage::NewRemoteController),
stream::once(job.run(dispatcher, job_ctx)).map(StreamMessage::Done),
stream::once(job.run(dispatcher, ctx)).map(StreamMessage::Done),
)
.merge());

View file

@ -21,7 +21,7 @@ mod store;
pub mod utils;
use error::JobSystemError;
use job::{IntoJob, Job, JobContext, JobName, JobOutput};
use job::{IntoJob, Job, JobName, JobOutput, OuterContext};
use runner::{run, JobSystemRunner, RunnerMessage};
use store::{load_jobs, StoredJobEntry};
@ -38,13 +38,13 @@ pub enum Command {
Cancel,
}
pub struct JobSystem<Ctx: JobContext> {
pub struct JobSystem<Ctx: OuterContext> {
msgs_tx: chan::Sender<RunnerMessage<Ctx>>,
job_outputs_rx: chan::Receiver<(JobId, Result<JobOutput, JobSystemError>)>,
runner_handle: RefCell<Option<JoinHandle<()>>>,
}
impl<Ctx: JobContext> JobSystem<Ctx> {
impl<Ctx: OuterContext> JobSystem<Ctx> {
pub async fn new(
base_dispatcher: BaseTaskDispatcher<Error>,
data_directory: impl AsRef<Path> + Send,
@ -164,11 +164,11 @@ impl<Ctx: JobContext> JobSystem<Ctx> {
/// Dispatch a new job to the system
/// # Panics
/// Panics only happen if internal channels are unexpectedly closed
pub async fn dispatch<J: Job + SerializableJob>(
pub async fn dispatch<J: Job + SerializableJob<Ctx>>(
&mut self,
job: impl IntoJob<J, Ctx> + Send,
location_id: location::id::Type,
job_ctx: Ctx,
ctx: Ctx,
) -> Result<JobId, JobSystemError> {
let dyn_job = job.into_job();
let id = dyn_job.id();
@ -179,7 +179,7 @@ impl<Ctx: JobContext> JobSystem<Ctx> {
id,
location_id,
dyn_job,
job_ctx,
ctx,
ack_tx,
})
.await
@ -230,9 +230,9 @@ impl<Ctx: JobContext> JobSystem<Ctx> {
/// SAFETY: Using a `RefCell` costs us the `Sync` impl, but we only use it for a shutdown method
/// that takes `&self` and is called once, and we use `try_borrow_mut`, so we never panic
unsafe impl<Ctx: JobContext> Sync for JobSystem<Ctx> {}
unsafe impl<Ctx: OuterContext> Sync for JobSystem<Ctx> {}
async fn load_stored_job_entries<Ctx: JobContext>(
async fn load_stored_job_entries<Ctx: OuterContext>(
store_jobs_file: impl AsRef<Path> + Send,
previously_existing_job_contexts: &HashMap<Uuid, Ctx>,
msgs_tx: &chan::Sender<RunnerMessage<Ctx>>,
@ -273,11 +273,11 @@ async fn load_stored_job_entries<Ctx: JobContext>(
res.map_err(|e| error!("Failed to load stored jobs: {e:#?}"))
.ok()
})
.flat_map(|(stored_jobs, job_ctx)| {
.flat_map(|(stored_jobs, ctx)| {
stored_jobs
.into_iter()
.map(move |(location_id, dyn_job, serialized_tasks)| {
let job_ctx = job_ctx.clone();
let ctx = ctx.clone();
async move {
let (ack_tx, ack_rx) = oneshot::channel();
@ -286,7 +286,7 @@ async fn load_stored_job_entries<Ctx: JobContext>(
id: dyn_job.id(),
location_id,
dyn_job,
job_ctx,
ctx,
serialized_tasks,
ack_tx,
})

View file

@ -26,7 +26,7 @@ use tracing::{debug, error, info, warn};
use uuid::Uuid;
use super::{
job::{DynJob, JobContext, JobHandle, JobName, JobOutput, ReturnStatus},
job::{DynJob, JobHandle, JobName, JobOutput, OuterContext, ReturnStatus},
report,
store::{StoredJob, StoredJobEntry},
Command, JobId, JobSystemError, SerializedTasks,
@ -35,19 +35,19 @@ use super::{
const JOBS_INITIAL_CAPACITY: usize = 32;
const FIVE_MINUTES: Duration = Duration::from_secs(5 * 60);
pub(super) enum RunnerMessage<Ctx: JobContext> {
pub(super) enum RunnerMessage<Ctx: OuterContext> {
NewJob {
id: JobId,
location_id: location::id::Type,
dyn_job: Box<dyn DynJob<Ctx>>,
job_ctx: Ctx,
ctx: Ctx,
ack_tx: oneshot::Sender<Result<(), JobSystemError>>,
},
ResumeStoredJob {
id: JobId,
location_id: location::id::Type,
dyn_job: Box<dyn DynJob<Ctx>>,
job_ctx: Ctx,
ctx: Ctx,
serialized_tasks: Option<SerializedTasks>,
ack_tx: oneshot::Sender<Result<(), JobSystemError>>,
},
@ -64,7 +64,7 @@ pub(super) enum RunnerMessage<Ctx: JobContext> {
Shutdown,
}
pub(super) struct JobSystemRunner<Ctx: JobContext> {
pub(super) struct JobSystemRunner<Ctx: OuterContext> {
base_dispatcher: BaseTaskDispatcher<Error>,
handles: HashMap<JobId, JobHandle<Ctx>>,
job_hashes: HashMap<u64, JobId>,
@ -76,7 +76,7 @@ pub(super) struct JobSystemRunner<Ctx: JobContext> {
job_outputs_tx: chan::Sender<(JobId, Result<JobOutput, JobSystemError>)>,
}
impl<Ctx: JobContext> JobSystemRunner<Ctx> {
impl<Ctx: OuterContext> JobSystemRunner<Ctx> {
pub(super) fn new(
base_dispatcher: BaseTaskDispatcher<Error>,
job_return_status_tx: chan::Sender<(JobId, Result<ReturnStatus, Error>)>,
@ -100,7 +100,7 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
id: JobId,
location_id: location::id::Type,
dyn_job: Box<dyn DynJob<Ctx>>,
job_ctx: Ctx,
ctx: Ctx,
maybe_existing_tasks: Option<SerializedTasks>,
) -> Result<(), JobSystemError> {
let Self {
@ -114,7 +114,7 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
..
} = self;
let db = job_ctx.db();
let db = ctx.db();
let job_name = dyn_job.job_name();
let job_hash = dyn_job.hash();
@ -137,14 +137,14 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
let mut handle = if maybe_existing_tasks.is_some() {
dyn_job.resume(
base_dispatcher.clone(),
job_ctx.clone(),
ctx.clone(),
maybe_existing_tasks,
job_return_status_tx.clone(),
)
} else {
dyn_job.dispatch(
base_dispatcher.clone(),
job_ctx.clone(),
ctx.clone(),
job_return_status_tx.clone(),
)
};
@ -169,7 +169,7 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
.map(|dyn_job| dyn_job.report_mut())
.map(|next_job_report| async {
if next_job_report.created_at.is_none() {
next_job_report.create(job_ctx.db()).await
next_job_report.create(ctx.db()).await
} else {
Ok(())
}
@ -277,7 +277,7 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
};
jobs_to_store_by_ctx_id
.entry(handle.job_ctx.id())
.entry(handle.ctx.id())
.or_default()
.push(StoredJobEntry {
location_id,
@ -384,7 +384,7 @@ impl<Ctx: JobContext> JobSystemRunner<Ctx> {
}
}
fn try_dispatch_next_job<Ctx: JobContext>(
fn try_dispatch_next_job<Ctx: OuterContext>(
handle: &mut JobHandle<Ctx>,
base_dispatcher: BaseTaskDispatcher<Error>,
(job_hashes, job_hashes_by_id): (&mut HashMap<u64, JobId>, &mut HashMap<JobId, u64>),
@ -397,11 +397,8 @@ fn try_dispatch_next_job<Ctx: JobContext>(
if let Entry::Vacant(e) = job_hashes.entry(next_hash) {
e.insert(next_id);
job_hashes_by_id.insert(next_id, next_hash);
let mut next_handle = next.dispatch(
base_dispatcher,
handle.job_ctx.clone(),
job_return_status_tx,
);
let mut next_handle =
next.dispatch(base_dispatcher, handle.ctx.clone(), job_return_status_tx);
assert!(
next_handle.next_jobs.is_empty(),
@ -418,13 +415,13 @@ fn try_dispatch_next_job<Ctx: JobContext>(
}
}
pub(super) async fn run<Ctx: JobContext>(
pub(super) async fn run<Ctx: OuterContext>(
mut runner: JobSystemRunner<Ctx>,
store_jobs_file: impl AsRef<Path> + Send,
msgs_rx: chan::Receiver<RunnerMessage<Ctx>>,
job_return_status_rx: chan::Receiver<(JobId, Result<ReturnStatus, Error>)>,
) {
enum StreamMessage<Ctx: JobContext> {
enum StreamMessage<Ctx: OuterContext> {
ReturnStatus((JobId, Result<ReturnStatus, Error>)),
RunnerMessage(RunnerMessage<Ctx>),
CleanMemoryTick,
@ -453,15 +450,11 @@ pub(super) async fn run<Ctx: JobContext>(
id,
location_id,
dyn_job,
job_ctx,
ctx,
ack_tx,
}) => {
ack_tx
.send(
runner
.new_job(id, location_id, dyn_job, job_ctx, None)
.await,
)
.send(runner.new_job(id, location_id, dyn_job, ctx, None).await)
.expect("ack channel closed before sending new job response");
}
@ -469,14 +462,14 @@ pub(super) async fn run<Ctx: JobContext>(
id,
location_id,
dyn_job,
job_ctx,
ctx,
serialized_tasks,
ack_tx,
}) => {
ack_tx
.send(
runner
.new_job(id, location_id, dyn_job, job_ctx, serialized_tasks)
.new_job(id, location_id, dyn_job, ctx, serialized_tasks)
.await,
)
.expect("ack channel closed before sending resume job response");

View file

@ -1,4 +1,4 @@
use crate::{file_identifier::FileIdentifierJob, indexer::IndexerJob};
use crate::{file_identifier, indexer, media_processor};
use sd_prisma::prisma::{job, location};
use sd_utils::uuid_to_bytes;
@ -14,7 +14,7 @@ use futures_concurrency::future::TryJoin;
use serde::{Deserialize, Serialize};
use super::{
job::{DynJob, Job, JobContext, JobHolder, JobName},
job::{DynJob, Job, JobHolder, JobName, OuterContext},
report::{Report, ReportError},
JobId, JobSystemError,
};
@ -22,7 +22,7 @@ use super::{
#[derive(Debug, Serialize, Deserialize)]
pub struct SerializedTasks(pub Vec<u8>);
pub trait SerializableJob: 'static
pub trait SerializableJob<Ctx: OuterContext>: 'static
where
Self: Sized,
{
@ -35,7 +35,7 @@ where
#[allow(unused_variables)]
fn deserialize(
serialized_job: &[u8],
ctx: &impl JobContext,
ctx: &Ctx,
) -> impl Future<
Output = Result<Option<(Self, Option<SerializedTasks>)>, rmp_serde::decode::Error>,
> + Send {
@ -57,9 +57,9 @@ pub struct StoredJobEntry {
pub(super) next_jobs: Vec<StoredJob>,
}
pub async fn load_jobs<Ctx: JobContext>(
pub async fn load_jobs<Ctx: OuterContext>(
entries: Vec<StoredJobEntry>,
job_ctx: &Ctx,
ctx: &Ctx,
) -> Result<
Vec<(
location::id::Type,
@ -68,7 +68,7 @@ pub async fn load_jobs<Ctx: JobContext>(
)>,
JobSystemError,
> {
let mut reports = job_ctx
let mut reports = ctx
.db()
.job()
.find_many(vec![job::id::in_vec(
@ -105,7 +105,7 @@ pub async fn load_jobs<Ctx: JobContext>(
.ok_or(ReportError::MissingReport(root_job.id))?;
Ok(async move {
load_job(root_job, report, job_ctx)
load_job(root_job, report, ctx)
.await
.map(|maybe_loaded_job| {
maybe_loaded_job
@ -135,7 +135,7 @@ pub async fn load_jobs<Ctx: JobContext>(
next_jobs_and_reports
.into_iter()
.map(|(next_job, report)| async move {
load_job(next_job, report, job_ctx)
load_job(next_job, report, ctx)
.await
.map(|maybe_loaded_next_job| {
maybe_loaded_next_job.map(|(next_dyn_job, next_tasks)| {
@ -166,7 +166,7 @@ pub async fn load_jobs<Ctx: JobContext>(
}
macro_rules! match_deserialize_job {
($stored_job:ident, $report:ident, $job_ctx:ident, $ctx_type:ty, [$($job_type:ty),+ $(,)?]) => {{
($stored_job:ident, $report:ident, $ctx:ident, $ctx_type:ty, [$($job_type:ty),+ $(,)?]) => {{
let StoredJob {
id,
name,
@ -175,9 +175,9 @@ macro_rules! match_deserialize_job {
match name {
$(<$job_type as Job>::NAME => <$job_type as SerializableJob>::deserialize(
$(<$job_type as Job>::NAME => <$job_type as SerializableJob<$ctx_type>>::deserialize(
&serialized_job,
$job_ctx,
$ctx,
).await
.map(|maybe_job| maybe_job.map(|(job, tasks)| -> (
Box<dyn DynJob<$ctx_type>>,
@ -200,21 +200,21 @@ macro_rules! match_deserialize_job {
}};
}
async fn load_job<Ctx: JobContext>(
async fn load_job<Ctx: OuterContext>(
stored_job: StoredJob,
report: Report,
job_ctx: &Ctx,
ctx: &Ctx,
) -> Result<Option<(Box<dyn DynJob<Ctx>>, Option<SerializedTasks>)>, JobSystemError> {
match_deserialize_job!(
stored_job,
report,
job_ctx,
ctx,
Ctx,
[
IndexerJob,
FileIdentifierJob,
indexer::job::Indexer,
file_identifier::job::FileIdentifier,
media_processor::job::MediaProcessor,
// TODO: Add more jobs here
// e.g.: FileIdentifierJob, MediaProcessorJob, etc.,
]
)
}

View file

@ -27,6 +27,7 @@
#![forbid(deprecated_in_future)]
#![allow(clippy::missing_errors_doc, clippy::module_name_repetitions)]
use sd_prisma::prisma::file_path;
use sd_task_system::TaskSystemError;
use serde::{Deserialize, Serialize};
@ -36,22 +37,24 @@ use thiserror::Error;
pub mod file_identifier;
pub mod indexer;
pub mod job_system;
pub mod media_processor;
pub mod utils;
use file_identifier::{FileIdentifierError, NonCriticalFileIdentifierError};
use indexer::{IndexerError, NonCriticalIndexerError};
use media_processor::ThumbKey;
pub use job_system::{
job::{IntoJob, JobBuilder, JobContext, JobName, JobOutput, JobOutputData, ProgressUpdate},
job::{IntoJob, JobBuilder, JobName, JobOutput, JobOutputData, OuterContext, ProgressUpdate},
JobId, JobSystem,
};
#[derive(Error, Debug)]
pub enum Error {
#[error(transparent)]
Indexer(#[from] IndexerError),
Indexer(#[from] indexer::Error),
#[error(transparent)]
FileIdentifier(#[from] FileIdentifierError),
FileIdentifier(#[from] file_identifier::Error),
#[error(transparent)]
MediaProcessor(#[from] media_processor::Error),
#[error(transparent)]
TaskSystem(#[from] TaskSystemError),
@ -62,6 +65,7 @@ impl From<Error> for rspc::Error {
match e {
Error::Indexer(e) => e.into(),
Error::FileIdentifier(e) => e.into(),
Error::MediaProcessor(e) => e.into(),
Error::TaskSystem(e) => {
Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e)
}
@ -70,12 +74,14 @@ impl From<Error> for rspc::Error {
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalJobError {
pub enum NonCriticalError {
// TODO: Add variants as needed
#[error(transparent)]
Indexer(#[from] NonCriticalIndexerError),
Indexer(#[from] indexer::NonCriticalError),
#[error(transparent)]
FileIdentifier(#[from] NonCriticalFileIdentifierError),
FileIdentifier(#[from] file_identifier::NonCriticalError),
#[error(transparent)]
MediaProcessor(#[from] media_processor::NonCriticalError),
}
#[repr(i32)]
@ -86,3 +92,13 @@ pub enum LocationScanState {
FilesIdentified = 2,
Completed = 3,
}
#[derive(Debug, Serialize, Type)]
pub enum UpdateEvent {
NewThumbnailEvent {
thumb_key: ThumbKey,
},
NewIdentifiedObjects {
file_path_ids: Vec<file_path::id::Type>,
},
}

View file

@ -0,0 +1,85 @@
use crate::media_processor::{self, media_data_extractor};
use sd_file_ext::extensions::{Extension, ImageExtension, ALL_IMAGE_EXTENSIONS};
use sd_media_metadata::ExifMetadata;
use sd_prisma::prisma::{exif_data, object, PrismaClient};
use std::path::Path;
use once_cell::sync::Lazy;
pub static AVAILABLE_EXTENSIONS: Lazy<Vec<Extension>> = Lazy::new(|| {
ALL_IMAGE_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_extract(ext))
.map(Extension::Image)
.collect()
});
pub const fn can_extract(image_extension: ImageExtension) -> bool {
use ImageExtension::{
Avci, Avcs, Avif, Dng, Heic, Heif, Heifs, Hif, Jpeg, Jpg, Png, Tiff, Webp,
};
matches!(
image_extension,
Tiff | Dng | Jpeg | Jpg | Heif | Heifs | Heic | Avif | Avcs | Avci | Hif | Png | Webp
)
}
pub fn to_query(
mdi: ExifMetadata,
object_id: exif_data::object_id::Type,
) -> exif_data::CreateUnchecked {
exif_data::CreateUnchecked {
object_id,
_params: vec![
exif_data::camera_data::set(serde_json::to_vec(&mdi.camera_data).ok()),
exif_data::media_date::set(serde_json::to_vec(&mdi.date_taken).ok()),
exif_data::resolution::set(serde_json::to_vec(&mdi.resolution).ok()),
exif_data::media_location::set(serde_json::to_vec(&mdi.location).ok()),
exif_data::artist::set(mdi.artist),
exif_data::description::set(mdi.description),
exif_data::copyright::set(mdi.copyright),
exif_data::exif_version::set(mdi.exif_version),
exif_data::epoch_time::set(mdi.date_taken.map(|x| x.unix_timestamp())),
],
}
}
pub async fn extract(
path: impl AsRef<Path> + Send,
) -> Result<Option<ExifMetadata>, media_processor::NonCriticalError> {
let path = path.as_ref();
ExifMetadata::from_path(&path).await.map_err(|e| {
media_data_extractor::NonCriticalError::FailedToExtractImageMediaData(
path.to_path_buf(),
e.to_string(),
)
.into()
})
}
pub async fn save(
media_datas: Vec<(ExifMetadata, object::id::Type)>,
db: &PrismaClient,
) -> Result<u64, media_processor::Error> {
db.exif_data()
.create_many(
media_datas
.into_iter()
.map(|(exif_data, object_id)| to_query(exif_data, object_id))
.collect(),
)
.skip_duplicates()
.exec()
.await
.map(|created| {
#[allow(clippy::cast_sign_loss)]
{
created as u64
}
})
.map_err(Into::into)
}

View file

@ -0,0 +1,572 @@
use crate::media_processor::{self, media_data_extractor};
use sd_file_ext::extensions::{
AudioExtension, Extension, VideoExtension, ALL_AUDIO_EXTENSIONS, ALL_VIDEO_EXTENSIONS,
};
use sd_media_metadata::{
ffmpeg::{
audio_props::AudioProps,
chapter::Chapter,
codec::{Codec, Props},
metadata::Metadata,
program::Program,
stream::Stream,
video_props::VideoProps,
},
FFmpegMetadata,
};
use sd_prisma::prisma::{
ffmpeg_data, ffmpeg_media_audio_props, ffmpeg_media_chapter, ffmpeg_media_codec,
ffmpeg_media_program, ffmpeg_media_stream, ffmpeg_media_video_props, object, PrismaClient,
};
use sd_utils::db::ffmpeg_data_field_to_db;
use std::{collections::HashMap, path::Path};
use futures_concurrency::future::TryJoin;
use once_cell::sync::Lazy;
use prisma_client_rust::QueryError;
use tracing::error;
pub static AVAILABLE_EXTENSIONS: Lazy<Vec<Extension>> = Lazy::new(|| {
ALL_AUDIO_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_extract_for_audio(ext))
.map(Extension::Audio)
.chain(
ALL_VIDEO_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_extract_for_video(ext))
.map(Extension::Video),
)
.collect()
});
pub const fn can_extract_for_audio(audio_extension: AudioExtension) -> bool {
use AudioExtension::{
Aac, Adts, Aif, Aiff, Amr, Aptx, Ast, Caf, Flac, Loas, M4a, Mid, Mp2, Mp3, Oga, Ogg, Opus,
Tta, Voc, Wav, Wma, Wv,
};
matches!(
audio_extension,
Mp3 | Mp2
| M4a | Wav | Aiff
| Aif | Flac | Ogg
| Oga | Opus | Wma
| Amr | Aac | Wv
| Voc | Tta | Loas
| Caf | Aptx | Adts
| Ast | Mid
)
}
pub const fn can_extract_for_video(video_extension: VideoExtension) -> bool {
use VideoExtension::{
Asf, Avi, Avifs, F4v, Flv, Hevc, M2ts, M2v, M4v, Mjpeg, Mkv, Mov, Mp4, Mpe, Mpeg, Mpg, Mts,
Mxf, Ogv, Qt, Swf, Ts, Vob, Webm, Wm, Wmv, Wtv, _3gp,
};
matches!(
video_extension,
Avi | Avifs
| Qt | Mov | Swf
| Mjpeg | Ts | Mts
| Mpeg | Mxf | M2v
| Mpg | Mpe | M2ts
| Flv | Wm | _3gp
| M4v | Wmv | Asf
| Mp4 | Webm | Mkv
| Vob | Ogv | Wtv
| Hevc | F4v
)
}
pub async fn extract(
path: impl AsRef<Path> + Send,
) -> Result<FFmpegMetadata, media_processor::NonCriticalError> {
let path = path.as_ref();
FFmpegMetadata::from_path(&path).await.map_err(|e| {
media_data_extractor::NonCriticalError::FailedToExtractImageMediaData(
path.to_path_buf(),
e.to_string(),
)
.into()
})
}
pub async fn save(
ffmpeg_datas: impl IntoIterator<Item = (FFmpegMetadata, object::id::Type)> + Send,
db: &PrismaClient,
) -> Result<u64, media_processor::Error> {
ffmpeg_datas
.into_iter()
.map(
move |(
FFmpegMetadata {
formats,
duration,
start_time,
bit_rate,
chapters,
programs,
metadata,
},
object_id,
)| {
db._transaction()
.with_timeout(30 * 1000)
.run(move |db| async move {
let data_id = create_ffmpeg_data(
formats, bit_rate, duration, start_time, metadata, object_id, &db,
)
.await?;
create_ffmpeg_chapters(data_id, chapters, &db).await?;
let streams = create_ffmpeg_programs(data_id, programs, &db).await?;
let codecs = create_ffmpeg_streams(data_id, streams, &db).await?;
let (audio_props, video_props) =
create_ffmpeg_codecs(data_id, codecs, &db).await?;
(
create_ffmpeg_audio_props(audio_props, &db),
create_ffmpeg_video_props(video_props, &db),
)
.try_join()
.await
.map(|_| ())
})
},
)
.collect::<Vec<_>>()
.try_join()
.await
.map(|created| created.len() as u64)
.map_err(Into::into)
}
async fn create_ffmpeg_data(
formats: Vec<String>,
(bit_rate_high, bit_rate_low): (i32, u32),
maybe_duration: Option<(i32, u32)>,
maybe_start_time: Option<(i32, u32)>,
metadata: Metadata,
object_id: i32,
db: &PrismaClient,
) -> Result<ffmpeg_data::id::Type, QueryError> {
db.ffmpeg_data()
.create(
formats.join(","),
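// These 64-bit values arrive split into (high, low) 32-bit halves, so we recombine them into a
// single i64 before converting them for the DB; duration and start_time below use the same packing.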
ffmpeg_data_field_to_db(i64::from(bit_rate_high) << 32 | i64::from(bit_rate_low)),
object::id::equals(object_id),
vec![
ffmpeg_data::duration::set(maybe_duration.map(|(duration_high, duration_low)| {
ffmpeg_data_field_to_db(
i64::from(duration_high) << 32 | i64::from(duration_low),
)
})),
ffmpeg_data::start_time::set(maybe_start_time.map(
|(start_time_high, start_time_low)| {
ffmpeg_data_field_to_db(
i64::from(start_time_high) << 32 | i64::from(start_time_low),
)
},
)),
ffmpeg_data::metadata::set(
serde_json::to_vec(&metadata)
.map_err(|err| {
error!("Error reading FFmpegData metadata: {err:#?}");
err
})
.ok(),
),
],
)
.select(ffmpeg_data::select!({ id }))
.exec()
.await
.map(|data| data.id)
}
async fn create_ffmpeg_chapters(
ffmpeg_data_id: ffmpeg_data::id::Type,
chapters: Vec<Chapter>,
db: &PrismaClient,
) -> Result<(), QueryError> {
db.ffmpeg_media_chapter()
.create_many(
chapters
.into_iter()
.map(
|Chapter {
id: chapter_id,
start: (start_high, start_low),
end: (end_high, end_low),
time_base_den,
time_base_num,
metadata,
}| ffmpeg_media_chapter::CreateUnchecked {
chapter_id,
start: ffmpeg_data_field_to_db(
i64::from(start_high) << 32 | i64::from(start_low),
),
end: ffmpeg_data_field_to_db(
i64::from(end_high) << 32 | i64::from(end_low),
),
time_base_den,
time_base_num,
ffmpeg_data_id,
_params: vec![ffmpeg_media_chapter::metadata::set(
serde_json::to_vec(&metadata)
.map_err(|err| {
error!("Error reading FFmpegMediaChapter metadata: {err:#?}");
err
})
.ok(),
)],
},
)
.collect(),
)
.exec()
.await
.map(|_| ())
}
async fn create_ffmpeg_programs(
data_id: i32,
programs: Vec<Program>,
db: &PrismaClient,
) -> Result<Vec<(ffmpeg_media_program::program_id::Type, Vec<Stream>)>, QueryError> {
let (creates, streams_by_program_id) =
programs
.into_iter()
.map(
|Program {
id: program_id,
name,
metadata,
streams,
}| {
(
ffmpeg_media_program::CreateUnchecked {
program_id,
ffmpeg_data_id: data_id,
_params: vec![
ffmpeg_media_program::name::set(name),
ffmpeg_media_program::metadata::set(
serde_json::to_vec(&metadata)
.map_err(|err| {
error!("Error reading FFmpegMediaProgram metadata: {err:#?}");
err
})
.ok(),
),
],
},
(program_id, streams),
)
},
)
.unzip::<_, _, Vec<_>, Vec<_>>();
db.ffmpeg_media_program()
.create_many(creates)
.exec()
.await
.map(|_| streams_by_program_id)
}
async fn create_ffmpeg_streams(
ffmpeg_data_id: ffmpeg_data::id::Type,
streams: Vec<(ffmpeg_media_program::program_id::Type, Vec<Stream>)>,
db: &PrismaClient,
) -> Result<
Vec<(
ffmpeg_media_program::program_id::Type,
ffmpeg_media_stream::stream_id::Type,
Codec,
)>,
QueryError,
> {
let (creates, maybe_codecs) = streams
.into_iter()
.flat_map(|(program_id, streams)| {
streams.into_iter().map(
move |Stream {
id: stream_id,
name,
codec: maybe_codec,
aspect_ratio_num,
aspect_ratio_den,
frames_per_second_num,
frames_per_second_den,
time_base_real_den,
time_base_real_num,
dispositions,
metadata,
}| {
(
ffmpeg_media_stream::CreateUnchecked {
stream_id,
aspect_ratio_num,
aspect_ratio_den,
frames_per_second_num,
frames_per_second_den,
time_base_real_den,
time_base_real_num,
program_id,
ffmpeg_data_id,
_params: vec![
ffmpeg_media_stream::name::set(name),
ffmpeg_media_stream::dispositions::set(
(!dispositions.is_empty()).then_some(dispositions.join(",")),
),
ffmpeg_media_stream::title::set(metadata.title.clone()),
ffmpeg_media_stream::encoder::set(metadata.encoder.clone()),
ffmpeg_media_stream::language::set(metadata.language.clone()),
ffmpeg_media_stream::metadata::set(
serde_json::to_vec(&metadata)
.map_err(|err| {
error!("Error reading FFmpegMediaStream metadata: {err:#?}");
err
})
.ok(),
),
],
},
maybe_codec.map(|codec| (program_id, stream_id, codec)),
)
},
)
})
.unzip::<_, _, Vec<_>, Vec<_>>();
db.ffmpeg_media_stream()
.create_many(creates)
.exec()
.await
.map(|_| maybe_codecs.into_iter().flatten().collect())
}
async fn create_ffmpeg_codecs(
ffmpeg_data_id: ffmpeg_data::id::Type,
codecs: Vec<(
ffmpeg_media_program::program_id::Type,
ffmpeg_media_stream::stream_id::Type,
Codec,
)>,
db: &PrismaClient,
) -> Result<
(
Vec<(ffmpeg_media_codec::id::Type, AudioProps)>,
Vec<(ffmpeg_media_codec::id::Type, VideoProps)>,
),
QueryError,
> {
let expected_creates = codecs.len();
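// Collect the create params while remembering, by positional index, which codec carries audio or
// video props; after the inserts we match these indices against the returned IDs, relying on the
// creation order checked by the assertions below.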
let (creates, mut audio_props, mut video_props) = codecs.into_iter().enumerate().fold(
(
Vec::with_capacity(expected_creates),
HashMap::with_capacity(expected_creates),
HashMap::with_capacity(expected_creates),
),
|(mut creates, mut audio_props, mut video_props),
(
idx,
(
program_id,
stream_id,
Codec {
kind,
sub_kind,
tag,
name,
profile,
bit_rate,
props: maybe_props,
},
),
)| {
creates.push(ffmpeg_media_codec::CreateUnchecked {
bit_rate,
stream_id,
program_id,
ffmpeg_data_id,
_params: vec![
ffmpeg_media_codec::kind::set(kind),
ffmpeg_media_codec::sub_kind::set(sub_kind),
ffmpeg_media_codec::tag::set(tag),
ffmpeg_media_codec::name::set(name),
ffmpeg_media_codec::profile::set(profile),
],
});
if let Some(props) = maybe_props {
match props {
Props::Audio(props) => {
audio_props.insert(idx, props);
}
Props::Video(props) => {
video_props.insert(idx, props);
}
Props::Subtitle(_) => {
// We don't care about subtitles props for now :D
}
}
}
(creates, audio_props, video_props)
},
);
let created_ids = creates
.into_iter()
.map(
|ffmpeg_media_codec::CreateUnchecked {
bit_rate,
stream_id,
program_id,
ffmpeg_data_id,
_params: params,
}| {
db.ffmpeg_media_codec()
.create_unchecked(bit_rate, stream_id, program_id, ffmpeg_data_id, params)
.select(ffmpeg_media_codec::select!({ id }))
.exec()
},
)
.collect::<Vec<_>>()
.try_join()
.await?;
assert_eq!(
created_ids.len(),
expected_creates,
"Not all codecs were created and our invariant is broken!"
);
debug_assert!(
created_ids
.windows(2)
.all(|window| window[0].id < window[1].id),
"Codecs were created in a different order than we expected, our invariant is broken!"
);
Ok(created_ids.into_iter().enumerate().fold(
(
Vec::with_capacity(audio_props.len()),
Vec::with_capacity(video_props.len()),
),
|(mut a_props, mut v_props), (idx, codec_data)| {
if let Some(audio_props) = audio_props.remove(&idx) {
a_props.push((codec_data.id, audio_props));
} else if let Some(video_props) = video_props.remove(&idx) {
v_props.push((codec_data.id, video_props));
}
(a_props, v_props)
},
))
}
async fn create_ffmpeg_audio_props(
audio_props: Vec<(ffmpeg_media_codec::id::Type, AudioProps)>,
db: &PrismaClient,
) -> Result<(), QueryError> {
db.ffmpeg_media_audio_props()
.create_many(
audio_props
.into_iter()
.map(
|(
codec_id,
AudioProps {
delay,
padding,
sample_rate,
sample_format,
bit_per_sample,
channel_layout,
},
)| ffmpeg_media_audio_props::CreateUnchecked {
delay,
padding,
codec_id,
_params: vec![
ffmpeg_media_audio_props::sample_rate::set(sample_rate),
ffmpeg_media_audio_props::sample_format::set(sample_format),
ffmpeg_media_audio_props::bit_per_sample::set(bit_per_sample),
ffmpeg_media_audio_props::channel_layout::set(channel_layout),
],
},
)
.collect(),
)
.exec()
.await
.map(|_| ())
}
async fn create_ffmpeg_video_props(
video_props: Vec<(ffmpeg_media_codec::id::Type, VideoProps)>,
db: &PrismaClient,
) -> Result<(), QueryError> {
db.ffmpeg_media_video_props()
.create_many(
video_props
.into_iter()
.map(
|(
codec_id,
VideoProps {
pixel_format,
color_range,
bits_per_channel,
color_space,
color_primaries,
color_transfer,
field_order,
chroma_location,
width,
height,
aspect_ratio_num,
aspect_ratio_den,
properties,
},
)| {
ffmpeg_media_video_props::CreateUnchecked {
width,
height,
codec_id,
_params: vec![
ffmpeg_media_video_props::pixel_format::set(pixel_format),
ffmpeg_media_video_props::color_range::set(color_range),
ffmpeg_media_video_props::bits_per_channel::set(bits_per_channel),
ffmpeg_media_video_props::color_space::set(color_space),
ffmpeg_media_video_props::color_primaries::set(color_primaries),
ffmpeg_media_video_props::color_transfer::set(color_transfer),
ffmpeg_media_video_props::field_order::set(field_order),
ffmpeg_media_video_props::chroma_location::set(chroma_location),
ffmpeg_media_video_props::aspect_ratio_num::set(aspect_ratio_num),
ffmpeg_media_video_props::aspect_ratio_den::set(aspect_ratio_den),
ffmpeg_media_video_props::properties::set(Some(
properties.join(","),
)),
],
}
},
)
.collect(),
)
.exec()
.await
.map(|_| ())
}

View file

@ -0,0 +1,3 @@
pub mod exif_media_data;
pub mod ffmpeg_media_data;
pub mod thumbnailer;

View file

@ -0,0 +1,135 @@
use once_cell::sync::Lazy;
use sd_file_ext::extensions::{
DocumentExtension, Extension, ImageExtension, ALL_DOCUMENT_EXTENSIONS, ALL_IMAGE_EXTENSIONS,
};
#[cfg(feature = "ffmpeg")]
use sd_file_ext::extensions::{VideoExtension, ALL_VIDEO_EXTENSIONS};
use std::time::Duration;
use serde::{Deserialize, Serialize};
use specta::Type;
use uuid::Uuid;
// Files names constants
pub const THUMBNAIL_CACHE_DIR_NAME: &str = "thumbnails";
pub const WEBP_EXTENSION: &str = "webp";
pub const EPHEMERAL_DIR: &str = "ephemeral";
/// This is the target pixel count that all thumbnails are resized to; the result is then
/// encoded at [`TARGET_QUALITY`].
pub const TARGET_PX: f32 = 1_048_576.0; // 1024x1024
/// This is the target quality that we render thumbnails at. It is a float between 0 and 100,
/// treated as a percentage (60% in this case, i.e. the same as multiplying by `0.6`).
pub const TARGET_QUALITY: f32 = 60.0;
/// How much time we allow for the thumbnail generation process to complete before we give up.
pub const THUMBNAIL_GENERATION_TIMEOUT: Duration = Duration::from_secs(60);
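As a rough sketch of how a pixel-count target like `TARGET_PX` is typically applied (an illustration only, not the thumbnailer's actual implementation; the helper name is made up), a uniform scale factor can be derived so the output lands near one megapixel without upscaling smaller sources:

/// Sketch: compute output dimensions for a thumbnail of roughly `TARGET_PX` pixels.
fn sketch_thumbnail_dimensions(width: u32, height: u32) -> (u32, u32) {
    let px = width as f32 * height as f32;
    // sqrt because the factor applies to both dimensions; min(1.0) avoids upscaling small images
    let scale = (TARGET_PX / px).sqrt().min(1.0);
    (
        (width as f32 * scale).round() as u32,
        (height as f32 * scale).round() as u32,
    )
}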
#[cfg(feature = "ffmpeg")]
pub static THUMBNAILABLE_VIDEO_EXTENSIONS: Lazy<Vec<Extension>> = Lazy::new(|| {
ALL_VIDEO_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_generate_thumbnail_for_video(ext))
.map(Extension::Video)
.collect()
});
pub static THUMBNAILABLE_EXTENSIONS: Lazy<Vec<Extension>> = Lazy::new(|| {
ALL_IMAGE_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_generate_thumbnail_for_image(ext))
.map(Extension::Image)
.chain(
ALL_DOCUMENT_EXTENSIONS
.iter()
.copied()
.filter(|&ext| can_generate_thumbnail_for_document(ext))
.map(Extension::Document),
)
.collect()
});
pub static ALL_THUMBNAILABLE_EXTENSIONS: Lazy<Vec<Extension>> = Lazy::new(|| {
#[cfg(feature = "ffmpeg")]
return THUMBNAILABLE_EXTENSIONS
.iter()
.cloned()
.chain(THUMBNAILABLE_VIDEO_EXTENSIONS.iter().cloned())
.collect();
#[cfg(not(feature = "ffmpeg"))]
THUMBNAILABLE_EXTENSIONS.clone()
});
/// This type is used to pass the relevant data to the frontend so it can request the thumbnail.
/// It supports extending the shard hex to allow deeper directory structures in the future.
#[derive(Debug, Serialize, Deserialize, Type)]
pub struct ThumbKey {
pub shard_hex: String,
pub cas_id: String,
pub base_directory_str: String,
}
impl ThumbKey {
#[must_use]
pub fn new(cas_id: &str, kind: &ThumbnailKind) -> Self {
Self {
shard_hex: get_shard_hex(cas_id).to_string(),
cas_id: cas_id.to_string(),
base_directory_str: match kind {
ThumbnailKind::Ephemeral => String::from(EPHEMERAL_DIR),
ThumbnailKind::Indexed(library_id) => library_id.to_string(),
},
}
}
}
#[derive(Debug, Serialize, Deserialize, Type, Clone, Copy)]
pub enum ThumbnailKind {
Ephemeral,
Indexed(Uuid),
}
/// The practice of dividing files into hex coded folders, often called "sharding,"
/// is mainly used to optimize file system performance. File systems can start to slow down
/// as the number of files in a directory increases. Thus, it's often beneficial to split
/// files into multiple directories to avoid this performance degradation.
///
/// `get_shard_hex` takes a `cas_id` (a hexadecimal hash) as input and returns the first
/// three characters of the hash as the directory name. Because we're using these first
/// three characters of the hash, this will give us 4096 (16^3) possible directories,
/// named 000 to fff.
#[inline]
pub fn get_shard_hex(cas_id: &str) -> &str {
// Use the first three characters of the hash as the directory name
&cas_id[0..3]
}
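A quick usage sketch tying `get_shard_hex` and `ThumbKey` together; the on-disk path in the comment is an assumption inferred from `THUMBNAIL_CACHE_DIR_NAME`, `WEBP_EXTENSION`, and `ThumbKey`, not a documented layout:

// The first three hex chars of the cas_id select one of 4096 (16^3) shard directories.
assert_eq!(get_shard_hex("a1b2c3d4e5f6"), "a1b");

// For an indexed library the key presumably resolves to something like:
//   <data_dir>/thumbnails/<library_id>/a1b/a1b2c3d4e5f6.webp
let key = ThumbKey::new("a1b2c3d4e5f6", &ThumbnailKind::Indexed(Uuid::new_v4()));
assert_eq!(key.shard_hex, "a1b");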
#[cfg(feature = "ffmpeg")]
pub const fn can_generate_thumbnail_for_video(video_extension: VideoExtension) -> bool {
use VideoExtension::{Hevc, M2ts, M2v, Mpg, Mts, Swf, Ts};
// File extensions that are specifically not supported by the thumbnailer
!matches!(video_extension, Mpg | Swf | M2v | Hevc | M2ts | Mts | Ts)
}
pub const fn can_generate_thumbnail_for_image(image_extension: ImageExtension) -> bool {
use ImageExtension::{
Avif, Bmp, Gif, Heic, Heics, Heif, Heifs, Ico, Jpeg, Jpg, Png, Svg, Webp,
};
matches!(
image_extension,
Jpg | Jpeg | Png | Webp | Gif | Svg | Heic | Heics | Heif | Heifs | Avif | Bmp | Ico
)
}
pub const fn can_generate_thumbnail_for_document(document_extension: DocumentExtension) -> bool {
use DocumentExtension::Pdf;
matches!(document_extension, Pdf)
}

View file

@ -0,0 +1,825 @@
use crate::{
job_system::{
job::{Job, JobReturn, JobTaskDispatcher, ReturnStatus},
report::ReportOutputMetadata,
utils::cancel_pending_tasks,
SerializableJob, SerializedTasks,
},
media_processor::{self, helpers::thumbnailer::THUMBNAIL_CACHE_DIR_NAME},
utils::sub_path::{self, maybe_get_iso_file_path_from_sub_path},
Error, JobName, LocationScanState, OuterContext, ProgressUpdate,
};
use sd_core_file_path_helper::IsolatedFilePathData;
use sd_core_prisma_helpers::file_path_for_media_processor;
use sd_file_ext::extensions::Extension;
use sd_prisma::prisma::{location, PrismaClient};
use sd_task_system::{
AnyTaskOutput, IntoTask, SerializableTask, Task, TaskDispatcher, TaskHandle, TaskOutput,
TaskStatus,
};
use sd_utils::db::maybe_missing;
use std::{
collections::HashMap,
fmt,
hash::{Hash, Hasher},
mem,
path::PathBuf,
sync::Arc,
time::Duration,
};
use futures::{stream::FuturesUnordered, StreamExt};
use futures_concurrency::future::TryJoin;
use itertools::Itertools;
use prisma_client_rust::{raw, PrismaValue};
use serde::{Deserialize, Serialize};
use serde_json::json;
use tracing::{debug, warn};
use super::{
helpers,
tasks::{self, media_data_extractor, thumbnailer},
NewThumbnailsReporter, BATCH_SIZE,
};
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
enum TaskKind {
MediaDataExtractor,
Thumbnailer,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
enum Phase {
MediaDataExtraction,
ThumbnailGeneration,
// LabelsGeneration, // TODO: Implement labels generation
}
impl Default for Phase {
fn default() -> Self {
Self::MediaDataExtraction
}
}
impl fmt::Display for Phase {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::MediaDataExtraction => write!(f, "media_data"),
Self::ThumbnailGeneration => write!(f, "thumbnails"),
// Self::LabelsGeneration => write!(f, "labels"), // TODO: Implement labels generation
}
}
}
#[derive(Debug)]
pub struct MediaProcessor {
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
sub_path: Option<PathBuf>,
regenerate_thumbnails: bool,
total_media_data_extraction_tasks: u64,
total_thumbnailer_tasks: u64,
total_thumbnailer_files: u64,
phase: Phase,
metadata: Metadata,
errors: Vec<crate::NonCriticalError>,
pending_tasks_on_resume: Vec<TaskHandle<Error>>,
tasks_for_shutdown: Vec<Box<dyn Task<Error>>>,
}
impl Job for MediaProcessor {
const NAME: JobName = JobName::MediaProcessor;
async fn resume_tasks(
&mut self,
dispatcher: &JobTaskDispatcher,
ctx: &impl OuterContext,
SerializedTasks(serialized_tasks): SerializedTasks,
) -> Result<(), Error> {
let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() });
self.pending_tasks_on_resume = dispatcher
.dispatch_many_boxed(
rmp_serde::from_slice::<Vec<(TaskKind, Vec<u8>)>>(&serialized_tasks)
.map_err(media_processor::Error::from)?
.into_iter()
.map(|(task_kind, task_bytes)| {
let reporter = Arc::clone(&reporter);
async move {
match task_kind {
TaskKind::MediaDataExtractor => {
tasks::MediaDataExtractor::deserialize(
&task_bytes,
Arc::clone(ctx.db()),
)
.await
.map(IntoTask::into_task)
}
TaskKind::Thumbnailer => tasks::Thumbnailer::deserialize(
&task_bytes,
Arc::clone(&reporter),
)
.await
.map(IntoTask::into_task),
}
}
})
.collect::<Vec<_>>()
.try_join()
.await
.map_err(media_processor::Error::from)?,
)
.await;
Ok(())
}
async fn run<Ctx: OuterContext>(
mut self,
dispatcher: JobTaskDispatcher,
ctx: Ctx,
) -> Result<ReturnStatus, Error> {
let mut pending_running_tasks = FuturesUnordered::new();
self.init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher)
.await?;
if let Some(res) = self.process_handles(&mut pending_running_tasks, &ctx).await {
return res;
}
if !self.tasks_for_shutdown.is_empty() {
return Ok(ReturnStatus::Shutdown(
SerializableJob::<Ctx>::serialize(self).await,
));
}
// From this point onward, we are done with the job and it can't be interrupted anymore
let Self {
location,
metadata,
errors,
..
} = self;
ctx.db()
.location()
.update(
location::id::equals(location.id),
vec![location::scan_state::set(
LocationScanState::Completed as i32,
)],
)
.exec()
.await
.map_err(media_processor::Error::from)?;
Ok(ReturnStatus::Completed(
JobReturn::builder()
.with_metadata(metadata)
.with_non_critical_errors(errors)
.build(),
))
}
}
impl MediaProcessor {
pub fn new(
location: location::Data,
sub_path: Option<PathBuf>,
regenerate_thumbnails: bool,
) -> Result<Self, media_processor::Error> {
Ok(Self {
location_path: maybe_missing(&location.path, "location.path")
.map(PathBuf::from)
.map(Arc::new)?,
location: Arc::new(location),
sub_path,
regenerate_thumbnails,
total_media_data_extraction_tasks: 0,
total_thumbnailer_tasks: 0,
total_thumbnailer_files: 0,
phase: Phase::default(),
metadata: Metadata::default(),
errors: Vec::new(),
pending_tasks_on_resume: Vec::new(),
tasks_for_shutdown: Vec::new(),
})
}
async fn init_or_resume(
&mut self,
pending_running_tasks: &mut FuturesUnordered<TaskHandle<Error>>,
ctx: &impl OuterContext,
dispatcher: &JobTaskDispatcher,
) -> Result<(), media_processor::Error> {
// If we don't have any pending tasks, then this is a fresh job
if self.pending_tasks_on_resume.is_empty() {
let location_id = self.location.id;
let location_path = &*self.location_path;
let iso_file_path = maybe_get_iso_file_path_from_sub_path(
location_id,
&self.sub_path,
&*self.location_path,
ctx.db(),
)
.await?
.map_or_else(
|| {
IsolatedFilePathData::new(location_id, location_path, location_path, true)
.map_err(sub_path::Error::from)
},
Ok,
)?;
debug!(
"Searching for media files in location {location_id} at directory \"{iso_file_path}\""
);
// First we dispatch all media data extraction tasks so that progress reporting is accurate
let (total_media_data_extraction_files, task_handles) =
dispatch_media_data_extractor_tasks(
ctx.db(),
&iso_file_path,
&self.location_path,
dispatcher,
)
.await?;
self.total_media_data_extraction_tasks = task_handles.len() as u64;
pending_running_tasks.extend(task_handles);
ctx.progress(vec![
ProgressUpdate::TaskCount(total_media_data_extraction_files),
ProgressUpdate::Phase(self.phase.to_string()),
ProgressUpdate::Message(format!(
"Preparing to process {total_media_data_extraction_files} files in {} chunks",
self.total_media_data_extraction_tasks
)),
]);
// Now we dispatch thumbnailer tasks
let (total_thumbnailer_tasks, task_handles) = dispatch_thumbnailer_tasks(
&iso_file_path,
self.regenerate_thumbnails,
&self.location_path,
dispatcher,
ctx,
)
.await?;
pending_running_tasks.extend(task_handles);
self.total_thumbnailer_tasks = total_thumbnailer_tasks;
} else {
pending_running_tasks.extend(mem::take(&mut self.pending_tasks_on_resume));
}
Ok(())
}
async fn process_handles(
&mut self,
pending_running_tasks: &mut FuturesUnordered<TaskHandle<Error>>,
ctx: &impl OuterContext,
) -> Option<Result<ReturnStatus, Error>> {
while let Some(task) = pending_running_tasks.next().await {
match task {
Ok(TaskStatus::Done((task_id, TaskOutput::Out(out)))) => {
self.process_task_output(task_id, out, ctx);
}
Ok(TaskStatus::Done((task_id, TaskOutput::Empty))) => {
warn!("Task <id='{task_id}'> returned an empty output");
}
Ok(TaskStatus::Shutdown(task)) => {
self.tasks_for_shutdown.push(task);
}
Ok(TaskStatus::Error(e)) => {
cancel_pending_tasks(&*pending_running_tasks).await;
return Some(Err(e));
}
Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion) => {
cancel_pending_tasks(&*pending_running_tasks).await;
return Some(Ok(ReturnStatus::Canceled));
}
Err(e) => {
cancel_pending_tasks(&*pending_running_tasks).await;
return Some(Err(e.into()));
}
}
}
None
}
fn process_task_output(
&mut self,
task_id: uuid::Uuid,
any_task_output: Box<dyn AnyTaskOutput>,
ctx: &impl OuterContext,
) {
if any_task_output.is::<media_data_extractor::Output>() {
let media_data_extractor::Output {
extracted,
skipped,
db_read_time,
filtering_time,
extraction_time,
db_write_time,
errors,
} = *any_task_output.downcast().expect("just checked");
self.metadata.media_data_metrics.extracted += extracted;
self.metadata.media_data_metrics.skipped += skipped;
self.metadata.media_data_metrics.db_read_time += db_read_time;
self.metadata.media_data_metrics.filtering_time += filtering_time;
self.metadata.media_data_metrics.extraction_time += extraction_time;
self.metadata.media_data_metrics.db_write_time += db_write_time;
self.metadata.media_data_metrics.total_successful_tasks += 1;
self.errors.extend(errors);
debug!(
"Processed {}/{} media data extraction tasks",
self.metadata.media_data_metrics.total_successful_tasks,
self.total_media_data_extraction_tasks
);
ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
self.metadata.media_data_metrics.extracted
+ self.metadata.media_data_metrics.skipped,
)]);
if self.total_media_data_extraction_tasks
== self.metadata.media_data_metrics.total_successful_tasks
{
debug!("All media data extraction tasks have been processed");
self.phase = Phase::ThumbnailGeneration;
ctx.progress(vec![
ProgressUpdate::TaskCount(self.total_thumbnailer_files),
ProgressUpdate::Phase(self.phase.to_string()),
ProgressUpdate::Message(format!(
"Waiting for processing of {} thumbnails in {} tasks",
self.total_thumbnailer_files, self.total_thumbnailer_tasks
)),
]);
}
} else if any_task_output.is::<thumbnailer::Output>() {
let thumbnailer::Output {
generated,
skipped,
errors,
total_time,
mean_time_acc,
std_dev_acc,
} = *any_task_output.downcast().expect("just checked");
self.metadata.thumbnailer_metrics_acc.generated += generated;
self.metadata.thumbnailer_metrics_acc.skipped += skipped;
self.metadata.thumbnailer_metrics_acc.total_time += total_time;
self.metadata.thumbnailer_metrics_acc.mean_time_acc += mean_time_acc;
self.metadata.thumbnailer_metrics_acc.std_dev_acc += std_dev_acc;
self.metadata.thumbnailer_metrics_acc.total_successful_tasks += 1;
self.errors.extend(errors);
ctx.progress(vec![ProgressUpdate::CompletedTaskCount(
self.metadata.thumbnailer_metrics_acc.generated
+ self.metadata.thumbnailer_metrics_acc.skipped,
)]);
// if self.total_thumbnailer_tasks
// == self.metadata.thumbnailer_metrics_acc.total_successful_tasks
// {
// debug!("All thumbnailer tasks have been processed");
// self.phase = Phase::LabelsGeneration;
// ctx.progress(vec![
// ProgressUpdate::TaskCount(self.total_thumbnailer_files),
// ProgressUpdate::Phase(self.phase.to_string()),
// ProgressUpdate::Message(format!(
// "Waiting for processing of {} labels in {} tasks",
// self.total_labeller_files, self.total_labeller_tasks
// )),
// ]);
// }
} else {
unreachable!("Unexpected task output type: <id='{task_id}'>");
}
}
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct Metadata {
media_data_metrics: MediaExtractorMetrics,
thumbnailer_metrics_acc: ThumbnailerMetricsAccumulator,
}
impl From<Metadata> for ReportOutputMetadata {
fn from(
Metadata {
media_data_metrics,
thumbnailer_metrics_acc: thumbnailer_metrics_accumulator,
}: Metadata,
) -> Self {
let thumbnailer_metrics = ThumbnailerMetrics::from(thumbnailer_metrics_accumulator);
Self::Metrics(HashMap::from([
//
// Media data extractor
//
(
"media_data_extraction_metrics".into(),
json!(media_data_metrics),
),
//
// Thumbnailer
//
("thumbnailer_metrics".into(), json!(thumbnailer_metrics)),
]))
}
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct MediaExtractorMetrics {
extracted: u64,
skipped: u64,
db_read_time: Duration,
filtering_time: Duration,
extraction_time: Duration,
db_write_time: Duration,
total_successful_tasks: u64,
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct ThumbnailerMetricsAccumulator {
generated: u64,
skipped: u64,
total_time: Duration,
mean_time_acc: f64,
std_dev_acc: f64,
total_successful_tasks: u64,
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct ThumbnailerMetrics {
generated: u64,
skipped: u64,
total_generation_time: Duration,
mean_generation_time: Duration,
std_dev: Duration,
total_successful_tasks: u64,
}
impl From<ThumbnailerMetricsAccumulator> for ThumbnailerMetrics {
fn from(
ThumbnailerMetricsAccumulator {
generated,
skipped,
total_time: total_generation_time,
mean_time_acc: mean_generation_time_acc,
std_dev_acc,
total_successful_tasks,
}: ThumbnailerMetricsAccumulator,
) -> Self {
#[allow(clippy::cast_precision_loss)]
// SAFETY: we almost certainly won't have 2^52 thumbnails generated in a single job, so this
// cast won't have a precision loss issue
let total = (generated + skipped) as f64;
let mean_generation_time = mean_generation_time_acc / total;
let std_dev = Duration::from_secs_f64(
(mean_generation_time.mul_add(-mean_generation_time, std_dev_acc / total)).sqrt(),
);
Self {
generated,
skipped,
total_generation_time,
mean_generation_time: Duration::from_secs_f64(mean_generation_time),
std_dev,
total_successful_tasks,
}
}
}
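// Illustrative sketch (not part of the diff): the accumulators above carry Σx (`mean_time_acc`)
// and Σx² (`std_dev_acc`) of the per-thumbnail generation times in seconds, so the final
// metrics follow from mean = Σx / n and variance = Σx² / n - mean². The numbers below are made up.
#[allow(dead_code)]
fn summarize(sum: f64, sum_of_squares: f64, n: u64) -> (f64, f64) {
	#[allow(clippy::cast_precision_loss)]
	let n = n as f64;
	let mean = sum / n;
	let std_dev = (sum_of_squares / n - mean * mean).sqrt();
	(mean, std_dev)
}
// e.g. three thumbnails taking 0.1s, 0.2s and 0.3s:
// summarize(0.6, 0.14, 3) == (0.2, ~0.0816)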
#[derive(Serialize, Deserialize)]
struct SaveState {
location: Arc<location::Data>,
location_path: Arc<PathBuf>,
sub_path: Option<PathBuf>,
regenerate_thumbnails: bool,
total_media_data_extraction_tasks: u64,
total_thumbnailer_tasks: u64,
total_thumbnailer_files: u64,
phase: Phase,
metadata: Metadata,
errors: Vec<crate::NonCriticalError>,
tasks_for_shutdown_bytes: Option<SerializedTasks>,
}
impl<Ctx: OuterContext> SerializableJob<Ctx> for MediaProcessor {
async fn serialize(self) -> Result<Option<Vec<u8>>, rmp_serde::encode::Error> {
let Self {
location,
location_path,
sub_path,
regenerate_thumbnails,
total_media_data_extraction_tasks,
total_thumbnailer_tasks,
total_thumbnailer_files,
phase,
metadata,
errors,
tasks_for_shutdown,
..
} = self;
rmp_serde::to_vec_named(&SaveState {
location,
location_path,
sub_path,
regenerate_thumbnails,
total_media_data_extraction_tasks,
total_thumbnailer_tasks,
total_thumbnailer_files,
phase,
metadata,
tasks_for_shutdown_bytes: Some(SerializedTasks(rmp_serde::to_vec_named(
&tasks_for_shutdown
.into_iter()
.map(|task| async move {
if task.is::<tasks::MediaDataExtractor>() {
task.downcast::<tasks::MediaDataExtractor>()
.expect("just checked")
.serialize()
.await
.map(|bytes| (TaskKind::MediaDataExtractor, bytes))
} else if task.is::<tasks::Thumbnailer<NewThumbnailsReporter<Ctx>>>() {
task.downcast::<tasks::Thumbnailer<NewThumbnailsReporter<Ctx>>>()
.expect("just checked")
.serialize()
.await
.map(|bytes| (TaskKind::Thumbnailer, bytes))
} else {
unreachable!("Unexpected task type")
}
})
.collect::<Vec<_>>()
.try_join()
.await?,
)?)),
errors,
})
.map(Some)
}
async fn deserialize(
serialized_job: &[u8],
_: &Ctx,
) -> Result<Option<(Self, Option<SerializedTasks>)>, rmp_serde::decode::Error> {
let SaveState {
location,
location_path,
sub_path,
regenerate_thumbnails,
total_media_data_extraction_tasks,
total_thumbnailer_tasks,
total_thumbnailer_files,
phase,
metadata,
errors,
tasks_for_shutdown_bytes,
} = rmp_serde::from_slice::<SaveState>(serialized_job)?;
Ok(Some((
Self {
location,
location_path,
sub_path,
regenerate_thumbnails,
total_media_data_extraction_tasks,
total_thumbnailer_tasks,
total_thumbnailer_files,
phase,
metadata,
errors,
pending_tasks_on_resume: Vec::new(),
tasks_for_shutdown: Vec::new(),
},
tasks_for_shutdown_bytes,
)))
}
}
impl Hash for MediaProcessor {
fn hash<H: Hasher>(&self, state: &mut H) {
self.location.id.hash(state);
if let Some(ref sub_path) = self.sub_path {
sub_path.hash(state);
}
}
}
async fn dispatch_media_data_extractor_tasks(
db: &Arc<PrismaClient>,
parent_iso_file_path: &IsolatedFilePathData<'_>,
location_path: &Arc<PathBuf>,
dispatcher: &JobTaskDispatcher,
) -> Result<(u64, Vec<TaskHandle<Error>>), media_processor::Error> {
let (extract_exif_file_paths, extract_ffmpeg_file_paths) = (
get_all_children_files_by_extensions(
db,
parent_iso_file_path,
&helpers::exif_media_data::AVAILABLE_EXTENSIONS,
),
get_all_children_files_by_extensions(
db,
parent_iso_file_path,
&helpers::ffmpeg_media_data::AVAILABLE_EXTENSIONS,
),
)
.try_join()
.await?;
let files_count = (extract_exif_file_paths.len() + extract_ffmpeg_file_paths.len()) as u64;
let tasks = extract_exif_file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(Iterator::collect::<Vec<_>>)
.map(|chunked_file_paths| {
tasks::MediaDataExtractor::new_exif(
&chunked_file_paths,
parent_iso_file_path.location_id(),
Arc::clone(location_path),
Arc::clone(db),
)
})
.map(IntoTask::into_task)
.chain(
extract_ffmpeg_file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(Iterator::collect::<Vec<_>>)
.map(|chunked_file_paths| {
tasks::MediaDataExtractor::new_ffmpeg(
&chunked_file_paths,
parent_iso_file_path.location_id(),
Arc::clone(location_path),
Arc::clone(db),
)
})
.map(IntoTask::into_task),
)
.collect::<Vec<_>>();
Ok((files_count, dispatcher.dispatch_many_boxed(tasks).await))
}
async fn get_all_children_files_by_extensions(
db: &PrismaClient,
parent_iso_file_path: &IsolatedFilePathData<'_>,
extensions: &[Extension],
) -> Result<Vec<file_path_for_media_processor::Data>, media_processor::Error> {
// FIXME: Had to use the format! macro because PCR doesn't support IN with Vec for SQLite.
// No data here comes from the user, so this is safe from SQL injection.
db._query_raw(raw!(
&format!(
"SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id
FROM file_path
WHERE
location_id={{}}
AND cas_id IS NOT NULL
AND LOWER(extension) IN ({})
AND materialized_path LIKE {{}}
ORDER BY materialized_path ASC",
// Ordering by materialized_path so we can prioritize processing the files
// in the upper parts of the directory tree first
extensions
.iter()
.map(|ext| format!("LOWER('{ext}')"))
.collect::<Vec<_>>()
.join(",")
),
PrismaValue::Int(i64::from(parent_iso_file_path.location_id())),
PrismaValue::String(format!(
"{}%",
parent_iso_file_path
.materialized_path_for_children()
.expect("sub path iso_file_path must be a directory")
))
))
.exec()
.await
.map_err(Into::into)
}
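// For reference, with a hypothetical extension list of ["jpg", "png"], location id 1 and a
// sub path materializing to "/photos/", the query above expands to roughly the following
// (illustrative only, parameters shown inline):
//
//   SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id
//   FROM file_path
//   WHERE
//       location_id = 1
//       AND cas_id IS NOT NULL
//       AND LOWER(extension) IN (LOWER('jpg'), LOWER('png'))
//       AND materialized_path LIKE '/photos/%'
//   ORDER BY materialized_path ASC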
async fn dispatch_thumbnailer_tasks(
parent_iso_file_path: &IsolatedFilePathData<'_>,
should_regenerate: bool,
location_path: &PathBuf,
dispatcher: &JobTaskDispatcher,
ctx: &impl OuterContext,
) -> Result<(u64, Vec<TaskHandle<Error>>), media_processor::Error> {
let thumbnails_directory_path =
Arc::new(ctx.get_data_directory().join(THUMBNAIL_CACHE_DIR_NAME));
let location_id = parent_iso_file_path.location_id();
let library_id = ctx.id();
let db = ctx.db();
let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() });
let mut file_paths = get_all_children_files_by_extensions(
db,
parent_iso_file_path,
&helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS,
)
.await?;
if file_paths.is_empty() {
return Ok((0, Vec::new()));
}
let thumbs_count = file_paths.len() as u64;
let first_materialized_path = file_paths[0].materialized_path.clone();
// Only the first materialized_path is processed with priority, as the user must see those thumbnails ASAP
let different_materialized_path_idx = file_paths
.iter()
.position(|file_path| file_path.materialized_path != first_materialized_path);
let non_priority_tasks = different_materialized_path_idx
.map(|idx| {
file_paths
.drain(idx..)
.chunks(BATCH_SIZE)
.into_iter()
.map(|chunk| {
tasks::Thumbnailer::new_indexed(
Arc::clone(&thumbnails_directory_path),
&chunk.collect::<Vec<_>>(),
(location_id, location_path),
library_id,
should_regenerate,
false,
Arc::clone(&reporter),
)
})
.map(IntoTask::into_task)
.collect::<Vec<_>>()
})
.unwrap_or_default();
let priority_tasks = file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(|chunk| {
tasks::Thumbnailer::new_indexed(
Arc::clone(&thumbnails_directory_path),
&chunk.collect::<Vec<_>>(),
(location_id, location_path),
library_id,
should_regenerate,
true,
Arc::clone(&reporter),
)
})
.map(IntoTask::into_task)
.collect::<Vec<_>>();
debug!(
"Dispatching {thumbs_count} thumbnails to be processed, {} with priority and {} without priority tasks",
priority_tasks.len(),
non_priority_tasks.len()
);
Ok((
thumbs_count,
dispatcher
.dispatch_many_boxed(priority_tasks.into_iter().chain(non_priority_tasks))
.await,
))
}
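// Minimal sketch (illustrative, not part of the diff) of the priority split used above: the
// rows come back ordered by materialized_path, so every entry sharing the first entry's
// directory is dispatched with priority and the remainder without.
#[allow(dead_code)]
fn split_priority(mut paths: Vec<String>) -> (Vec<String>, Vec<String>) {
	let Some(first_dir) = paths.first().cloned() else {
		return (Vec::new(), Vec::new());
	};
	// Index of the first entry living outside the first directory, if any
	let split_idx = paths.iter().position(|path| *path != first_dir);
	let non_priority = split_idx
		.map(|idx| paths.split_off(idx))
		.unwrap_or_default();
	(paths, non_priority)
}
// split_priority(vec!["/a/".into(), "/a/".into(), "/a/b/".into()])
//     == (vec!["/a/".into(), "/a/".into()], vec!["/a/b/".into()])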

View file

@ -0,0 +1,73 @@
use crate::{utils::sub_path, OuterContext, UpdateEvent};
use sd_core_file_path_helper::FilePathError;
use sd_utils::db::MissingFieldError;
use std::fmt;
use serde::{Deserialize, Serialize};
use specta::Type;
mod helpers;
pub mod job;
mod shallow;
mod tasks;
pub use tasks::{
media_data_extractor::{self, MediaDataExtractor},
thumbnailer::{self, Thumbnailer},
};
pub use helpers::thumbnailer::{ThumbKey, ThumbnailKind};
pub use shallow::shallow;
use self::thumbnailer::NewThumbnailReporter;
const BATCH_SIZE: usize = 10;
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("missing field on database: {0}")]
MissingField(#[from] MissingFieldError),
#[error("database error: {0}")]
Database(#[from] prisma_client_rust::QueryError),
#[error("failed to deserialized stored tasks for job resume: {0}")]
DeserializeTasks(#[from] rmp_serde::decode::Error),
#[error(transparent)]
FilePathError(#[from] FilePathError),
#[error(transparent)]
SubPath(#[from] sub_path::Error),
}
impl From<Error> for rspc::Error {
fn from(e: Error) -> Self {
Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e)
}
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalError {
#[error(transparent)]
MediaDataExtractor(#[from] media_data_extractor::NonCriticalError),
#[error(transparent)]
Thumbnailer(#[from] thumbnailer::NonCriticalError),
}
struct NewThumbnailsReporter<Ctx: OuterContext> {
ctx: Ctx,
}
impl<Ctx: OuterContext> fmt::Debug for NewThumbnailsReporter<Ctx> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("NewThumbnailsReporter").finish()
}
}
impl<Ctx: OuterContext> NewThumbnailReporter for NewThumbnailsReporter<Ctx> {
fn new_thumbnail(&self, thumb_key: ThumbKey) {
self.ctx
.report_update(UpdateEvent::NewThumbnailEvent { thumb_key });
}
}
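// Illustrative only: a minimal no-op reporter sketch for tests or headless contexts,
// assuming the `NewThumbnailReporter` trait from the thumbnailer task module above.
#[derive(Debug)]
#[allow(dead_code)]
struct NoopThumbnailsReporter;

impl NewThumbnailReporter for NoopThumbnailsReporter {
	fn new_thumbnail(&self, _thumb_key: ThumbKey) {
		// Intentionally drop the event; useful when no frontend is listening.
	}
}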

View file

@ -0,0 +1,258 @@
use crate::{
media_processor, utils::sub_path::maybe_get_iso_file_path_from_sub_path, Error,
NonCriticalError, OuterContext,
};
use sd_core_file_path_helper::IsolatedFilePathData;
use sd_core_prisma_helpers::file_path_for_media_processor;
use sd_file_ext::extensions::Extension;
use sd_prisma::prisma::{location, PrismaClient};
use sd_task_system::{
BaseTaskDispatcher, CancelTaskOnDrop, IntoTask, TaskDispatcher, TaskHandle, TaskOutput,
TaskStatus,
};
use sd_utils::db::maybe_missing;
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use futures::StreamExt;
use futures_concurrency::future::{FutureGroup, TryJoin};
use itertools::Itertools;
use prisma_client_rust::{raw, PrismaValue};
use tracing::{debug, warn};
use super::{
helpers::{self, exif_media_data, ffmpeg_media_data, thumbnailer::THUMBNAIL_CACHE_DIR_NAME},
tasks::{self, media_data_extractor, thumbnailer},
NewThumbnailsReporter, BATCH_SIZE,
};
#[allow(clippy::missing_panics_doc)] // SAFETY: it doesn't actually panic
pub async fn shallow(
location: location::Data,
sub_path: impl AsRef<Path> + Send,
dispatcher: BaseTaskDispatcher<Error>,
ctx: impl OuterContext,
) -> Result<Vec<NonCriticalError>, Error> {
let sub_path = sub_path.as_ref();
let location_path = maybe_missing(&location.path, "location.path")
.map(PathBuf::from)
.map(Arc::new)
.map_err(media_processor::Error::from)?;
let location = Arc::new(location);
let sub_iso_file_path = maybe_get_iso_file_path_from_sub_path(
location.id,
&Some(sub_path),
&*location_path,
ctx.db(),
)
.await
.map_err(media_processor::Error::from)?
.map_or_else(
|| {
IsolatedFilePathData::new(location.id, &*location_path, &*location_path, true)
.map_err(media_processor::Error::from)
},
Ok,
)?;
let mut errors = vec![];
let mut futures = dispatch_media_data_extractor_tasks(
ctx.db(),
&sub_iso_file_path,
&location_path,
&dispatcher,
)
.await?
.into_iter()
.map(CancelTaskOnDrop)
.chain(
dispatch_thumbnailer_tasks(&sub_iso_file_path, false, &location_path, &dispatcher, &ctx)
.await?
.into_iter()
.map(CancelTaskOnDrop),
)
.collect::<FutureGroup<_>>();
while let Some(res) = futures.next().await {
match res {
Ok(TaskStatus::Done((_, TaskOutput::Out(out)))) => {
if out.is::<media_data_extractor::Output>() {
errors.extend(
out.downcast::<media_data_extractor::Output>()
.expect("just checked")
.errors,
);
} else if out.is::<thumbnailer::Output>() {
errors.extend(
out.downcast::<thumbnailer::Output>()
.expect("just checked")
.errors,
);
} else {
unreachable!(
"Task returned unexpected output type on media processor shallow job"
);
}
}
Ok(TaskStatus::Done((_, TaskOutput::Empty))) => {
warn!("Task returned empty output on media processor shallow job");
}
Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion | TaskStatus::Shutdown(_)) => {
return Ok(errors);
}
Ok(TaskStatus::Error(e)) => return Err(e),
Err(e) => return Err(e.into()),
}
}
Ok(errors)
}
async fn dispatch_media_data_extractor_tasks(
db: &Arc<PrismaClient>,
parent_iso_file_path: &IsolatedFilePathData<'_>,
location_path: &Arc<PathBuf>,
dispatcher: &BaseTaskDispatcher<Error>,
) -> Result<Vec<TaskHandle<Error>>, media_processor::Error> {
let (extract_exif_file_paths, extract_ffmpeg_file_paths) = (
get_files_by_extensions(
db,
parent_iso_file_path,
&exif_media_data::AVAILABLE_EXTENSIONS,
),
get_files_by_extensions(
db,
parent_iso_file_path,
&ffmpeg_media_data::AVAILABLE_EXTENSIONS,
),
)
.try_join()
.await?;
let tasks = extract_exif_file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(Iterator::collect::<Vec<_>>)
.map(|chunked_file_paths| {
tasks::MediaDataExtractor::new_exif(
&chunked_file_paths,
parent_iso_file_path.location_id(),
Arc::clone(location_path),
Arc::clone(db),
)
})
.map(IntoTask::into_task)
.chain(
extract_ffmpeg_file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(Iterator::collect::<Vec<_>>)
.map(|chunked_file_paths| {
tasks::MediaDataExtractor::new_ffmpeg(
&chunked_file_paths,
parent_iso_file_path.location_id(),
Arc::clone(location_path),
Arc::clone(db),
)
})
.map(IntoTask::into_task),
)
.collect::<Vec<_>>();
Ok(dispatcher.dispatch_many_boxed(tasks).await)
}
async fn get_files_by_extensions(
db: &PrismaClient,
parent_iso_file_path: &IsolatedFilePathData<'_>,
extensions: &[Extension],
) -> Result<Vec<file_path_for_media_processor::Data>, media_processor::Error> {
// FIXME: Had to use the format! macro because PCR doesn't support IN with Vec for SQLite.
// No data here comes from the user, so this is safe from SQL injection.
db._query_raw(raw!(
&format!(
"SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id
FROM file_path
WHERE
location_id={{}}
AND cas_id IS NOT NULL
AND LOWER(extension) IN ({})
AND materialized_path = {{}}",
extensions
.iter()
.map(|ext| format!("LOWER('{ext}')"))
.collect::<Vec<_>>()
.join(",")
),
PrismaValue::Int(i64::from(parent_iso_file_path.location_id())),
PrismaValue::String(
parent_iso_file_path
.materialized_path_for_children()
.expect("sub path iso_file_path must be a directory")
)
))
.exec()
.await
.map_err(Into::into)
}
async fn dispatch_thumbnailer_tasks(
parent_iso_file_path: &IsolatedFilePathData<'_>,
should_regenerate: bool,
location_path: &PathBuf,
dispatcher: &BaseTaskDispatcher<Error>,
ctx: &impl OuterContext,
) -> Result<Vec<TaskHandle<Error>>, media_processor::Error> {
let thumbnails_directory_path =
Arc::new(ctx.get_data_directory().join(THUMBNAIL_CACHE_DIR_NAME));
let location_id = parent_iso_file_path.location_id();
let library_id = ctx.id();
let db = ctx.db();
let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() });
let file_paths = get_files_by_extensions(
db,
parent_iso_file_path,
&helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS,
)
.await?;
let thumbs_count = file_paths.len() as u64;
let tasks = file_paths
.into_iter()
.chunks(BATCH_SIZE)
.into_iter()
.map(|chunk| {
tasks::Thumbnailer::new_indexed(
Arc::clone(&thumbnails_directory_path),
&chunk.collect::<Vec<_>>(),
(location_id, location_path),
library_id,
should_regenerate,
true,
Arc::clone(&reporter),
)
})
.map(IntoTask::into_task)
.collect::<Vec<_>>();
debug!(
"Dispatching {thumbs_count} thumbnails to be processed, in {} priority tasks",
tasks.len(),
);
Ok(dispatcher.dispatch_many_boxed(tasks).await)
}
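// Illustrative usage sketch (not part of the diff): running a shallow media pass over a
// single directory of a location, e.g. after an explorer navigation event. The sub path
// and error handling here are made up.
#[allow(dead_code)]
async fn refresh_directory_example(
	location: location::Data,
	dispatcher: BaseTaskDispatcher<Error>,
	ctx: impl OuterContext,
) -> Result<(), Error> {
	let non_critical_errors = shallow(location, "Photos/2024", dispatcher, ctx).await?;
	if !non_critical_errors.is_empty() {
		warn!(
			"Shallow media processor finished with {} non-critical errors",
			non_critical_errors.len()
		);
	}
	Ok(())
}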

View file

@ -0,0 +1,525 @@
use crate::{
media_processor::{
self,
helpers::{exif_media_data, ffmpeg_media_data},
},
Error,
};
use sd_core_file_path_helper::IsolatedFilePathData;
use sd_core_prisma_helpers::file_path_for_media_processor;
use sd_media_metadata::{ExifMetadata, FFmpegMetadata};
use sd_prisma::prisma::{exif_data, ffmpeg_data, file_path, location, object, PrismaClient};
use sd_task_system::{
check_interruption, ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput,
SerializableTask, Task, TaskId,
};
use std::{
collections::{HashMap, HashSet},
future::{Future, IntoFuture},
mem,
path::{Path, PathBuf},
pin::pin,
sync::Arc,
time::Duration,
};
use futures::{FutureExt, StreamExt};
use futures_concurrency::future::{FutureGroup, Race};
use serde::{Deserialize, Serialize};
use specta::Type;
use tokio::time::Instant;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
enum Kind {
Exif,
FFmpeg,
}
#[derive(Debug)]
pub struct MediaDataExtractor {
id: TaskId,
kind: Kind,
file_paths: Vec<file_path_for_media_processor::Data>,
location_id: location::id::Type,
location_path: Arc<PathBuf>,
stage: Stage,
db: Arc<PrismaClient>,
output: Output,
}
#[derive(Debug, Serialize, Deserialize)]
enum Stage {
Starting,
FetchedObjectsAlreadyWithMediaData(Vec<object::id::Type>),
ExtractingMediaData {
paths_by_id: HashMap<file_path::id::Type, (PathBuf, object::id::Type)>,
exif_media_datas: Vec<(ExifMetadata, object::id::Type)>,
ffmpeg_media_datas: Vec<(FFmpegMetadata, object::id::Type)>,
extract_ids_to_remove_from_map: Vec<file_path::id::Type>,
},
SaveMediaData {
exif_media_datas: Vec<(ExifMetadata, object::id::Type)>,
ffmpeg_media_datas: Vec<(FFmpegMetadata, object::id::Type)>,
},
}
impl MediaDataExtractor {
fn new(
kind: Kind,
file_paths: &[file_path_for_media_processor::Data],
location_id: location::id::Type,
location_path: Arc<PathBuf>,
db: Arc<PrismaClient>,
) -> Self {
let mut output = Output::default();
Self {
id: TaskId::new_v4(),
kind,
file_paths: file_paths
.iter()
.filter(|file_path| {
if file_path.object_id.is_some() {
true
} else {
output.errors.push(
media_processor::NonCriticalError::from(
NonCriticalError::FilePathMissingObjectId(file_path.id),
)
.into(),
);
false
}
})
.cloned()
.collect(),
location_id,
location_path,
stage: Stage::Starting,
db,
output,
}
}
#[must_use]
pub fn new_exif(
file_paths: &[file_path_for_media_processor::Data],
location_id: location::id::Type,
location_path: Arc<PathBuf>,
db: Arc<PrismaClient>,
) -> Self {
Self::new(Kind::Exif, file_paths, location_id, location_path, db)
}
#[must_use]
pub fn new_ffmpeg(
file_paths: &[file_path_for_media_processor::Data],
location_id: location::id::Type,
location_path: Arc<PathBuf>,
db: Arc<PrismaClient>,
) -> Self {
Self::new(Kind::FFmpeg, file_paths, location_id, location_path, db)
}
}
#[async_trait::async_trait]
impl Task<Error> for MediaDataExtractor {
fn id(&self) -> TaskId {
self.id
}
/// MediaDataExtractor never needs priority, as the data it generates is only accessed through
/// the media inspector, so it isn't latency sensitive like other tasks such as FileIdentifier or
/// the Thumbnailer
fn with_priority(&self) -> bool {
false
}
async fn run(&mut self, interrupter: &Interrupter) -> Result<ExecStatus, Error> {
loop {
match &mut self.stage {
Stage::Starting => {
let db_read_start = Instant::now();
let object_ids = fetch_objects_already_with_media_data(
self.kind,
&self.file_paths,
&self.db,
)
.await?;
self.output.db_read_time = db_read_start.elapsed();
self.stage = Stage::FetchedObjectsAlreadyWithMediaData(object_ids);
}
Stage::FetchedObjectsAlreadyWithMediaData(objects_already_with_media_data) => {
let filtering_start = Instant::now();
if self.file_paths.len() == objects_already_with_media_data.len() {
// All files already have media data, skipping
self.output.skipped = self.file_paths.len() as u64;
break;
}
let paths_by_id = filter_files_to_extract_media_data(
mem::take(objects_already_with_media_data),
self.location_id,
&self.location_path,
&mut self.file_paths,
&mut self.output,
);
self.output.filtering_time = filtering_start.elapsed();
self.stage = Stage::ExtractingMediaData {
extract_ids_to_remove_from_map: Vec::with_capacity(paths_by_id.len()),
exif_media_datas: if self.kind == Kind::Exif {
Vec::with_capacity(paths_by_id.len())
} else {
Vec::new()
},
ffmpeg_media_datas: if self.kind == Kind::FFmpeg {
Vec::with_capacity(paths_by_id.len())
} else {
Vec::new()
},
paths_by_id,
};
}
Stage::ExtractingMediaData {
paths_by_id,
exif_media_datas,
ffmpeg_media_datas,
extract_ids_to_remove_from_map,
} => {
{
// This inner scope is necessary to appease the mighty borrowck
let extraction_start = Instant::now();
for id in extract_ids_to_remove_from_map.drain(..) {
paths_by_id.remove(&id);
}
let mut futures = pin!(prepare_extraction_futures(
self.kind,
paths_by_id,
interrupter
));
while let Some(race_output) = futures.next().await {
match race_output {
InterruptRace::Processed(out) => {
process_output(
out,
exif_media_datas,
ffmpeg_media_datas,
extract_ids_to_remove_from_map,
&mut self.output,
);
}
InterruptRace::Interrupted(kind) => {
self.output.extraction_time += extraction_start.elapsed();
return Ok(match kind {
InterruptionKind::Pause => ExecStatus::Paused,
InterruptionKind::Cancel => ExecStatus::Canceled,
});
}
}
}
}
self.stage = Stage::SaveMediaData {
exif_media_datas: mem::take(exif_media_datas),
ffmpeg_media_datas: mem::take(ffmpeg_media_datas),
};
}
Stage::SaveMediaData {
exif_media_datas,
ffmpeg_media_datas,
} => {
let db_write_start = Instant::now();
self.output.extracted =
save(self.kind, exif_media_datas, ffmpeg_media_datas, &self.db).await?;
self.output.db_write_time = db_write_start.elapsed();
self.output.skipped += self.output.errors.len() as u64;
break;
}
}
check_interruption!(interrupter);
}
Ok(ExecStatus::Done(mem::take(&mut self.output).into_output()))
}
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalError {
#[error("failed to extract media data from <file='{}'>: {1}", .0.display())]
FailedToExtractImageMediaData(PathBuf, String),
#[error("file path missing object id: <file_path_id='{0}'>")]
FilePathMissingObjectId(file_path::id::Type),
#[error("failed to construct isolated file path data: <file_path_id='{0}'>: {1}")]
FailedToConstructIsolatedFilePathData(file_path::id::Type, String),
}
#[derive(Serialize, Deserialize, Default, Debug)]
pub struct Output {
pub extracted: u64,
pub skipped: u64,
pub db_read_time: Duration,
pub filtering_time: Duration,
pub extraction_time: Duration,
pub db_write_time: Duration,
pub errors: Vec<crate::NonCriticalError>,
}
#[derive(Debug, Serialize, Deserialize)]
struct SaveState {
id: TaskId,
kind: Kind,
file_paths: Vec<file_path_for_media_processor::Data>,
location_id: location::id::Type,
location_path: Arc<PathBuf>,
stage: Stage,
output: Output,
}
impl SerializableTask<Error> for MediaDataExtractor {
type SerializeError = rmp_serde::encode::Error;
type DeserializeError = rmp_serde::decode::Error;
type DeserializeCtx = Arc<PrismaClient>;
async fn serialize(self) -> Result<Vec<u8>, Self::SerializeError> {
let Self {
id,
kind,
file_paths,
location_id,
location_path,
stage,
output,
..
} = self;
rmp_serde::to_vec_named(&SaveState {
id,
kind,
file_paths,
location_id,
location_path,
stage,
output,
})
}
async fn deserialize(
data: &[u8],
db: Self::DeserializeCtx,
) -> Result<Self, Self::DeserializeError> {
rmp_serde::from_slice(data).map(
|SaveState {
id,
kind,
file_paths,
location_id,
location_path,
stage,
output,
}| Self {
id,
kind,
file_paths,
location_id,
location_path,
stage,
db,
output,
},
)
}
}
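// Illustrative sketch (not part of the diff) of the pause/resume roundtrip: on shutdown a
// task is serialized to bytes and later rebuilt with the database handle as its
// deserialization context before being re-dispatched.
#[allow(dead_code)]
async fn media_data_task_roundtrip(
	task: MediaDataExtractor,
	db: Arc<PrismaClient>,
) -> Result<MediaDataExtractor, rmp_serde::decode::Error> {
	let bytes = task
		.serialize()
		.await
		.expect("serializing an in-memory task is not expected to fail");
	<MediaDataExtractor as SerializableTask<Error>>::deserialize(&bytes, db).await
}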
#[inline]
async fn fetch_objects_already_with_media_data(
kind: Kind,
file_paths: &[file_path_for_media_processor::Data],
db: &PrismaClient,
) -> Result<Vec<object::id::Type>, media_processor::Error> {
let object_ids = file_paths
.iter()
.filter_map(|file_path| file_path.object_id)
.collect();
match kind {
Kind::Exif => db
.exif_data()
.find_many(vec![exif_data::object_id::in_vec(object_ids)])
.select(exif_data::select!({ object_id }))
.exec()
.await
.map(|object_ids| object_ids.into_iter().map(|data| data.object_id).collect())
.map_err(Into::into),
Kind::FFmpeg => db
.ffmpeg_data()
.find_many(vec![ffmpeg_data::object_id::in_vec(object_ids)])
.select(ffmpeg_data::select!({ object_id }))
.exec()
.await
.map(|object_ids| object_ids.into_iter().map(|data| data.object_id).collect())
.map_err(Into::into),
}
}
#[inline]
fn filter_files_to_extract_media_data(
objects_already_with_media_data: Vec<object::id::Type>,
location_id: location::id::Type,
location_path: &Path,
file_paths: &mut Vec<file_path_for_media_processor::Data>,
Output {
skipped, errors, ..
}: &mut Output,
) -> HashMap<file_path::id::Type, (PathBuf, object::id::Type)> {
let unique_objects_already_with_media_data = objects_already_with_media_data
.into_iter()
.collect::<HashSet<_>>();
*skipped = unique_objects_already_with_media_data.len() as u64;
file_paths.retain(|file_path| {
!unique_objects_already_with_media_data
.contains(&file_path.object_id.expect("already checked"))
});
file_paths
.iter()
.filter_map(|file_path| {
IsolatedFilePathData::try_from((location_id, file_path))
.map_err(|e| {
errors.push(
media_processor::NonCriticalError::from(
NonCriticalError::FailedToConstructIsolatedFilePathData(
file_path.id,
e.to_string(),
),
)
.into(),
);
})
.map(|iso_file_path| {
(
file_path.id,
(
location_path.join(iso_file_path),
file_path.object_id.expect("already checked"),
),
)
})
.ok()
})
.collect()
}
enum ExtractionOutputKind {
Exif(Result<Option<ExifMetadata>, media_processor::NonCriticalError>),
FFmpeg(Result<FFmpegMetadata, media_processor::NonCriticalError>),
}
struct ExtractionOutput {
file_path_id: file_path::id::Type,
object_id: object::id::Type,
kind: ExtractionOutputKind,
}
#[allow(clippy::large_enum_variant)]
/*
* NOTE(fogodev): Interrupts will be pretty rare, so paying the boxing price for
* the Processed variant just to avoid the enum size disparity between variants isn't worth it
*/
enum InterruptRace {
Interrupted(InterruptionKind),
Processed(ExtractionOutput),
}
#[inline]
fn prepare_extraction_futures<'a>(
kind: Kind,
paths_by_id: &'a HashMap<file_path::id::Type, (PathBuf, object::id::Type)>,
interrupter: &'a Interrupter,
) -> FutureGroup<impl Future<Output = InterruptRace> + 'a> {
paths_by_id
.iter()
.map(|(file_path_id, (path, object_id))| async move {
InterruptRace::Processed(ExtractionOutput {
file_path_id: *file_path_id,
object_id: *object_id,
kind: match kind {
Kind::Exif => ExtractionOutputKind::Exif(exif_media_data::extract(path).await),
Kind::FFmpeg => {
ExtractionOutputKind::FFmpeg(ffmpeg_media_data::extract(path).await)
}
},
})
})
.map(|fut| {
(
fut,
interrupter.into_future().map(InterruptRace::Interrupted),
)
.race()
})
.collect::<FutureGroup<_>>()
}
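// Minimal sketch (illustrative, not part of the diff) of the interruption pattern used above:
// each unit of work is raced against an interrupt signal so a pause/cancel request is observed
// mid-batch instead of only between batches. `tokio::sync::Notify` stands in for the task
// system's `Interrupter`; `Race` comes from the `futures_concurrency` import at the top of
// this module.
#[allow(dead_code)]
enum Outcome<T> {
	Done(T),
	Interrupted,
}

#[allow(dead_code)]
async fn run_interruptible<T>(
	work: impl Future<Output = T>,
	interrupt: Arc<tokio::sync::Notify>,
) -> Outcome<T> {
	(
		async { Outcome::Done(work.await) },
		async {
			interrupt.notified().await;
			Outcome::Interrupted
		},
	)
		.race()
		.await
}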
#[inline]
fn process_output(
ExtractionOutput {
file_path_id,
object_id,
kind,
}: ExtractionOutput,
exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type)>,
ffmpeg_media_datas: &mut Vec<(FFmpegMetadata, object::id::Type)>,
extract_ids_to_remove_from_map: &mut Vec<file_path::id::Type>,
output: &mut Output,
) {
match kind {
ExtractionOutputKind::Exif(Ok(Some(exif_data))) => {
exif_media_datas.push((exif_data, object_id));
}
ExtractionOutputKind::Exif(Ok(None)) => {
// No exif media data found
output.skipped += 1;
}
ExtractionOutputKind::FFmpeg(Ok(ffmpeg_data)) => {
ffmpeg_media_datas.push((ffmpeg_data, object_id));
}
ExtractionOutputKind::Exif(Err(e)) | ExtractionOutputKind::FFmpeg(Err(e)) => {
output.errors.push(e.into());
}
}
extract_ids_to_remove_from_map.push(file_path_id);
}
#[inline]
async fn save(
kind: Kind,
exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type)>,
ffmpeg_media_datas: &mut Vec<(FFmpegMetadata, object::id::Type)>,
db: &PrismaClient,
) -> Result<u64, media_processor::Error> {
match kind {
Kind::Exif => exif_media_data::save(mem::take(exif_media_datas), db).await,
Kind::FFmpeg => ffmpeg_media_data::save(mem::take(ffmpeg_media_datas), db).await,
}
}

View file

@ -0,0 +1,5 @@
pub mod media_data_extractor;
pub mod thumbnailer;
pub use media_data_extractor::MediaDataExtractor;
pub use thumbnailer::Thumbnailer;

View file

@ -0,0 +1,677 @@
//! The thumbnails directory has the following structure:
//! thumbnails/
//! ├── version.txt
//! ├── ephemeral/ # ephemeral thumbnails have their own directory
//! │ └── <`cas_id`>[0..3]/ # sharding
//! │ └── <`cas_id`>.webp
//! └── <`library_id`>/ # we segregate thumbnails by library
//! └── <`cas_id`>[0..3]/ # sharding
//! └── <`cas_id`>.webp
use crate::{
media_processor::{
self,
helpers::thumbnailer::{
can_generate_thumbnail_for_document, can_generate_thumbnail_for_image, get_shard_hex,
EPHEMERAL_DIR, TARGET_PX, TARGET_QUALITY, THUMBNAIL_GENERATION_TIMEOUT, WEBP_EXTENSION,
},
ThumbKey, ThumbnailKind,
},
Error,
};
use sd_core_file_path_helper::IsolatedFilePathData;
use sd_core_prisma_helpers::file_path_for_media_processor;
use sd_file_ext::extensions::{DocumentExtension, ImageExtension};
use sd_images::{format_image, scale_dimensions, ConvertibleExtension};
use sd_media_metadata::exif::Orientation;
use sd_prisma::prisma::{file_path, location};
use sd_task_system::{
ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput, SerializableTask, Task, TaskId,
};
use sd_utils::error::FileIOError;
use std::{
collections::HashMap,
fmt,
future::IntoFuture,
mem,
ops::Deref,
path::{Path, PathBuf},
pin::pin,
str::FromStr,
sync::Arc,
time::Duration,
};
use futures::{FutureExt, StreamExt};
use futures_concurrency::future::{FutureGroup, Race};
use image::{imageops, DynamicImage, GenericImageView};
use serde::{Deserialize, Serialize};
use specta::Type;
use tokio::{
fs, io,
task::spawn_blocking,
time::{sleep, Instant},
};
use tracing::{error, info, trace};
use uuid::Uuid;
use webp::Encoder;
#[derive(Debug, Serialize, Deserialize)]
pub struct GenerateThumbnailArgs {
pub extension: String,
pub cas_id: String,
pub path: PathBuf,
}
impl GenerateThumbnailArgs {
#[must_use]
pub const fn new(extension: String, cas_id: String, path: PathBuf) -> Self {
Self {
extension,
cas_id,
path,
}
}
}
pub type ThumbnailId = u32;
pub trait NewThumbnailReporter: Send + Sync + fmt::Debug + 'static {
fn new_thumbnail(&self, thumb_key: ThumbKey);
}
#[derive(Debug)]
pub struct Thumbnailer<Reporter: NewThumbnailReporter> {
id: TaskId,
reporter: Arc<Reporter>,
thumbs_kind: ThumbnailKind,
thumbnails_directory_path: Arc<PathBuf>,
thumbnails_to_generate: HashMap<ThumbnailId, GenerateThumbnailArgs>,
already_processed_ids: Vec<ThumbnailId>,
should_regenerate: bool,
with_priority: bool,
output: Output,
}
#[async_trait::async_trait]
impl<Reporter: NewThumbnailReporter> Task<Error> for Thumbnailer<Reporter> {
fn id(&self) -> TaskId {
self.id
}
fn with_priority(&self) -> bool {
self.with_priority
}
fn with_timeout(&self) -> Option<Duration> {
Some(Duration::from_secs(60 * 5)) // The entire task must not take more than 5 minutes
}
async fn run(&mut self, interrupter: &Interrupter) -> Result<ExecStatus, Error> {
enum InterruptRace {
Interrupted(InterruptionKind),
Processed(ThumbnailGenerationOutput),
}
let Self {
thumbs_kind,
thumbnails_directory_path,
thumbnails_to_generate,
already_processed_ids,
should_regenerate,
with_priority,
reporter,
output,
..
} = self;
// Removing already processed thumbnails from a possible previous run
already_processed_ids.drain(..).for_each(|id| {
thumbnails_to_generate.remove(&id);
});
let start = Instant::now();
let mut futures = pin!(thumbnails_to_generate
.iter()
.map(|(id, generate_args)| {
let path = &generate_args.path;
(
generate_thumbnail(
thumbnails_directory_path,
generate_args,
thumbs_kind,
*should_regenerate,
)
.map(|res| (*id, res)),
sleep(THUMBNAIL_GENERATION_TIMEOUT).map(|()| {
(
*id,
(
THUMBNAIL_GENERATION_TIMEOUT,
Err(NonCriticalError::ThumbnailGenerationTimeout(path.clone())),
),
)
}),
)
.race()
.map(InterruptRace::Processed)
})
.map(|fut| (
fut,
interrupter.into_future().map(InterruptRace::Interrupted)
)
.race())
.collect::<FutureGroup<_>>());
while let Some(race_output) = futures.next().await {
match race_output {
InterruptRace::Processed(out) => process_thumbnail_generation_output(
out,
*with_priority,
reporter.as_ref(),
already_processed_ids,
output,
),
InterruptRace::Interrupted(kind) => {
output.total_time += start.elapsed();
return Ok(match kind {
InterruptionKind::Pause => ExecStatus::Paused,
InterruptionKind::Cancel => ExecStatus::Canceled,
});
}
}
}
output.total_time += start.elapsed();
#[allow(clippy::cast_precision_loss)]
// SAFETY: we almost certainly won't have 2^52 thumbnails generated in a single task, so this
// cast won't have a precision loss issue
let total = (output.generated + output.skipped) as f64;
let mean_generation_time = output.mean_time_acc / total;
let generation_time_std_dev = Duration::from_secs_f64(
(mean_generation_time.mul_add(-mean_generation_time, output.std_dev_acc / total))
.sqrt(),
);
info!(
"{{generated: {generated}, skipped: {skipped}}} thumbnails; \
mean generation time: {mean_generation_time:?} ± {generation_time_std_dev:?}",
generated = output.generated,
skipped = output.skipped,
mean_generation_time = Duration::from_secs_f64(mean_generation_time)
);
Ok(ExecStatus::Done(mem::take(output).into_output()))
}
}
#[derive(Serialize, Deserialize, Default, Debug)]
pub struct Output {
pub generated: u64,
pub skipped: u64,
pub errors: Vec<crate::NonCriticalError>,
pub total_time: Duration,
pub mean_time_acc: f64,
pub std_dev_acc: f64,
}
#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)]
pub enum NonCriticalError {
#[error("file path <id='{0}'> has no cas_id")]
MissingCasId(file_path::id::Type),
#[error("failed to extract isolated file path data from file path <id='{0}'>: {1}")]
FailedToExtractIsolatedFilePathData(file_path::id::Type, String),
#[error("failed to generate video file thumbnail <path='{}'>: {1}", .0.display())]
VideoThumbnailGenerationFailed(PathBuf, String),
#[error("failed to format image <path='{}'>: {1}", .0.display())]
FormatImage(PathBuf, String),
#[error("failed to encode webp image <path='{}'>: {1}", .0.display())]
WebPEncoding(PathBuf, String),
#[error("processing thread panicked while generating thumbnail from <path='{}'>: {1}", .0.display())]
PanicWhileGeneratingThumbnail(PathBuf, String),
#[error("failed to create shard directory for thumbnail: {0}")]
CreateShardDirectory(String),
#[error("failed to save thumbnail <path='{}'>: {1}", .0.display())]
SaveThumbnail(PathBuf, String),
#[error("thumbnail generation timed out <path='{}'>", .0.display())]
ThumbnailGenerationTimeout(PathBuf),
}
impl<Reporter: NewThumbnailReporter> Thumbnailer<Reporter> {
fn new(
thumbs_kind: ThumbnailKind,
thumbnails_directory_path: Arc<PathBuf>,
thumbnails_to_generate: HashMap<ThumbnailId, GenerateThumbnailArgs>,
errors: Vec<crate::NonCriticalError>,
should_regenerate: bool,
with_priority: bool,
reporter: Arc<Reporter>,
) -> Self {
Self {
id: TaskId::new_v4(),
thumbs_kind,
thumbnails_directory_path,
already_processed_ids: Vec::with_capacity(thumbnails_to_generate.len()),
thumbnails_to_generate,
should_regenerate,
with_priority,
output: Output {
errors,
..Default::default()
},
reporter,
}
}
#[must_use]
pub fn new_ephemeral(
thumbnails_directory_path: Arc<PathBuf>,
thumbnails_to_generate: Vec<GenerateThumbnailArgs>,
reporter: Arc<Reporter>,
) -> Self {
Self::new(
ThumbnailKind::Ephemeral,
thumbnails_directory_path,
thumbnails_to_generate
.into_iter()
.enumerate()
.map(|(i, args)| {
#[allow(clippy::cast_possible_truncation)]
{
// SAFETY: it's fine, we will never process more than 4 billion thumbnails
// in a single task
(i as ThumbnailId, args)
}
})
.collect(),
Vec::new(),
false,
true,
reporter,
)
}
#[must_use]
pub fn new_indexed(
thumbnails_directory_path: Arc<PathBuf>,
file_paths: &[file_path_for_media_processor::Data],
(location_id, location_path): (location::id::Type, &Path),
library_id: Uuid,
should_regenerate: bool,
with_priority: bool,
reporter: Arc<Reporter>,
) -> Self {
let mut errors = Vec::new();
Self::new(
ThumbnailKind::Indexed(library_id),
thumbnails_directory_path,
file_paths
.iter()
.filter_map(|file_path| {
if let Some(cas_id) = file_path.cas_id.as_ref() {
let file_path_id = file_path.id;
IsolatedFilePathData::try_from((location_id, file_path))
.map_err(|e| {
errors.push(
media_processor::NonCriticalError::from(
NonCriticalError::FailedToExtractIsolatedFilePathData(
file_path_id,
e.to_string(),
),
)
.into(),
);
})
.ok()
.map(|iso_file_path| (file_path_id, cas_id, iso_file_path))
} else {
errors.push(
media_processor::NonCriticalError::from(
NonCriticalError::MissingCasId(file_path.id),
)
.into(),
);
None
}
})
.map(|(file_path_id, cas_id, iso_file_path)| {
let full_path = location_path.join(&iso_file_path);
#[allow(clippy::cast_sign_loss)]
{
(
// SAFETY: the database doesn't have negative ids
file_path_id as u32,
GenerateThumbnailArgs::new(
iso_file_path.extension().to_string(),
cas_id.clone(),
full_path,
),
)
}
})
.collect::<HashMap<_, _>>(),
errors,
should_regenerate,
with_priority,
reporter,
)
}
}
#[derive(Debug, Serialize, Deserialize)]
struct SaveState {
id: TaskId,
thumbs_kind: ThumbnailKind,
thumbnails_directory_path: Arc<PathBuf>,
thumbnails_to_generate: HashMap<ThumbnailId, GenerateThumbnailArgs>,
should_regenerate: bool,
with_priority: bool,
output: Output,
}
impl<Reporter: NewThumbnailReporter> SerializableTask<Error> for Thumbnailer<Reporter> {
type SerializeError = rmp_serde::encode::Error;
type DeserializeError = rmp_serde::decode::Error;
type DeserializeCtx = Arc<Reporter>;
async fn serialize(self) -> Result<Vec<u8>, Self::SerializeError> {
let Self {
id,
thumbs_kind,
thumbnails_directory_path,
mut thumbnails_to_generate,
already_processed_ids,
should_regenerate,
with_priority,
output,
..
} = self;
for id in already_processed_ids {
thumbnails_to_generate.remove(&id);
}
rmp_serde::to_vec_named(&SaveState {
id,
thumbs_kind,
thumbnails_directory_path,
thumbnails_to_generate,
should_regenerate,
with_priority,
output,
})
}
async fn deserialize(
data: &[u8],
reporter: Self::DeserializeCtx,
) -> Result<Self, Self::DeserializeError> {
rmp_serde::from_slice(data).map(
|SaveState {
id,
thumbs_kind,
thumbnails_to_generate,
thumbnails_directory_path,
should_regenerate,
with_priority,
output,
}| Self {
id,
reporter,
thumbs_kind,
thumbnails_to_generate,
thumbnails_directory_path,
already_processed_ids: Vec::new(),
should_regenerate,
with_priority,
output,
},
)
}
}
enum GenerationStatus {
Generated,
Skipped,
}
type ThumbnailGenerationOutput = (
ThumbnailId,
(
Duration,
Result<(ThumbKey, GenerationStatus), NonCriticalError>,
),
);
fn process_thumbnail_generation_output(
(id, (elapsed_time, res)): ThumbnailGenerationOutput,
with_priority: bool,
reporter: &impl NewThumbnailReporter,
already_processed_ids: &mut Vec<ThumbnailId>,
Output {
generated,
skipped,
errors,
mean_time_acc: mean_generation_time_accumulator,
std_dev_acc: std_dev_accumulator,
..
}: &mut Output,
) {
let elapsed_time = elapsed_time.as_secs_f64();
*mean_generation_time_accumulator += elapsed_time;
*std_dev_accumulator += elapsed_time * elapsed_time;
match res {
Ok((thumb_key, status)) => {
match status {
GenerationStatus::Generated => {
*generated += 1;
}
GenerationStatus::Skipped => {
*skipped += 1;
}
}
// This `if` is REALLY needed: due to the sheer throughput of the thumbnailer,
// we only send events for thumbnails in the currently opened directory.
// Sending events for the entire location becomes a huge bottleneck in the
// frontend, which doesn't even know what to do with thumbnails for inner
// directories.
// - fogodev
if with_priority {
reporter.new_thumbnail(thumb_key);
}
}
Err(e) => {
errors.push(media_processor::NonCriticalError::from(e).into());
*skipped += 1;
}
}
already_processed_ids.push(id);
}
async fn generate_thumbnail(
thumbnails_directory: &Path,
GenerateThumbnailArgs {
extension,
cas_id,
path,
}: &GenerateThumbnailArgs,
kind: &ThumbnailKind,
should_regenerate: bool,
) -> (
Duration,
Result<(ThumbKey, GenerationStatus), NonCriticalError>,
) {
trace!("Generating thumbnail for {}", path.display());
let start = Instant::now();
let mut output_path = match kind {
ThumbnailKind::Ephemeral => thumbnails_directory.join(EPHEMERAL_DIR),
ThumbnailKind::Indexed(library_id) => thumbnails_directory.join(library_id.to_string()),
};
output_path.push(get_shard_hex(cas_id));
output_path.push(cas_id);
output_path.set_extension(WEBP_EXTENSION);
if let Err(e) = fs::metadata(&*output_path).await {
if e.kind() != io::ErrorKind::NotFound {
error!(
"Failed to check if thumbnail exists, but we will try to generate it anyway: {e:#?}"
);
}
// Otherwise we're good: the thumbnail doesn't exist, so we can generate it
} else if !should_regenerate {
trace!(
"Skipping thumbnail generation for {} because it already exists",
path.display()
);
return (
start.elapsed(),
Ok((ThumbKey::new(cas_id, kind), GenerationStatus::Skipped)),
);
}
if let Ok(extension) = ImageExtension::from_str(extension) {
if can_generate_thumbnail_for_image(extension) {
if let Err(e) = generate_image_thumbnail(&path, &output_path).await {
return (start.elapsed(), Err(e));
}
}
} else if let Ok(extension) = DocumentExtension::from_str(extension) {
if can_generate_thumbnail_for_document(extension) {
if let Err(e) = generate_image_thumbnail(&path, &output_path).await {
return (start.elapsed(), Err(e));
}
}
}
#[cfg(feature = "ffmpeg")]
{
use crate::media_processor::helpers::thumbnailer::can_generate_thumbnail_for_video;
use sd_file_ext::extensions::VideoExtension;
if let Ok(extension) = VideoExtension::from_str(extension) {
if can_generate_thumbnail_for_video(extension) {
if let Err(e) = generate_video_thumbnail(&path, &output_path).await {
return (start.elapsed(), Err(e));
}
}
}
}
trace!("Generated thumbnail for {}", path.display());
(
start.elapsed(),
Ok((ThumbKey::new(cas_id, kind), GenerationStatus::Generated)),
)
}
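// Illustrative usage sketch (not part of the diff): generating a single ephemeral thumbnail
// outside of a task, e.g. for a quick preview. The extension, cas_id and paths are made up.
#[allow(dead_code)]
async fn preview_thumbnail_example(thumbnails_directory: &Path) {
	let args = GenerateThumbnailArgs::new(
		"png".to_string(),
		"abcdef0123".to_string(),
		PathBuf::from("/home/user/Pictures/photo.png"),
	);
	let (elapsed, res) = generate_thumbnail(
		thumbnails_directory,
		&args,
		&ThumbnailKind::Ephemeral,
		false,
	)
	.await;
	trace!(
		"Preview thumbnail finished in {elapsed:?}, succeeded: {}",
		res.is_ok()
	);
}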
async fn generate_image_thumbnail(
file_path: impl AsRef<Path> + Send,
output_path: impl AsRef<Path> + Send,
) -> Result<(), NonCriticalError> {
let file_path = file_path.as_ref().to_path_buf();
let webp = spawn_blocking({
let file_path = file_path.clone();
move || -> Result<_, NonCriticalError> {
let mut img = format_image(&file_path)
.map_err(|e| NonCriticalError::FormatImage(file_path.clone(), e.to_string()))?;
let (w, h) = img.dimensions();
#[allow(clippy::cast_precision_loss)]
let (w_scaled, h_scaled) = scale_dimensions(w as f32, h as f32, TARGET_PX);
// Optionally, resize the existing photo and convert back into DynamicImage
if w != w_scaled && h != h_scaled {
img = DynamicImage::ImageRgba8(imageops::resize(
&img,
w_scaled,
h_scaled,
imageops::FilterType::Triangle,
));
}
// This corrects the rotation/flip of the image based on the *available* EXIF data;
// not all images have EXIF data, so we don't error. We also don't rotate HEIF, as that's against the spec.
if let Some(orientation) = Orientation::from_path(&file_path) {
if ConvertibleExtension::try_from(file_path.as_ref())
.expect("we already checked if the image was convertible")
.should_rotate()
{
img = orientation.correct_thumbnail(img);
}
}
// Create the WebP encoder for the above image
let encoder = Encoder::from_image(&img)
.map_err(|reason| NonCriticalError::WebPEncoding(file_path, reason.to_string()))?;
// The `WebPMemory` type is `!Send`, which would make the `Future` in this function `!Send`,
// so we `deref` to get a `&[u8]` and then `to_owned` it into a `Vec<u8>`,
// which implies an unwanted clone...
Ok(encoder.encode(TARGET_QUALITY).deref().to_owned())
}
})
.await
.map_err(|e| {
NonCriticalError::PanicWhileGeneratingThumbnail(file_path.clone(), e.to_string())
})??;
let output_path = output_path.as_ref();
if let Some(shard_dir) = output_path.parent() {
fs::create_dir_all(shard_dir).await.map_err(|e| {
NonCriticalError::CreateShardDirectory(FileIOError::from((shard_dir, e)).to_string())
})?;
} else {
error!(
"Failed to get parent directory of '{}' for sharding parent directory",
output_path.display()
);
}
fs::write(output_path, &webp).await.map_err(|e| {
NonCriticalError::SaveThumbnail(file_path, FileIOError::from((output_path, e)).to_string())
})
}
#[cfg(feature = "ffmpeg")]
async fn generate_video_thumbnail(
file_path: impl AsRef<Path> + Send,
output_path: impl AsRef<Path> + Send,
) -> Result<(), NonCriticalError> {
use sd_ffmpeg::{to_thumbnail, ThumbnailSize};
let file_path = file_path.as_ref();
to_thumbnail(
file_path,
output_path,
ThumbnailSize::Scale(1024),
TARGET_QUALITY,
)
.await
.map_err(|e| {
NonCriticalError::VideoThumbnailGenerationFailed(file_path.to_path_buf(), e.to_string())
})
}

View file

@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
use prisma_client_rust::QueryError;
#[derive(thiserror::Error, Debug)]
pub enum SubPathError {
pub enum Error {
#[error("received sub path not in database: <path='{}'>", .0.display())]
SubPathNotFound(Box<Path>),
@ -22,10 +22,10 @@ pub enum SubPathError {
IsoFilePath(#[from] FilePathError),
}
impl From<SubPathError> for rspc::Error {
fn from(err: SubPathError) -> Self {
impl From<Error> for rspc::Error {
fn from(err: Error) -> Self {
match err {
SubPathError::SubPathNotFound(_) => {
Error::SubPathNotFound(_) => {
Self::with_cause(ErrorCode::NotFound, err.to_string(), err)
}
@ -39,7 +39,7 @@ pub async fn get_full_path_from_sub_path(
sub_path: &Option<impl AsRef<Path> + Send + Sync>,
location_path: impl AsRef<Path> + Send,
db: &PrismaClient,
) -> Result<PathBuf, SubPathError> {
) -> Result<PathBuf, Error> {
let location_path = location_path.as_ref();
match sub_path {
@ -53,7 +53,7 @@ pub async fn get_full_path_from_sub_path(
sub_path,
&IsolatedFilePathData::new(location_id, location_path, &full_path, true)?,
db,
SubPathError::SubPathNotFound,
Error::SubPathNotFound,
)
.await?;
@ -68,7 +68,7 @@ pub async fn maybe_get_iso_file_path_from_sub_path(
sub_path: &Option<impl AsRef<Path> + Send + Sync>,
location_path: impl AsRef<Path> + Send,
db: &PrismaClient,
) -> Result<Option<IsolatedFilePathData<'static>>, SubPathError> {
) -> Result<Option<IsolatedFilePathData<'static>>, Error> {
let location_path = location_path.as_ref();
match sub_path {
@ -79,14 +79,9 @@ pub async fn maybe_get_iso_file_path_from_sub_path(
let sub_iso_file_path =
IsolatedFilePathData::new(location_id, location_path, &full_path, true)?;
ensure_file_path_exists(
sub_path,
&sub_iso_file_path,
db,
SubPathError::SubPathNotFound,
)
.await
.map(|()| Some(sub_iso_file_path))
ensure_file_path_exists(sub_path, &sub_iso_file_path, db, Error::SubPathNotFound)
.await
.map(|()| Some(sub_iso_file_path))
}
_ => Ok(None),
}

View file

@ -11,6 +11,8 @@ pub enum SystemError {
TaskAborted(TaskId),
#[error("task join error <task_id='{0}'>")]
TaskJoin(TaskId),
#[error("task timeout error <task_id='{0}'>")]
TaskTimeout(TaskId),
#[error("forced abortion for task <task_id='{0}'> timed out")]
TaskForcedAbortTimeout(TaskId),
}

View file

@ -7,6 +7,7 @@ use std::{
Arc,
},
task::{Context, Poll},
time::Duration,
};
use async_channel as chan;
@ -141,6 +142,13 @@ pub trait Task<E: RunError>: fmt::Debug + Downcast + Send + Sync + 'static {
false
}
/// Here we define whether the task system should shut down our task if it takes too long to finish. By default the
/// task system waits indefinitely for the task to finish, but if the user wants a timeout, they can
/// return a [`Duration`] here and the task system will cancel the task if it takes longer than the specified time.
fn with_timeout(&self) -> Option<Duration> {
None
}
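// Illustrative sketch (not part of the diff) of a downstream task opting into the new
// timeout support; the type, error and duration are made up, and everything except
// `with_timeout` is elided:
//
// #[async_trait::async_trait]
// impl Task<MyError> for SlowNetworkTask {
//     fn id(&self) -> TaskId { self.id }
//
//     // Ask the task system to cancel this task if a single run exceeds 30 seconds.
//     fn with_timeout(&self) -> Option<Duration> {
//         Some(Duration::from_secs(30))
//     }
//
//     async fn run(&mut self, interrupter: &Interrupter) -> Result<ExecStatus, MyError> {
//         /* ... */
//     }
// }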
/// This method represents the work that should be done by the worker; it will be called by the
/// worker when there is a slot available in its internal queue.
/// We receive a `&mut self` so any internal data can be mutated on each `run` invocation.

View file

@ -10,13 +10,13 @@ use std::{
};
use async_channel as chan;
use futures::StreamExt;
use futures::{FutureExt, StreamExt};
use futures_concurrency::future::Race;
use tokio::{
spawn,
sync::oneshot,
task::{JoinError, JoinHandle},
time::{timeout, Instant},
time::{sleep, timeout, Instant},
};
use tracing::{debug, error, trace, warn};
@ -1165,11 +1165,26 @@ fn handle_run_task_attempt<E: RunError>(
(task, Err(SystemError::TaskAborted(task_id)))
} else {
let res = task.run(&interrupter).await;
let run_result = if let Some(timeout_duration) = task.with_timeout() {
(task.run(&interrupter).map(Ok), async move {
sleep(timeout_duration)
.map(|()| Err(SystemError::TaskTimeout(task_id)))
.await
})
.race()
.await
} else {
task.run(&interrupter).map(Ok).await
};
trace!("Ran task: <worker_id='{worker_id}', task_id='{task_id}'>: {res:?}");
match run_result {
Ok(res) => {
trace!("Ran task: <worker_id='{worker_id}', task_id='{task_id}'>: {res:?}");
(task, Ok(res))
(task, Ok(res))
}
Err(e) => (task, Err(e)),
}
}
}
})