diff --git a/.vscode/launch.json b/.vscode/launch.json index 411d7f5a1..df88b7c03 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,6 +11,7 @@ "cargo": { "args": [ "build", + "--profile=dev-debug", "--manifest-path=./apps/desktop/src-tauri/Cargo.toml", "--no-default-features" ], diff --git a/Cargo.lock b/Cargo.lock index a02a9959a..45ba63376 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,7 +282,7 @@ dependencies = [ "proc-macro2", "quote", "syn 1.0.109", - "synstructure", + "synstructure 0.12.6", ] [[package]] @@ -383,9 +383,9 @@ dependencies = [ [[package]] name = "async-signal" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "329972aa325176e89114919f2a80fdae4f4c040f66a370b1a1159c6c0f94e7aa" +checksum = "794f185324c2f00e771cd9f1ae8b5ac68be2ca7abb129a87afd6e86d228bc54d" dependencies = [ "async-io", "async-lock", @@ -826,9 +826,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.5.5" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0d3965f6417a92a6d1009c5958a67042f57e46342afb37ca58f9ad26744ec73" +checksum = "8508de54f34b8feca6638466c2bd2de9d1df5bf79c578de9a649b72d644006b3" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -840,6 +840,7 @@ dependencies = [ "http 0.2.12", "http-body 0.4.6", "http-body 1.0.0", + "httparse", "hyper 0.14.29", "hyper-rustls 0.24.2", "once_cell", @@ -852,9 +853,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.6.2" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4179bd8a1c943e1aceb46c5b9fc014a561bd6c35a2153e816ba29076ee49d245" +checksum = "aa6dbabc7629fab4e4467f95f119c2e1a9b00b44c893affa98e23b040a0e2567" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -869,9 +870,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.10" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b6764ba7e1c5ede1c9f9e4046645534f06c2581402461c559b481a420330a83" +checksum = "cfe321a6b21f5d8eabd0ade9c55d3d0335f3c3157fc2b3e87f05f34b539e4df5" dependencies = [ "base64-simd", "bytes", @@ -1306,7 +1307,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" dependencies = [ "memchr", - "regex-automata 0.4.6", + "regex-automata 0.4.7", "serde", ] @@ -1618,9 +1619,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.6" +version = "4.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9689a29b593160de5bc4aacab7b5d54fb52231de70122626c178e6a368994c7" +checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f" dependencies = [ "clap_builder", "clap_derive", @@ -1628,9 +1629,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.6" +version = "4.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5387378c84f6faa26890ebf9f0a92989f8873d4d380467bcd0d8d8620424df" +checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f" dependencies = [ "anstream", "anstyle", @@ -2749,7 +2750,7 @@ dependencies = [ "proc-macro2", "quote", "syn 1.0.109", - "synstructure", + "synstructure 0.12.6", ] [[package]] @@ -3015,6 +3016,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-buffered" +version = "0.2.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "02dcae03ee5afa5ea17b1aebc793806b8ddfc6dc500e0b8e8e1eb30b9dad22c0" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", +] + [[package]] name = "futures-channel" version = "0.3.30" @@ -3027,11 +3039,12 @@ dependencies = [ [[package]] name = "futures-concurrency" -version = "7.6.0" +version = "7.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51ee14e256b9143bfafbf2fddeede6f396650bacf95d06fc1b3f2b503df129a0" +checksum = "4b14ac911e85d57c5ea6eef76d7b4d4a3177ecd15f4bea2e61927e9e3823e19f" dependencies = [ "bitvec", + "futures-buffered", "futures-core", "futures-lite 1.13.0", "pin-project", @@ -3542,8 +3555,8 @@ dependencies = [ "aho-corasick 1.1.3", "bstr", "log", - "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-automata 0.4.7", + "regex-syntax 0.8.4", "serde", ] @@ -4009,12 +4022,12 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - "futures-core", + "futures-util", "http 1.1.0", "http-body 1.0.0", "pin-project-lite", @@ -4223,6 +4236,124 @@ dependencies = [ "objc2", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -4241,12 +4372,14 @@ dependencies = [ [[package]] name = "idna" -version = "0.5.0" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", + "smallvec", + "utf8_iter", ] [[package]] @@ -4512,9 +4645,9 @@ checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" [[package]] name = "iter_tools" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55f9f40b3308a2367d5201430790786748b3e038982317dd880677c0f7b3f3f0" +checksum = "f85582248e8796b1d7146eabe9f70c5b9de4db16bf934ca893581d33c66403b6" dependencies = [ "itertools 0.11.0", ] @@ -5417,6 +5550,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "litrs" version = "0.4.1" @@ -8087,14 +8226,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.4" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick 1.1.3", "memchr", - "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-automata 0.4.7", + "regex-syntax 0.8.4", ] [[package]] @@ -8108,20 +8247,20 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick 1.1.3", "memchr", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] name = "regex-lite" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" @@ -8131,9 +8270,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = 
"regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "renderdoc-sys" @@ -8864,6 +9003,7 @@ dependencies = [ "sd-p2p-tunnel", "sd-prisma", "sd-sync", + "sd-task-system", "sd-utils", "serde", "serde-hashkey", @@ -8984,7 +9124,10 @@ version = "0.1.0" dependencies = [ "prisma-client-rust", "sd-prisma", + "sd-utils", "serde", + "specta", + "uuid", ] [[package]] @@ -9329,6 +9472,7 @@ dependencies = [ "tokio", "tokio-stream", "tracing", + "tracing-subscriber", "tracing-test", "uuid", ] @@ -10227,6 +10371,17 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "sys-locale" version = "0.3.1" @@ -10851,6 +11006,16 @@ dependencies = [ "strict-num", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -11516,12 +11681,12 @@ dependencies = [ [[package]] name = "url" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56" dependencies = [ "form_urlencoded", - "idna 0.5.0", + "idna 1.0.0", "percent-encoding", "serde", ] @@ -11603,6 +11768,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + [[package]] name = "utf16string" version = "0.2.0" @@ -11612,6 +11783,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -12521,6 +12698,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "wry" version = "0.39.5" @@ -12635,12 +12824,12 @@ dependencies = [ [[package]] name = "xdg-home" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e5a325c3cb8398ad6cf859c1135b25dd29e186679cf2da7581d9679f63b38e" +checksum = "ca91dcf8f93db085f3a0a29358cd0b9d670915468f4290e8b85d118a34211ab8" dependencies = [ "libc", - "winapi", + "windows-sys 0.52.0", ] [[package]] @@ -12710,6 
+12899,30 @@ dependencies = [ "time", ] +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", + "synstructure 0.13.1", +] + [[package]] name = "zbus" version = "4.0.1" @@ -12789,6 +13002,27 @@ dependencies = [ "syn 2.0.66", ] +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", + "synstructure 0.13.1", +] + [[package]] name = "zeroize" version = "1.8.1" @@ -12809,6 +13043,28 @@ dependencies = [ "syn 2.0.66", ] +[[package]] +name = "zerovec" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "zip" version = "0.6.6" diff --git a/Cargo.toml b/Cargo.toml index 6261b8ab7..349d571ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,6 +114,17 @@ lto = false codegen-units = 256 incremental = true +[profile.dev-debug] +inherits = "dev" +# Enables debugger +split-debuginfo = "none" +opt-level = 0 +debug = "full" +strip = "none" +lto = "off" +codegen-units = 256 +incremental = true + # Set the settings for build scripts and proc-macros. [profile.dev.build-override] opt-level = 3 @@ -123,6 +134,13 @@ opt-level = 3 opt-level = 3 incremental = false +# Set the default for dependencies, except workspace members. 
+[profile.dev-debug.package."*"] +inherits = "dev" +opt-level = 3 +debug = "full" +incremental = false + # Optimize release builds [profile.release] panic = "abort" # Strip expensive panic clean-up logic diff --git a/apps/desktop/src-tauri/src/main.rs b/apps/desktop/src-tauri/src/main.rs index 8543fcb71..3b8d617ae 100644 --- a/apps/desktop/src-tauri/src/main.rs +++ b/apps/desktop/src-tauri/src/main.rs @@ -32,7 +32,7 @@ async fn app_ready(app_handle: AppHandle) { #[tauri::command(async)] #[specta::specta] -// If this erorrs, we don't have FDA and we need to re-prompt for it +// If this errors, we don't have FDA and we need to re-prompt for it async fn request_fda_macos() { DiskAccess::request_fda().expect("Unable to request full disk access"); } diff --git a/apps/desktop/src/platform.ts b/apps/desktop/src/platform.ts index 8956eff29..288e887df 100644 --- a/apps/desktop/src/platform.ts +++ b/apps/desktop/src/platform.ts @@ -45,9 +45,11 @@ function constructServerUrl(urlSuffix: string) { export const platform = { platform: 'tauri', - getThumbnailUrlByThumbKey: (keyParts) => + getThumbnailUrlByThumbKey: (thumbKey) => constructServerUrl( - `/thumbnail/${keyParts.map((i) => encodeURIComponent(i)).join('/')}.webp` + `/thumbnail/${encodeURIComponent( + thumbKey.base_directory_str + )}/${encodeURIComponent(thumbKey.shard_hex)}/${encodeURIComponent(thumbKey.cas_id)}.webp` ), getFileUrl: (libraryId, locationLocalId, filePathId) => constructServerUrl(`/file/${libraryId}/${locationLocalId}/${filePathId}`), diff --git a/apps/mobile/modules/sd-core/core/src/lib.rs b/apps/mobile/modules/sd-core/core/src/lib.rs index 0f5851bb4..575447ca4 100644 --- a/apps/mobile/modules/sd-core/core/src/lib.rs +++ b/apps/mobile/modules/sd-core/core/src/lib.rs @@ -76,8 +76,8 @@ pub fn handle_core_msg( let new_node = match Node::new(data_dir, sd_core::Env::new(CLIENT_ID)).await { Ok(node) => node, - Err(err) => { - error!("failed to initialise node: {}", err); + Err(e) => { + error!(?e, "Failed to initialize node;"); callback(Err(query)); return; } @@ -94,8 +94,8 @@ pub fn handle_core_msg( false => from_value::(v).map(|v| vec![v]), }) { Ok(v) => v, - Err(err) => { - error!("failed to decode JSON-RPC request: {}", err); // Don't use tracing here because it's before the `Node` is initialised which sets that config! 
+ Err(e) => { + error!(?e, "Failed to decode JSON-RPC request;"); callback(Err(query)); return; } @@ -133,8 +133,8 @@ pub fn spawn_core_event_listener(callback: impl Fn(String) + Send + 'static) { while let Some(event) = rx.next().await { let data = match to_string(&event) { Ok(json) => json, - Err(err) => { - error!("Failed to serialize event: {err}"); + Err(e) => { + error!(?e, "Failed to serialize event;"); continue; } }; diff --git a/apps/mobile/src/components/explorer/FileThumb.tsx b/apps/mobile/src/components/explorer/FileThumb.tsx index f991e0df3..897cc19d4 100644 --- a/apps/mobile/src/components/explorer/FileThumb.tsx +++ b/apps/mobile/src/components/explorer/FileThumb.tsx @@ -1,24 +1,25 @@ import { DocumentDirectoryPath } from '@dr.pogodin/react-native-fs'; import { getIcon } from '@sd/assets/util'; +import { Image } from 'expo-image'; +import { useEffect, useLayoutEffect, useMemo, useState, type PropsWithChildren } from 'react'; +import { View } from 'react-native'; import { getExplorerItemData, getItemFilePath, getItemLocation, isDarkTheme, + ThumbKey, type ExplorerItem } from '@sd/client'; -import { Image } from 'expo-image'; -import { useEffect, useLayoutEffect, useMemo, useState, type PropsWithChildren } from 'react'; -import { View } from 'react-native'; import { flattenThumbnailKey, useExplorerStore } from '~/stores/explorerStore'; import { tw } from '../../lib/tailwind'; // NOTE: `file://` is required for Android to load local files! -export const getThumbnailUrlByThumbKey = (thumbKey: string[]) => { - return `file://${DocumentDirectoryPath}/thumbnails/${thumbKey - .map((i) => encodeURIComponent(i)) - .join('/')}.webp`; +export const getThumbnailUrlByThumbKey = (thumbKey: ThumbKey) => { + return `file://${DocumentDirectoryPath}/thumbnails/${encodeURIComponent( + thumbKey.base_directory_str + )}/${encodeURIComponent(thumbKey.shard_hex)}/${encodeURIComponent(thumbKey.cas_id)}.webp`; }; const FileThumbWrapper = ({ children, size = 1 }: PropsWithChildren<{ size: number }>) => ( diff --git a/apps/mobile/src/components/job/Job.tsx b/apps/mobile/src/components/job/Job.tsx index a9ccdac90..c94f8e623 100644 --- a/apps/mobile/src/components/job/Job.tsx +++ b/apps/mobile/src/components/job/Job.tsx @@ -1,4 +1,3 @@ -import { JobProgressEvent, JobReport, useJobInfo } from '@sd/client'; import { Copy, Fingerprint, @@ -11,13 +10,14 @@ import { } from 'phosphor-react-native'; import { memo } from 'react'; import { View, ViewStyle } from 'react-native'; +import { JobProgressEvent, Report, useJobInfo } from '@sd/client'; import { tw, twStyle } from '~/lib/tailwind'; import { ProgressBar } from '../animation/ProgressBar'; import JobContainer from './JobContainer'; type JobProps = { - job: JobReport; + job: Report; isChild?: boolean; containerStyle?: ViewStyle; progress: JobProgressEvent | null; diff --git a/apps/mobile/src/components/job/JobGroup.tsx b/apps/mobile/src/components/job/JobGroup.tsx index 060b4e729..c0077add4 100644 --- a/apps/mobile/src/components/job/JobGroup.tsx +++ b/apps/mobile/src/components/job/JobGroup.tsx @@ -1,19 +1,19 @@ import { Folder } from '@sd/assets/icons'; -import { - getJobNiceActionName, - getTotalTasks, - JobGroup, - JobProgressEvent, - JobReport, - useLibraryMutation, - useRspcLibraryContext, - useTotalElapsedTimeText -} from '@sd/client'; import dayjs from 'dayjs'; import { DotsThreeVertical, Eye, Pause, Play, Stop, Trash } from 'phosphor-react-native'; import { SetStateAction, useMemo, useState } from 'react'; import { Animated, Pressable, View } from 
'react-native'; import { Swipeable } from 'react-native-gesture-handler'; +import { + getJobNiceActionName, + getTotalTasks, + JobGroup, + JobProgressEvent, + Report, + useLibraryMutation, + useRspcLibraryContext, + useTotalElapsedTimeText +} from '@sd/client'; import { tw, twStyle } from '~/lib/tailwind'; import { AnimatedHeight } from '../animation/layout'; @@ -64,7 +64,12 @@ export default function ({ group, progress }: JobGroupProps) { { transform: [{ translateX: translate }] } ]} > - + ); }; @@ -169,22 +174,20 @@ const toastErrorSuccess = ( }; interface OptionsProps { - activeJob?: JobReport; + activeJob?: Report; group: JobGroup; showChildJobs: boolean; - setShowChildJobs: React.Dispatch> + setShowChildJobs: React.Dispatch>; } function Options({ activeJob, group, setShowChildJobs, showChildJobs }: OptionsProps) { - const rspc = useRspcLibraryContext(); - const clearJob = useLibraryMutation( - ['jobs.clear'], { - onSuccess: () => { - rspc.queryClient.invalidateQueries(['jobs.reports']); - } - }) + const clearJob = useLibraryMutation(['jobs.clear'], { + onSuccess: () => { + rspc.queryClient.invalidateQueries(['jobs.reports']); + } + }); const resumeJob = useLibraryMutation( ['jobs.resume'], @@ -208,8 +211,7 @@ function Options({ activeJob, group, setShowChildJobs, showChildJobs }: OptionsP group.jobs.forEach((job) => { clearJob.mutate(job.id); //only one toast for all jobs - if (job.id === group.id) - toast.success('Job has been removed'); + if (job.id === group.id) toast.success('Job has been removed'); }); }; @@ -217,35 +219,68 @@ function Options({ activeJob, group, setShowChildJobs, showChildJobs }: OptionsP <> {/* Resume */} {(group.status === 'Queued' || group.status === 'Paused' || isJobPaused) && ( - )} {/* TODO: This should remove the job from panel */} - {!activeJob !== undefined ? ( - - - - } - > - setShowChildJobs(!showChildJobs)} - text="Expand" icon={Eye}/> - - - ) : ( + {activeJob !== undefined ? 
( - - + ) : ( + + + + } + > + setShowChildJobs(!showChildJobs)} + text="Expand" + icon={Eye} + /> + + )} ); diff --git a/apps/mobile/src/stores/explorerStore.ts b/apps/mobile/src/stores/explorerStore.ts index ca9bc9efa..a5019fb39 100644 --- a/apps/mobile/src/stores/explorerStore.ts +++ b/apps/mobile/src/stores/explorerStore.ts @@ -1,4 +1,4 @@ -import { resetStore } from '@sd/client'; +import { ThumbKey, resetStore } from '@sd/client'; import { proxy, useSnapshot } from 'valtio'; import { proxySet } from 'valtio/utils'; @@ -26,14 +26,14 @@ const state = { orderDirection: 'Asc' as 'Asc' | 'Desc' }; -export function flattenThumbnailKey(thumbKey: string[]) { - return thumbKey.join('/'); +export function flattenThumbnailKey(thumbKey: ThumbKey) { + return `${thumbKey.base_directory_str}/${thumbKey.shard_hex}/${thumbKey.cas_id}`; } const store = proxy({ ...state, reset: () => resetStore(store, state), - addNewThumbnail: (thumbKey: string[]) => { + addNewThumbnail: (thumbKey: ThumbKey) => { store.newThumbnails.add(flattenThumbnailKey(thumbKey)); }, // this should be done when the explorer query is refreshed diff --git a/apps/web/src/App.tsx b/apps/web/src/App.tsx index 1619416a8..6da60257d 100644 --- a/apps/web/src/App.tsx +++ b/apps/web/src/App.tsx @@ -42,8 +42,10 @@ const spacedriveURL = (() => { const platform: Platform = { platform: 'web', - getThumbnailUrlByThumbKey: (keyParts) => - `${spacedriveURL}/thumbnail/${keyParts.map((i) => encodeURIComponent(i)).join('/')}.webp`, + getThumbnailUrlByThumbKey: (thumbKey) => + `${spacedriveURL}/thumbnail/${encodeURIComponent( + thumbKey.base_directory_str + )}/${encodeURIComponent(thumbKey.shard_hex)}/${encodeURIComponent(thumbKey.cas_id)}.webp`, getFileUrl: (libraryId, locationLocalId, filePathId) => `${spacedriveURL}/file/${encodeURIComponent(libraryId)}/${encodeURIComponent( locationLocalId diff --git a/core/Cargo.toml b/core/Cargo.toml index a777bedeb..dd90499ed 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -47,6 +47,7 @@ sd-p2p-proto = { path = "../crates/p2p/crates/proto" } sd-p2p-tunnel = { path = "../crates/p2p/crates/tunnel" } sd-prisma = { path = "../crates/prisma" } sd-sync = { path = "../crates/sync" } +sd-task-system = { path = "../crates/task-system" } sd-utils = { path = "../crates/utils" } # Workspace dependencies diff --git a/core/crates/heavy-lifting/src/file_identifier/cas_id.rs b/core/crates/heavy-lifting/src/file_identifier/cas_id.rs index 5ad5a9456..7240c3483 100644 --- a/core/crates/heavy-lifting/src/file_identifier/cas_id.rs +++ b/core/crates/heavy-lifting/src/file_identifier/cas_id.rs @@ -1,3 +1,5 @@ +use sd_core_prisma_helpers::CasId; + use std::path::Path; use blake3::Hasher; @@ -6,6 +8,7 @@ use tokio::{ fs::{self, File}, io::{self, AsyncReadExt, AsyncSeekExt, SeekFrom}, }; +use tracing::{instrument, trace, Level}; const SAMPLE_COUNT: u64 = 4; const SAMPLE_SIZE: u64 = 1024 * 10; @@ -20,20 +23,29 @@ const_assert!((HEADER_OR_FOOTER_SIZE * 2 + SAMPLE_COUNT * SAMPLE_SIZE) < MINIMUM // Asserting that the sample size is larger than header/footer size, as the same buffer is used for both const_assert!(SAMPLE_SIZE > HEADER_OR_FOOTER_SIZE); +#[instrument( + skip(path), + ret(level = Level::TRACE), + err, + fields(path = %path.as_ref().display() +))] // SAFETY: Casts here are safe, they're hardcoded values we have some const assertions above to make sure they're correct #[allow(clippy::cast_possible_truncation)] #[allow(clippy::cast_possible_wrap)] pub async fn generate_cas_id( path: impl AsRef + Send, size: u64, -) -> Result { 
+) -> Result, io::Error> { let mut hasher = Hasher::new(); hasher.update(&size.to_le_bytes()); if size <= MINIMUM_FILE_SIZE { + trace!("File is small, hashing the whole file"); // For small files, we hash the whole file hasher.update(&fs::read(path).await?); } else { + trace!("File bigger than threshold, hashing samples"); + let mut file = File::open(path).await?; let mut buf = vec![0; SAMPLE_SIZE as usize].into_boxed_slice(); @@ -64,5 +76,5 @@ pub async fn generate_cas_id( hasher.update(&buf[..HEADER_OR_FOOTER_SIZE as usize]); } - Ok(hasher.finalize().to_hex()[..16].to_string()) + Ok(hasher.finalize().to_hex()[..16].to_string().into()) } diff --git a/core/crates/heavy-lifting/src/file_identifier/job.rs b/core/crates/heavy-lifting/src/file_identifier/job.rs index 8ae358dee..7e7a99726 100644 --- a/core/crates/heavy-lifting/src/file_identifier/job.rs +++ b/core/crates/heavy-lifting/src/file_identifier/job.rs @@ -4,24 +4,26 @@ use crate::{ job::{Job, JobReturn, JobTaskDispatcher, ReturnStatus}, report::ReportOutputMetadata, utils::cancel_pending_tasks, - SerializableJob, SerializedTasks, + DispatcherError, JobErrorOrDispatcherError, SerializableJob, SerializedTasks, }, utils::sub_path::maybe_get_iso_file_path_from_sub_path, - Error, JobName, LocationScanState, NonCriticalError, OuterContext, ProgressUpdate, UpdateEvent, + Error, JobContext, JobName, LocationScanState, NonCriticalError, OuterContext, ProgressUpdate, + UpdateEvent, }; use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_file_identifier; +use sd_core_prisma_helpers::{file_path_for_file_identifier, CasId}; use sd_prisma::prisma::{file_path, location, SortOrder}; use sd_task_system::{ AnyTaskOutput, IntoTask, SerializableTask, Task, TaskDispatcher, TaskHandle, TaskId, TaskOutput, TaskStatus, }; -use sd_utils::db::maybe_missing; +use sd_utils::{db::maybe_missing, u64_to_frontend}; use std::{ collections::{HashMap, HashSet}, + fmt, hash::{Hash, Hasher}, mem, path::PathBuf, @@ -34,28 +36,67 @@ use futures_concurrency::future::TryJoin; use serde::{Deserialize, Serialize}; use serde_json::json; use tokio::time::Instant; -use tracing::warn; +use tracing::{debug, instrument, trace, warn, Level}; use super::{ - orphan_path_filters_deep, orphan_path_filters_shallow, - tasks::{ - extract_file_metadata, object_processor, ExtractFileMetadataTask, ObjectProcessorTask, - }, + accumulate_file_paths_by_cas_id, dispatch_object_processor_tasks, orphan_path_filters_deep, + orphan_path_filters_shallow, + tasks::{self, identifier, object_processor, FilePathToCreateOrLinkObject}, CHUNK_SIZE, }; +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +enum Phase { + SearchingOrphansWithPriority, + SearchingOrphans, + IdentifyingFiles, + ProcessingObjects, +} + +impl Default for Phase { + fn default() -> Self { + Self::SearchingOrphansWithPriority + } +} + +impl fmt::Display for Phase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::SearchingOrphans | Self::SearchingOrphansWithPriority => { + write!(f, "searching_orphans") + } + Self::IdentifyingFiles => write!(f, "identifying_files"), + Self::ProcessingObjects => write!(f, "processing_objects"), + } + } +} + +impl From for String { + fn from(phase: Phase) -> Self { + phase.to_string() + } +} + #[derive(Debug)] pub struct FileIdentifier { + // Received arguments location: Arc, location_path: Arc, sub_path: Option, + // Inner state + file_paths_accumulator: HashMap, Vec>, + file_paths_ids_with_priority: HashSet, + 
last_orphan_file_path_id: Option, + + // Job control + phase: Phase, + + // Run data metadata: Metadata, - - priority_tasks_ids: HashSet, - errors: Vec, + // On shutdown data pending_tasks_on_resume: Vec>, tasks_for_shutdown: Vec>>, } @@ -72,29 +113,27 @@ impl Hash for FileIdentifier { impl Job for FileIdentifier { const NAME: JobName = JobName::FileIdentifier; - async fn resume_tasks( + async fn resume_tasks( &mut self, dispatcher: &JobTaskDispatcher, - ctx: &impl OuterContext, + ctx: &impl JobContext, SerializedTasks(serialized_tasks): SerializedTasks, ) -> Result<(), Error> { - self.pending_tasks_on_resume = dispatcher + if let Ok(tasks) = dispatcher .dispatch_many_boxed( rmp_serde::from_slice::)>>(&serialized_tasks) .map_err(file_identifier::Error::from)? .into_iter() .map(|(task_kind, task_bytes)| async move { match task_kind { - TaskKind::ExtractFileMetadata => { - >::deserialize( - &task_bytes, - (), - ) - .await - .map(IntoTask::into_task) - } + TaskKind::Identifier => tasks::Identifier::deserialize( + &task_bytes, + (Arc::clone(ctx.db()), Arc::clone(ctx.sync())), + ) + .await + .map(IntoTask::into_task), - TaskKind::ObjectProcessor => ObjectProcessorTask::deserialize( + TaskKind::ObjectProcessor => tasks::ObjectProcessor::deserialize( &task_bytes, (Arc::clone(ctx.db()), Arc::clone(ctx.sync())), ) @@ -107,34 +146,76 @@ impl Job for FileIdentifier { .await .map_err(file_identifier::Error::from)?, ) - .await; + .await + { + self.pending_tasks_on_resume = tasks; + } else { + warn!("Failed to dispatch tasks to resume as job was already canceled"); + } Ok(()) } - async fn run( + #[instrument( + skip_all, + fields( + location_id = self.location.id, + location_path = %self.location_path.display(), + sub_path = ?self.sub_path.as_ref().map(|path| path.display()), + ), + ret(level = Level::TRACE), + err, + )] + async fn run( mut self, dispatcher: JobTaskDispatcher, - ctx: Ctx, + ctx: impl JobContext, ) -> Result { let mut pending_running_tasks = FuturesUnordered::new(); - self.init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) - .await?; + match self + .init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) + .await + { + Ok(()) => { /* Everything is awesome! 
*/ } + Err(JobErrorOrDispatcherError::JobError(e)) => { + return Err(e.into()); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(_))) => { + return Ok(self.cancel_job(&mut pending_running_tasks).await); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown(tasks))) => { + self.tasks_for_shutdown.extend(tasks); + + if pending_running_tasks.is_empty() { + // If no task managed to be dispatched, we can just shutdown + // otherwise we have to process handles below and wait for them to be shutdown too + return Ok(ReturnStatus::Shutdown( + SerializableJob::::serialize(self).await, + )); + } + } + } while let Some(task) = pending_running_tasks.next().await { match task { Ok(TaskStatus::Done((task_id, TaskOutput::Out(out)))) => { - if let Some(new_object_processor_task) = self + match self .process_task_output(task_id, out, &ctx, &dispatcher) .await { - pending_running_tasks.push(new_object_processor_task); - }; + Ok(tasks) => pending_running_tasks.extend(tasks), + Err(DispatcherError::JobCanceled(_)) => { + return Ok(self.cancel_job(&mut pending_running_tasks).await); + } + Err(DispatcherError::Shutdown(tasks)) => { + self.tasks_for_shutdown.extend(tasks); + } + } } Ok(TaskStatus::Done((task_id, TaskOutput::Empty))) => { - warn!("Task returned an empty output"); + warn!(%task_id, "Task returned an empty output"); } Ok(TaskStatus::Shutdown(task)) => { @@ -142,19 +223,17 @@ impl Job for FileIdentifier { } Ok(TaskStatus::Error(e)) => { - cancel_pending_tasks(&pending_running_tasks).await; + cancel_pending_tasks(&mut pending_running_tasks).await; return Err(e); } Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion) => { - cancel_pending_tasks(&pending_running_tasks).await; - - return Ok(ReturnStatus::Canceled); + return Ok(self.cancel_job(&mut pending_running_tasks).await); } Err(e) => { - cancel_pending_tasks(&pending_running_tasks).await; + cancel_pending_tasks(&mut pending_running_tasks).await; return Err(e.into()); } @@ -163,7 +242,7 @@ impl Job for FileIdentifier { if !self.tasks_for_shutdown.is_empty() { return Ok(ReturnStatus::Shutdown( - SerializableJob::::serialize(self).await, + SerializableJob::::serialize(self).await, )); } @@ -207,70 +286,152 @@ impl FileIdentifier { .map(Arc::new)?, location: Arc::new(location), sub_path, + file_paths_accumulator: HashMap::new(), + file_paths_ids_with_priority: HashSet::new(), + last_orphan_file_path_id: None, + phase: Phase::default(), metadata: Metadata::default(), - priority_tasks_ids: HashSet::new(), errors: Vec::new(), pending_tasks_on_resume: Vec::new(), tasks_for_shutdown: Vec::new(), }) } - async fn init_or_resume( + #[allow(clippy::too_many_lines)] + async fn init_or_resume( &mut self, pending_running_tasks: &mut FuturesUnordered>, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Result<(), file_identifier::Error> { + ) -> Result<(), JobErrorOrDispatcherError> { // if we don't have any pending task, then this is a fresh job - if self.pending_tasks_on_resume.is_empty() { - let db = ctx.db(); - let maybe_sub_iso_file_path = maybe_get_iso_file_path_from_sub_path( + let db = ctx.db(); + let maybe_sub_iso_file_path = + maybe_get_iso_file_path_from_sub_path::( self.location.id, - &self.sub_path, + self.sub_path.as_ref(), &*self.location_path, db, ) .await?; - let mut last_orphan_file_path_id = None; + let start = Instant::now(); - let start = Instant::now(); + let location_root_iso_file_path = IsolatedFilePathData::new( + self.location.id, + 
&*self.location_path, + &*self.location_path, + true, + ) + .map_err(file_identifier::Error::from)?; - let location_root_iso_file_path = IsolatedFilePathData::new( - self.location.id, - &*self.location_path, - &*self.location_path, - true, - ) - .map_err(file_identifier::Error::from)?; + if self.pending_tasks_on_resume.is_empty() { + ctx.progress([ProgressUpdate::phase(self.phase)]).await; // First we dispatch some shallow priority tasks to quickly identify orphans in the location // root directory or in the desired sub-path - let file_paths_already_identifying = self - .dispatch_priority_identifier_tasks( - &mut last_orphan_file_path_id, - maybe_sub_iso_file_path - .as_ref() - .unwrap_or(&location_root_iso_file_path), - ctx, - dispatcher, - pending_running_tasks, - ) - .await?; + self.dispatch_priority_identifier_tasks( + maybe_sub_iso_file_path + .as_ref() + .unwrap_or(&location_root_iso_file_path), + ctx, + dispatcher, + pending_running_tasks, + ) + .await?; + + self.phase = Phase::SearchingOrphans; + // Resetting the last orphan file path id for deep search + self.last_orphan_file_path_id = None; self.dispatch_deep_identifier_tasks( - &mut last_orphan_file_path_id, &maybe_sub_iso_file_path, ctx, dispatcher, pending_running_tasks, - &file_paths_already_identifying, ) .await?; + self.last_orphan_file_path_id = None; + self.phase = Phase::IdentifyingFiles; + + ctx.progress(vec![ + ProgressUpdate::TaskCount(u64::from(self.metadata.total_identifier_tasks)), + ProgressUpdate::phase(self.phase), + ProgressUpdate::Message(format!( + "{} files to be identified", + self.metadata.total_found_orphans + )), + ]) + .await; + self.metadata.seeking_orphans_time = start.elapsed(); } else { pending_running_tasks.extend(mem::take(&mut self.pending_tasks_on_resume)); + + // For these 2 phases, we need to keep dispatching tasks until we have no more orphans to identify + // as we could have receive a shutdown command before being able to run through all orphans + match self.phase { + Phase::SearchingOrphansWithPriority => { + self.dispatch_priority_identifier_tasks( + maybe_sub_iso_file_path + .as_ref() + .unwrap_or(&location_root_iso_file_path), + ctx, + dispatcher, + pending_running_tasks, + ) + .await?; + + self.phase = Phase::SearchingOrphans; + // Resetting the last orphan file path id for deep search + self.last_orphan_file_path_id = None; + + self.dispatch_deep_identifier_tasks( + &maybe_sub_iso_file_path, + ctx, + dispatcher, + pending_running_tasks, + ) + .await?; + + self.last_orphan_file_path_id = None; + self.phase = Phase::IdentifyingFiles; + } + + Phase::SearchingOrphans => { + self.dispatch_deep_identifier_tasks( + &maybe_sub_iso_file_path, + ctx, + dispatcher, + pending_running_tasks, + ) + .await?; + + self.last_orphan_file_path_id = None; + self.phase = Phase::IdentifyingFiles; + } + + _ => {} + } + + ctx.progress(vec![ + ProgressUpdate::TaskCount(if matches!(self.phase, Phase::IdentifyingFiles) { + u64::from(self.metadata.total_identifier_tasks) + } else { + u64::from(self.metadata.total_object_processor_tasks) + }), + ProgressUpdate::phase(self.phase), + ProgressUpdate::Message(format!( + "{} files to be identified", + self.metadata.total_found_orphans + )), + ]) + .await; + debug!( + resuming_tasks_count = self.pending_tasks_on_resume.len(), + "Resuming tasks for FileIdentifier job;", + ); } Ok(()) @@ -281,19 +442,19 @@ impl FileIdentifier { /// # Panics /// Will panic if another task type is added in the job, but this function wasn't updated to handle it /// - async fn 
process_task_output( + async fn process_task_output( &mut self, task_id: TaskId, any_task_output: Box, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Option> { - if any_task_output.is::() { + ) -> Result>, DispatcherError> { + if any_task_output.is::() { return self - .process_extract_file_metadata_output( + .process_identifier_output( task_id, *any_task_output - .downcast::() + .downcast::() .expect("just checked"), ctx, dispatcher, @@ -306,186 +467,276 @@ impl FileIdentifier { .downcast::() .expect("just checked"), ctx, - ); + ) + .await; } else { unreachable!("Unexpected task output type: "); } - None + Ok(vec![]) } - async fn process_extract_file_metadata_output( + #[instrument( + skip_all, + fields( + %task_id, + ?extract_metadata_time, + ?save_db_time, + created_objects_count, + total_identified_files, + errors_count = errors.len() + ) + )] + async fn process_identifier_output( &mut self, task_id: TaskId, - extract_file_metadata::Output { - identified_files, + identifier::Output { + file_path_ids_with_new_object, + file_paths_by_cas_id, extract_metadata_time, + save_db_time, + created_objects_count, + total_identified_files, errors, - }: extract_file_metadata::Output, - ctx: &impl OuterContext, + }: identifier::Output, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Option> { - self.metadata.extract_metadata_time += extract_metadata_time; - self.errors.extend(errors); + ) -> Result>, DispatcherError> { + self.metadata.mean_extract_metadata_time += extract_metadata_time; + self.metadata.mean_save_db_time_on_identifier_tasks += save_db_time; + self.metadata.total_identified_files += total_identified_files; + self.metadata.created_objects_count += created_objects_count; - if identified_files.is_empty() { - self.metadata.completed_tasks += 1; + let file_paths_with_new_object_to_report = file_path_ids_with_new_object + .into_iter() + .filter_map(|id| self.file_paths_ids_with_priority.take(&id)) + .collect::>(); - ctx.progress(vec![ProgressUpdate::CompletedTaskCount( - self.metadata.completed_tasks, - )]); + if !file_paths_with_new_object_to_report.is_empty() { + ctx.report_update(UpdateEvent::NewIdentifiedObjects { + file_path_ids: file_paths_with_new_object_to_report, + }); + } - None - } else { - ctx.progress_msg(format!("Identified {} files", identified_files.len())); + if !errors.is_empty() { + warn!(?errors, "Non critical errors while extracting metadata;"); + self.errors.extend(errors); + } - let with_priority = self.priority_tasks_ids.remove(&task_id); + accumulate_file_paths_by_cas_id(file_paths_by_cas_id, &mut self.file_paths_accumulator); - let task = dispatcher - .dispatch(ObjectProcessorTask::new( - identified_files, - Arc::clone(ctx.db()), - Arc::clone(ctx.sync()), - with_priority, - )) - .await; + self.metadata.completed_identifier_tasks += 1; - if with_priority { - self.priority_tasks_ids.insert(task.task_id()); + ctx.progress(vec![ + ProgressUpdate::CompletedTaskCount(u64::from(self.metadata.completed_identifier_tasks)), + ProgressUpdate::Message(format!( + "Identified {} of {} files", + self.metadata.total_identified_files, self.metadata.total_found_orphans + )), + ]) + .await; + + debug!( + "Processed ({}/{}) identifier tasks, took: {extract_metadata_time:?};", + self.metadata.completed_identifier_tasks, self.metadata.total_identifier_tasks, + ); + + // If we completed all identifier tasks, then we dispatch the object processor tasks + if self.metadata.completed_identifier_tasks == 
self.metadata.total_identifier_tasks { + self.phase = Phase::ProcessingObjects; + let (tasks_count, res) = match dispatch_object_processor_tasks( + self.file_paths_accumulator.drain(), + ctx, + dispatcher, + false, + ) + .await + { + Ok(task_handles) => (task_handles.len(), Ok(task_handles)), + Err(DispatcherError::Shutdown(tasks)) => { + (tasks.len(), Err(DispatcherError::Shutdown(tasks))) + } + Err(e) => return Err(e), + }; + + #[allow(clippy::cast_possible_truncation)] + { + // SAFETY: we know that `tasks.len()` is a valid u32 as we wouldn't dispatch more than `u32::MAX` tasks + self.metadata.total_object_processor_tasks = tasks_count as u32; } - Some(task) + ctx.progress(vec![ + ProgressUpdate::TaskCount(u64::from(self.metadata.total_object_processor_tasks)), + ProgressUpdate::CompletedTaskCount(0), + ProgressUpdate::phase(self.phase), + ]) + .await; + + res + } else { + Ok(vec![]) } } - fn process_object_processor_output( + #[instrument(skip(self, file_path_ids_with_new_object, ctx))] + async fn process_object_processor_output( &mut self, task_id: TaskId, object_processor::Output { file_path_ids_with_new_object, - assign_cas_ids_time, fetch_existing_objects_time, assign_to_existing_object_time, create_object_time, created_objects_count, linked_objects_count, }: object_processor::Output, - ctx: &impl OuterContext, + ctx: &impl JobContext, ) { - self.metadata.assign_cas_ids_time += assign_cas_ids_time; - self.metadata.fetch_existing_objects_time += fetch_existing_objects_time; - self.metadata.assign_to_existing_object_time += assign_to_existing_object_time; - self.metadata.create_object_time += create_object_time; + self.metadata.mean_fetch_existing_objects_time += fetch_existing_objects_time; + self.metadata.mean_assign_to_existing_object_time += assign_to_existing_object_time; + self.metadata.mean_create_object_time += create_object_time; self.metadata.created_objects_count += created_objects_count; self.metadata.linked_objects_count += linked_objects_count; - self.metadata.completed_tasks += 1; + self.metadata.completed_object_processor_tasks += 1; ctx.progress(vec![ - ProgressUpdate::CompletedTaskCount(self.metadata.completed_tasks), + ProgressUpdate::CompletedTaskCount(u64::from( + self.metadata.completed_object_processor_tasks, + )), ProgressUpdate::Message(format!( "Processed {} of {} objects", self.metadata.created_objects_count + self.metadata.linked_objects_count, self.metadata.total_found_orphans )), - ]); + ]) + .await; - if self.priority_tasks_ids.remove(&task_id) { + let file_paths_with_new_object_to_report = file_path_ids_with_new_object + .into_iter() + .filter_map(|id| self.file_paths_ids_with_priority.take(&id)) + .collect::>(); + + if !file_paths_with_new_object_to_report.is_empty() { ctx.report_update(UpdateEvent::NewIdentifiedObjects { - file_path_ids: file_path_ids_with_new_object, + file_path_ids: file_paths_with_new_object_to_report, }); } + + debug!( + "Processed ({}/{}) object processor tasks, took: {:?};", + self.metadata.completed_object_processor_tasks, + self.metadata.total_object_processor_tasks, + fetch_existing_objects_time + assign_to_existing_object_time + create_object_time, + ); } - async fn dispatch_priority_identifier_tasks( + async fn dispatch_priority_identifier_tasks( &mut self, - last_orphan_file_path_id: &mut Option, sub_iso_file_path: &IsolatedFilePathData<'static>, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, pending_running_tasks: &FuturesUnordered>, - ) -> Result, file_identifier::Error> { + ) -> 
Result<(), JobErrorOrDispatcherError> { let db = ctx.db(); - let mut file_paths_already_identifying = HashSet::new(); - loop { + let start = Instant::now(); + #[allow(clippy::cast_possible_wrap)] // SAFETY: we know that CHUNK_SIZE is a valid i64 let orphan_paths = db .file_path() .find_many(orphan_path_filters_shallow( self.location.id, - *last_orphan_file_path_id, + self.last_orphan_file_path_id, sub_iso_file_path, )) .order_by(file_path::id::order(SortOrder::Asc)) .take(CHUNK_SIZE as i64) .select(file_path_for_file_identifier::select()) .exec() - .await?; + .await + .map_err(file_identifier::Error::from)?; + + trace!(orphans_count = orphan_paths.len(), "Found orphan paths;"); if orphan_paths.is_empty() { break; } - file_paths_already_identifying.extend(orphan_paths.iter().map(|path| path.id)); + self.file_paths_ids_with_priority.extend( + orphan_paths + .iter() + .map(|file_path_for_file_identifier::Data { id, .. }| *id), + ); self.metadata.total_found_orphans += orphan_paths.len() as u64; - *last_orphan_file_path_id = + self.last_orphan_file_path_id = Some(orphan_paths.last().expect("orphan_paths is not empty").id); + self.metadata.total_identifier_tasks += 1; + ctx.progress(vec![ - ProgressUpdate::TaskCount(self.metadata.total_found_orphans), + ProgressUpdate::TaskCount(u64::from(self.metadata.total_identifier_tasks)), ProgressUpdate::Message(format!( - "{} files to be identified", + "Found {} orphan paths", self.metadata.total_found_orphans )), - ]); + ]) + .await; - let priority_task = dispatcher - .dispatch(ExtractFileMetadataTask::new( - Arc::clone(&self.location), - Arc::clone(&self.location_path), - orphan_paths, - true, - )) - .await; + debug!( + "Dispatched ({}/{}) identifier tasks, took: {:?};", + self.metadata.completed_identifier_tasks, + self.metadata.total_identifier_tasks, + start.elapsed(), + ); - self.priority_tasks_ids.insert(priority_task.task_id()); - - pending_running_tasks.push(priority_task); + pending_running_tasks.push( + dispatcher + .dispatch(tasks::Identifier::new( + Arc::clone(&self.location), + Arc::clone(&self.location_path), + orphan_paths, + true, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + )) + .await?, + ); } - Ok(file_paths_already_identifying) + Ok(()) } - async fn dispatch_deep_identifier_tasks( + async fn dispatch_deep_identifier_tasks( &mut self, - last_orphan_file_path_id: &mut Option, maybe_sub_iso_file_path: &Option>, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, pending_running_tasks: &FuturesUnordered>, - file_paths_already_identifying: &HashSet, - ) -> Result<(), file_identifier::Error> { + ) -> Result<(), JobErrorOrDispatcherError> { let db = ctx.db(); loop { + let start = Instant::now(); + #[allow(clippy::cast_possible_wrap)] // SAFETY: we know that CHUNK_SIZE is a valid i64 let mut orphan_paths = db .file_path() .find_many(orphan_path_filters_deep( self.location.id, - *last_orphan_file_path_id, + self.last_orphan_file_path_id, maybe_sub_iso_file_path, )) .order_by(file_path::id::order(SortOrder::Asc)) .take(CHUNK_SIZE as i64) .select(file_path_for_file_identifier::select()) .exec() - .await?; + .await + .map_err(file_identifier::Error::from)?; // No other orphans to identify, we can break the loop if orphan_paths.is_empty() { @@ -493,10 +744,12 @@ impl FileIdentifier { } // We grab the last id to use as a starting point for the next iteration, in case we skip this one - *last_orphan_file_path_id = + self.last_orphan_file_path_id = Some(orphan_paths.last().expect("orphan_paths is not 
empty").id); - orphan_paths.retain(|path| !file_paths_already_identifying.contains(&path.id)); + orphan_paths.retain(|file_path_for_file_identifier::Data { id, .. }| { + !self.file_paths_ids_with_priority.contains(id) + }); // If we don't have any new orphan paths after filtering out, we can skip this iteration if orphan_paths.is_empty() { @@ -505,33 +758,59 @@ impl FileIdentifier { self.metadata.total_found_orphans += orphan_paths.len() as u64; + self.metadata.total_identifier_tasks += 1; + ctx.progress(vec![ - ProgressUpdate::TaskCount(self.metadata.total_found_orphans), + ProgressUpdate::TaskCount(u64::from(self.metadata.total_identifier_tasks)), ProgressUpdate::Message(format!( - "{} files to be identified", + "Found {} orphan paths", self.metadata.total_found_orphans )), - ]); + ]) + .await; + + debug!( + "Dispatched ({}/{}) identifier tasks, took: {:?};", + self.metadata.completed_identifier_tasks, + self.metadata.total_identifier_tasks, + start.elapsed(), + ); pending_running_tasks.push( dispatcher - .dispatch(ExtractFileMetadataTask::new( + .dispatch(tasks::Identifier::new( Arc::clone(&self.location), Arc::clone(&self.location_path), orphan_paths, false, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), )) - .await, + .await?, ); } Ok(()) } + + async fn cancel_job( + &mut self, + pending_running_tasks: &mut FuturesUnordered>, + ) -> ReturnStatus { + cancel_pending_tasks(pending_running_tasks).await; + + ReturnStatus::Canceled( + JobReturn::builder() + .with_metadata(mem::take(&mut self.metadata)) + .with_non_critical_errors(mem::take(&mut self.errors)) + .build(), + ) + } } #[derive(Debug, Clone, Copy, Serialize, Deserialize)] enum TaskKind { - ExtractFileMetadata, + Identifier, ObjectProcessor, } @@ -541,9 +820,12 @@ struct SaveState { location_path: Arc, sub_path: Option, - metadata: Metadata, + file_paths_accumulator: HashMap, Vec>, + file_paths_ids_with_priority: HashSet, + last_orphan_file_path_id: Option, - priority_tasks_ids: HashSet, + phase: Phase, + metadata: Metadata, errors: Vec, @@ -552,119 +834,178 @@ struct SaveState { #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct Metadata { - extract_metadata_time: Duration, - assign_cas_ids_time: Duration, - fetch_existing_objects_time: Duration, - assign_to_existing_object_time: Duration, - create_object_time: Duration, + mean_extract_metadata_time: Duration, + mean_save_db_time_on_identifier_tasks: Duration, + mean_fetch_existing_objects_time: Duration, + mean_assign_to_existing_object_time: Duration, + mean_create_object_time: Duration, seeking_orphans_time: Duration, total_found_orphans: u64, + total_identified_files: u64, created_objects_count: u64, linked_objects_count: u64, - completed_tasks: u64, + total_identifier_tasks: u32, + completed_identifier_tasks: u32, + total_object_processor_tasks: u32, + completed_object_processor_tasks: u32, } -impl From for ReportOutputMetadata { - fn from(value: Metadata) -> Self { - Self::Metrics(HashMap::from([ - ( - "extract_metadata_time".into(), - json!(value.extract_metadata_time), - ), - ( - "assign_cas_ids_time".into(), - json!(value.assign_cas_ids_time), - ), - ( - "fetch_existing_objects_time".into(), - json!(value.fetch_existing_objects_time), - ), - ( - "assign_to_existing_object_time".into(), - json!(value.assign_to_existing_object_time), - ), - ("create_object_time".into(), json!(value.create_object_time)), - ( - "seeking_orphans_time".into(), - json!(value.seeking_orphans_time), - ), - ( - "total_found_orphans".into(), - json!(value.total_found_orphans), - 
), - ( - "created_objects_count".into(), - json!(value.created_objects_count), - ), - ( - "linked_objects_count".into(), - json!(value.linked_objects_count), - ), - ("total_tasks".into(), json!(value.completed_tasks)), - ])) +impl From for Vec { + fn from( + Metadata { + mut mean_extract_metadata_time, + mut mean_save_db_time_on_identifier_tasks, + mut mean_fetch_existing_objects_time, + mut mean_assign_to_existing_object_time, + mut mean_create_object_time, + seeking_orphans_time, + total_found_orphans, + total_identified_files, + created_objects_count, + linked_objects_count, + total_identifier_tasks, + completed_identifier_tasks, + total_object_processor_tasks, + completed_object_processor_tasks, + }: Metadata, + ) -> Self { + // To avoid division by zero + mean_extract_metadata_time /= u32::max(total_identifier_tasks, 1); + mean_save_db_time_on_identifier_tasks /= u32::max(total_identifier_tasks, 1); + + mean_fetch_existing_objects_time /= u32::max(total_object_processor_tasks, 1); + mean_assign_to_existing_object_time /= u32::max(total_object_processor_tasks, 1); + mean_create_object_time /= u32::max(total_object_processor_tasks, 1); + + vec![ + ReportOutputMetadata::FileIdentifier { + total_orphan_paths: u64_to_frontend(total_found_orphans), + total_objects_created: u64_to_frontend(created_objects_count), + total_objects_linked: u64_to_frontend(linked_objects_count), + }, + ReportOutputMetadata::Metrics(HashMap::from([ + ( + "mean_extract_metadata_time".into(), + json!(mean_extract_metadata_time), + ), + ( + "mean_save_db_time_on_identifier_tasks".into(), + json!(mean_save_db_time_on_identifier_tasks), + ), + ( + "mean_fetch_existing_objects_time".into(), + json!(mean_fetch_existing_objects_time), + ), + ( + "mean_assign_to_existing_object_time".into(), + json!(mean_assign_to_existing_object_time), + ), + ( + "mean_create_object_time".into(), + json!(mean_create_object_time), + ), + ("seeking_orphans_time".into(), json!(seeking_orphans_time)), + ("total_found_orphans".into(), json!(total_found_orphans)), + ( + "total_identified_files".into(), + json!(total_identified_files), + ), + ("created_objects_count".into(), json!(created_objects_count)), + ("linked_objects_count".into(), json!(linked_objects_count)), + ( + "total_identifier_tasks".into(), + json!(total_identifier_tasks), + ), + ( + "completed_identifier_tasks".into(), + json!(completed_identifier_tasks), + ), + ( + "total_object_processor_tasks".into(), + json!(total_object_processor_tasks), + ), + ( + "completed_object_processor_tasks".into(), + json!(completed_object_processor_tasks), + ), + ])), + ] } } -impl SerializableJob for FileIdentifier { +impl SerializableJob for FileIdentifier { async fn serialize(self) -> Result>, rmp_serde::encode::Error> { let Self { location, location_path, sub_path, + file_paths_accumulator, + file_paths_ids_with_priority, + last_orphan_file_path_id, + phase, metadata, - priority_tasks_ids, errors, tasks_for_shutdown, .. 
} = self; + let serialized_tasks = tasks_for_shutdown + .into_iter() + .map(|task| async move { + if task.is::() { + SerializableTask::serialize( + *task.downcast::().expect("just checked"), + ) + .await + .map(|bytes| (TaskKind::Identifier, bytes)) + } else if task.is::() { + task.downcast::() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::ObjectProcessor, bytes)) + } else { + unreachable!("Unexpected task type") + } + }) + .collect::>() + .try_join() + .await?; + + let tasks_for_shutdown_bytes = if serialized_tasks.is_empty() { + None + } else { + Some(SerializedTasks(rmp_serde::to_vec_named(&serialized_tasks)?)) + }; + rmp_serde::to_vec_named(&SaveState { location, location_path, sub_path, + file_paths_accumulator, + file_paths_ids_with_priority, + last_orphan_file_path_id, + phase, metadata, - priority_tasks_ids, - tasks_for_shutdown_bytes: Some(SerializedTasks(rmp_serde::to_vec_named( - &tasks_for_shutdown - .into_iter() - .map(|task| async move { - if task.is::() { - SerializableTask::serialize( - *task - .downcast::() - .expect("just checked"), - ) - .await - .map(|bytes| (TaskKind::ExtractFileMetadata, bytes)) - } else if task.is::() { - task.downcast::() - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::ObjectProcessor, bytes)) - } else { - unreachable!("Unexpected task type") - } - }) - .collect::>() - .try_join() - .await?, - )?)), errors, + tasks_for_shutdown_bytes, }) .map(Some) } async fn deserialize( serialized_job: &[u8], - _: &Ctx, + _: &OuterCtx, ) -> Result)>, rmp_serde::decode::Error> { let SaveState { location, location_path, sub_path, + file_paths_accumulator, + file_paths_ids_with_priority, + last_orphan_file_path_id, + phase, metadata, - priority_tasks_ids, errors, tasks_for_shutdown_bytes, } = rmp_serde::from_slice::(serialized_job)?; @@ -674,8 +1015,11 @@ impl SerializableJob for FileIdentifier { location, location_path, sub_path, + file_paths_accumulator, + file_paths_ids_with_priority, + last_orphan_file_path_id, + phase, metadata, - priority_tasks_ids, errors, pending_tasks_on_resume: Vec::new(), tasks_for_shutdown: Vec::new(), diff --git a/core/crates/heavy-lifting/src/file_identifier/mod.rs b/core/crates/heavy-lifting/src/file_identifier/mod.rs index b25e08578..e27d560b7 100644 --- a/core/crates/heavy-lifting/src/file_identifier/mod.rs +++ b/core/crates/heavy-lifting/src/file_identifier/mod.rs @@ -1,12 +1,20 @@ -use crate::utils::sub_path; +use crate::{utils::sub_path, OuterContext}; use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData}; +use sd_core_prisma_helpers::CasId; use sd_file_ext::{extensions::Extension, kind::ObjectKind}; use sd_prisma::prisma::{file_path, location}; +use sd_task_system::{TaskDispatcher, TaskHandle}; use sd_utils::{db::MissingFieldError, error::FileIOError}; -use std::{fs::Metadata, path::Path}; +use std::{ + collections::{hash_map::Entry, HashMap}, + fs::Metadata, + mem, + path::Path, + sync::Arc, +}; use prisma_client_rust::{or, QueryError}; use rspc::ErrorCode; @@ -20,11 +28,13 @@ pub mod job; mod shallow; mod tasks; -use cas_id::generate_cas_id; +pub use cas_id::generate_cas_id; pub use job::FileIdentifier; pub use shallow::shallow; +use tasks::FilePathToCreateOrLinkObject; + // we break these tasks into chunks of 100 to improve performance const CHUNK_SIZE: usize = 100; @@ -44,17 +54,18 @@ pub enum Error { } impl From for rspc::Error { - fn from(err: Error) -> Self { - match err { + fn from(e: Error) -> Self { + match e { Error::SubPath(sub_path_err) => 
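The serialization path above routes each task left over at shutdown by its concrete type (Identifier or ObjectProcessor), pairing the encoded bytes with a TaskKind tag before the whole batch is packed into SerializedTasks. The same tag-by-runtime-type idiom, shown with std::any::Any as a simplified stand-in for the task trait object (not the actual task-system API):

use std::any::Any;

// Tag-and-encode a type-erased value, mirroring the (TaskKind, bytes) pairs built above.
fn tag_and_encode(value: Box<dyn Any>) -> Option<(&'static str, Vec<u8>)> {
    if value.is::<u32>() {
        let n = *value.downcast::<u32>().expect("just checked");
        Some(("u32", n.to_le_bytes().to_vec()))
    } else if value.is::<String>() {
        let s = *value.downcast::<String>().expect("just checked");
        Some(("string", s.into_bytes()))
    } else {
        None
    }
}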
sub_path_err.into(), - _ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => Self::with_cause(ErrorCode::InternalServerError, e.to_string(), e), } } } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] -pub enum NonCriticalError { +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NonCriticalFileIdentifierError { #[error("failed to extract file metadata: {0}")] FailedToExtractFileMetadata(String), #[cfg(target_os = "windows")] @@ -66,7 +77,7 @@ pub enum NonCriticalError { #[derive(Debug, Clone)] pub struct FileMetadata { - pub cas_id: Option, + pub cas_id: Option>, pub kind: ObjectKind, pub fs_metadata: Metadata, } @@ -87,10 +98,14 @@ impl FileMetadata { .await .map_err(|e| FileIOError::from((&path, e)))?; - assert!( - !fs_metadata.is_dir(), - "We can't generate cas_id for directories" - ); + if fs_metadata.is_dir() { + trace!(path = %path.display(), "Skipping directory;"); + return Ok(Self { + cas_id: None, + kind: ObjectKind::Folder, + fs_metadata, + }); + } // derive Object kind let kind = Extension::resolve_conflicting(&path, false) @@ -108,8 +123,10 @@ impl FileMetadata { }; trace!( - "Analyzed file: ", - path.display() + path = %path.display(), + ?cas_id, + %kind, + "Analyzed file;", ); Ok(Self { @@ -140,7 +157,7 @@ fn orphan_path_filters_shallow( )), file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())), ], - [file_path_id.map(file_path::id::gte)], + [file_path_id.map(file_path::id::gt)], ) } @@ -161,7 +178,7 @@ fn orphan_path_filters_deep( ], [ // this is a workaround for the cursor not working properly - file_path_id.map(file_path::id::gte), + file_path_id.map(file_path::id::gt), maybe_sub_iso_file_path.as_ref().map(|sub_iso_file_path| { file_path::materialized_path::starts_with( sub_iso_file_path @@ -172,3 +189,91 @@ fn orphan_path_filters_deep( ], ) } + +async fn dispatch_object_processor_tasks( + file_paths_by_cas_id: Iter, + ctx: &impl OuterContext, + dispatcher: &Dispatcher, + with_priority: bool, +) -> Result>, Dispatcher::DispatchError> +where + Iter: IntoIterator, Vec)> + Send, + Iter::IntoIter: Send, + Dispatcher: TaskDispatcher, +{ + let mut current_batch = HashMap::<_, Vec<_>>::new(); + let mut tasks = vec![]; + + let mut current_batch_size = 0; + + for (cas_id, objects_to_create_or_link) in file_paths_by_cas_id { + if objects_to_create_or_link.len() >= CHUNK_SIZE { + tasks.push( + dispatcher + .dispatch(tasks::ObjectProcessor::new( + HashMap::from([(cas_id, objects_to_create_or_link)]), + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + with_priority, + )) + .await?, + ); + } else { + current_batch_size += objects_to_create_or_link.len(); + match current_batch.entry(cas_id) { + Entry::Occupied(entry) => { + entry.into_mut().extend(objects_to_create_or_link); + } + Entry::Vacant(entry) => { + entry.insert(objects_to_create_or_link); + } + } + + if current_batch_size >= CHUNK_SIZE { + tasks.push( + dispatcher + .dispatch(tasks::ObjectProcessor::new( + mem::take(&mut current_batch), + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + with_priority, + )) + .await?, + ); + + current_batch_size = 0; + } + } + } + + if !current_batch.is_empty() { + tasks.push( + dispatcher + .dispatch(tasks::ObjectProcessor::new( + current_batch, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + with_priority, + )) + .await?, + ); + } + + Ok(tasks) +} + +fn accumulate_file_paths_by_cas_id( + input: HashMap, Vec>, + accumulator: &mut HashMap, Vec>, +) { + for 
(cas_id, file_paths) in input { + match accumulator.entry(cas_id) { + Entry::<_, Vec<_>>::Occupied(entry) => { + entry.into_mut().extend(file_paths); + } + Entry::Vacant(entry) => { + entry.insert(file_paths); + } + } + } +} diff --git a/core/crates/heavy-lifting/src/file_identifier/shallow.rs b/core/crates/heavy-lifting/src/file_identifier/shallow.rs index dbbedb2c2..cd165867d 100644 --- a/core/crates/heavy-lifting/src/file_identifier/shallow.rs +++ b/core/crates/heavy-lifting/src/file_identifier/shallow.rs @@ -1,6 +1,6 @@ use crate::{ file_identifier, utils::sub_path::maybe_get_iso_file_path_from_sub_path, Error, - NonCriticalError, OuterContext, + NonCriticalError, OuterContext, UpdateEvent, }; use sd_core_file_path_helper::IsolatedFilePathData; @@ -8,34 +8,40 @@ use sd_core_prisma_helpers::file_path_for_file_identifier; use sd_prisma::prisma::{file_path, location, SortOrder}; use sd_task_system::{ - BaseTaskDispatcher, CancelTaskOnDrop, TaskDispatcher, TaskOutput, TaskStatus, + BaseTaskDispatcher, CancelTaskOnDrop, TaskDispatcher, TaskHandle, TaskOutput, TaskStatus, }; use sd_utils::db::maybe_missing; use std::{ + collections::HashMap, path::{Path, PathBuf}, sync::Arc, }; -use futures_concurrency::future::FutureGroup; -use lending_stream::{LendingStream, StreamExt}; -use tracing::{debug, warn}; +use futures::{stream::FuturesUnordered, StreamExt}; +use tracing::{debug, instrument, trace, warn}; use super::{ - orphan_path_filters_shallow, - tasks::{ - extract_file_metadata, object_processor, ExtractFileMetadataTask, ObjectProcessorTask, - }, + accumulate_file_paths_by_cas_id, dispatch_object_processor_tasks, orphan_path_filters_shallow, + tasks::{self, identifier, object_processor}, CHUNK_SIZE, }; +#[instrument( + skip_all, + fields( + location_id = location.id, + location_path = ?location.path, + sub_path = %sub_path.as_ref().display() + ) + err, +)] pub async fn shallow( location: location::Data, sub_path: impl AsRef + Send, - dispatcher: BaseTaskDispatcher, - ctx: impl OuterContext, + dispatcher: &BaseTaskDispatcher, + ctx: &impl OuterContext, ) -> Result, Error> { - let sub_path = sub_path.as_ref(); let db = ctx.db(); let location_path = maybe_missing(&location.path, "location.path") @@ -45,22 +51,25 @@ pub async fn shallow( let location = Arc::new(location); - let sub_iso_file_path = - maybe_get_iso_file_path_from_sub_path(location.id, &Some(sub_path), &*location_path, db) - .await - .map_err(file_identifier::Error::from)? - .map_or_else( - || { - IsolatedFilePathData::new(location.id, &*location_path, &*location_path, true) - .map_err(file_identifier::Error::from) - }, - Ok, - )?; + let sub_iso_file_path = maybe_get_iso_file_path_from_sub_path::( + location.id, + Some(sub_path.as_ref()), + &*location_path, + db, + ) + .await? 
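The two helpers above work together: dispatch_object_processor_tasks gives any cas_id group that already reaches CHUNK_SIZE its own ObjectProcessor task and coalesces smaller groups until the running batch reaches CHUNK_SIZE, while accumulate_file_paths_by_cas_id merges per-task maps with the Entry API. A stripped-down sketch of just the batching policy, with plain data in place of tasks (illustrative only):

use std::collections::HashMap;
use std::mem;

const CHUNK_SIZE: usize = 100;

// Returns one batch per dispatched "task": big groups go alone, small groups are coalesced.
fn batch_by_size(groups: Vec<(u32, Vec<u64>)>) -> Vec<HashMap<u32, Vec<u64>>> {
    let mut batches = Vec::new();
    let mut current = HashMap::new();
    let mut current_size = 0;

    for (key, items) in groups {
        if items.len() >= CHUNK_SIZE {
            batches.push(HashMap::from([(key, items)]));
        } else {
            current_size += items.len();
            current.entry(key).or_insert_with(Vec::new).extend(items);
            if current_size >= CHUNK_SIZE {
                batches.push(mem::take(&mut current));
                current_size = 0;
            }
        }
    }

    if !current.is_empty() {
        batches.push(current);
    }
    batches
}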
+ .map_or_else( + || { + IsolatedFilePathData::new(location.id, &*location_path, &*location_path, true) + .map_err(file_identifier::Error::from) + }, + Ok, + )?; let mut orphans_count = 0; let mut last_orphan_file_path_id = None; - let mut pending_running_tasks = FutureGroup::new(); + let mut identifier_tasks = vec![]; loop { #[allow(clippy::cast_possible_wrap)] @@ -87,70 +96,89 @@ pub async fn shallow( orphans_count += orphan_paths.len() as u64; last_orphan_file_path_id = Some(last_orphan.id); - pending_running_tasks.insert(CancelTaskOnDrop( - dispatcher - .dispatch(ExtractFileMetadataTask::new( - Arc::clone(&location), - Arc::clone(&location_path), - orphan_paths, - true, - )) - .await, - )); + let Ok(tasks) = dispatcher + .dispatch(tasks::Identifier::new( + Arc::clone(&location), + Arc::clone(&location_path), + orphan_paths, + true, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + )) + .await + else { + debug!("Task system is shutting down while a shallow file identifier was in progress"); + return Ok(vec![]); + }; + + identifier_tasks.push(tasks); } if orphans_count == 0 { - debug!( - "No orphans found on ", - location.id, - sub_path.display() - ); + trace!("No orphans found"); return Ok(vec![]); } - let errors = process_tasks(pending_running_tasks, dispatcher, ctx).await?; - - Ok(errors) + process_tasks(identifier_tasks, dispatcher, ctx).await } async fn process_tasks( - pending_running_tasks: FutureGroup>, - dispatcher: BaseTaskDispatcher, - ctx: impl OuterContext, + identifier_tasks: Vec>, + dispatcher: &BaseTaskDispatcher, + ctx: &impl OuterContext, ) -> Result, Error> { - let mut pending_running_tasks = pending_running_tasks.lend_mut(); + let total_identifier_tasks = identifier_tasks.len(); - let db = ctx.db(); - let sync = ctx.sync(); + let mut pending_running_tasks = identifier_tasks + .into_iter() + .map(CancelTaskOnDrop::new) + .collect::>(); let mut errors = vec![]; + let mut completed_identifier_tasks = 0; + let mut file_paths_accumulator = HashMap::new(); - while let Some((pending_running_tasks, task_result)) = pending_running_tasks.next().await { + while let Some(task_result) = pending_running_tasks.next().await { match task_result { Ok(TaskStatus::Done((_, TaskOutput::Out(any_task_output)))) => { // We only care about ExtractFileMetadataTaskOutput because we need to dispatch further tasks // and the ObjectProcessorTask only gives back some metrics not much important for // shallow file identifier - if any_task_output.is::() { - let extract_file_metadata::Output { - identified_files, + if any_task_output.is::() { + let identifier::Output { + file_path_ids_with_new_object, + file_paths_by_cas_id, errors: more_errors, .. 
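The orphan-seeking loop above pages through file_paths, remembering the id of the last row and dispatching one Identifier task per page; combined with the filter change from id::gte to id::gt in mod.rs, the cursor is now exclusive and the last row of a page is never fetched twice. A schematic version with a hypothetical fetch helper standing in for the Prisma query:

// Hypothetical page fetch standing in for the `file_path::id::gt(cursor)` query.
async fn fetch_orphan_page(after_id: Option<i32>) -> Vec<i32> {
    match after_id {
        None => vec![1, 2, 3],
        Some(3) => vec![4, 5],
        _ => vec![],
    }
}

async fn drain_orphans() -> u64 {
    let mut cursor = None;
    let mut total = 0;
    loop {
        let page = fetch_orphan_page(cursor).await;
        let Some(&last_id) = page.last() else { break };
        cursor = Some(last_id); // exclusive cursor: the next page starts strictly after this id
        total += page.len() as u64; // a real run would dispatch an identifier task per page here
    }
    total
}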
} = *any_task_output.downcast().expect("just checked"); + completed_identifier_tasks += 1; + + ctx.report_update(UpdateEvent::NewIdentifiedObjects { + file_path_ids: file_path_ids_with_new_object, + }); + + accumulate_file_paths_by_cas_id( + file_paths_by_cas_id, + &mut file_paths_accumulator, + ); + errors.extend(more_errors); - if !identified_files.is_empty() { - pending_running_tasks.insert(CancelTaskOnDrop( - dispatcher - .dispatch(ObjectProcessorTask::new( - identified_files, - Arc::clone(db), - Arc::clone(sync), - true, - )) - .await, - )); + if total_identifier_tasks == completed_identifier_tasks { + let Ok(tasks) = dispatch_object_processor_tasks( + file_paths_accumulator.drain(), + ctx, + dispatcher, + true, + ) + .await + else { + debug!("Task system is shutting down while a shallow file identifier was in progress"); + continue; + }; + + pending_running_tasks.extend(tasks.into_iter().map(CancelTaskOnDrop::new)); } } else { let object_processor::Output { @@ -158,21 +186,21 @@ async fn process_tasks( .. } = *any_task_output.downcast().expect("just checked"); - ctx.report_update(crate::UpdateEvent::NewIdentifiedObjects { + ctx.report_update(UpdateEvent::NewIdentifiedObjects { file_path_ids: file_path_ids_with_new_object, }); } } Ok(TaskStatus::Done((task_id, TaskOutput::Empty))) => { - warn!("Task returned an empty output"); + warn!(%task_id, "Task returned an empty output"); } Ok(TaskStatus::Shutdown(_)) => { debug!( "Spacedrive is shutting down while a shallow file identifier was in progress" ); - return Ok(vec![]); + continue; } Ok(TaskStatus::Error(e)) => { @@ -181,7 +209,7 @@ async fn process_tasks( Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion) => { warn!("Task was cancelled or aborted on shallow file identifier"); - return Ok(vec![]); + return Ok(errors); } Err(e) => { diff --git a/core/crates/heavy-lifting/src/file_identifier/tasks/extract_file_metadata.rs b/core/crates/heavy-lifting/src/file_identifier/tasks/extract_file_metadata.rs deleted file mode 100644 index f8dd41fdc..000000000 --- a/core/crates/heavy-lifting/src/file_identifier/tasks/extract_file_metadata.rs +++ /dev/null @@ -1,267 +0,0 @@ -use crate::{ - file_identifier::{self, FileMetadata}, - Error, NonCriticalError, -}; - -use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_file_identifier; - -use sd_prisma::prisma::location; -use sd_task_system::{ - ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput, SerializableTask, Task, TaskId, -}; -use sd_utils::error::FileIOError; - -use std::{ - collections::HashMap, future::IntoFuture, mem, path::PathBuf, pin::pin, sync::Arc, - time::Duration, -}; - -use futures::stream::{self, FuturesUnordered, StreamExt}; -use futures_concurrency::stream::Merge; -use serde::{Deserialize, Serialize}; -use tokio::time::Instant; -use tracing::error; -use uuid::Uuid; - -use super::IdentifiedFile; - -#[derive(Debug, Serialize, Deserialize)] -pub struct ExtractFileMetadataTask { - id: TaskId, - location: Arc, - location_path: Arc, - file_paths_by_id: HashMap, - identified_files: HashMap, - extract_metadata_time: Duration, - errors: Vec, - with_priority: bool, -} - -#[derive(Debug)] -pub struct Output { - pub identified_files: HashMap, - pub extract_metadata_time: Duration, - pub errors: Vec, -} - -impl ExtractFileMetadataTask { - #[must_use] - pub fn new( - location: Arc, - location_path: Arc, - file_paths: Vec, - with_priority: bool, - ) -> Self { - Self { - id: TaskId::new_v4(), - location, - location_path, - identified_files: 
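process_tasks above drives a single FuturesUnordered of CancelTaskOnDrop-wrapped handles, counts completed identifier tasks, and only once every identifier task has reported does it dispatch the object-processor work, pushing the new handles into the same set. The shape of that two-stage drive, using boxed futures in place of task handles (assumes the futures and tokio crates; illustrative only):

use futures::{future::BoxFuture, stream::{FuturesUnordered, StreamExt}, FutureExt};

#[tokio::main]
async fn main() {
    // First stage: three "identifier"-like futures.
    let mut pending: FuturesUnordered<BoxFuture<'static, u32>> =
        (0u32..3).map(|i| async move { i }.boxed()).collect();

    let (total_first_stage, mut completed) = (3, 0);
    while let Some(out) = pending.next().await {
        completed += 1;
        // Only after the whole first stage finishes is the second stage pushed,
        // mirroring how object-processor tasks wait for all identifier outputs.
        if completed == total_first_stage {
            pending.push(async move { out + 100 }.boxed());
        }
    }
}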
HashMap::with_capacity(file_paths.len()), - file_paths_by_id: file_paths - .into_iter() - .map(|file_path| { - // SAFETY: This should never happen - ( - Uuid::from_slice(&file_path.pub_id).expect("file_path.pub_id is invalid!"), - file_path, - ) - }) - .collect(), - extract_metadata_time: Duration::ZERO, - errors: Vec::new(), - with_priority, - } - } -} - -#[async_trait::async_trait] -impl Task for ExtractFileMetadataTask { - fn id(&self) -> TaskId { - self.id - } - - fn with_priority(&self) -> bool { - self.with_priority - } - - async fn run(&mut self, interrupter: &Interrupter) -> Result { - // `Processed` is larger than `Interrupt`, but it's much more common - // so we ignore the size difference to optimize for usage - #[allow(clippy::large_enum_variant)] - enum StreamMessage { - Processed(Uuid, Result), - Interrupt(InterruptionKind), - } - - let Self { - location, - location_path, - file_paths_by_id, - identified_files, - extract_metadata_time, - errors, - .. - } = self; - - let start_time = Instant::now(); - - if !file_paths_by_id.is_empty() { - let extraction_futures = file_paths_by_id - .iter() - .filter_map(|(file_path_id, file_path)| { - try_iso_file_path_extraction( - location.id, - *file_path_id, - file_path, - Arc::clone(location_path), - errors, - ) - }) - .map(|(file_path_id, iso_file_path, location_path)| async move { - StreamMessage::Processed( - file_path_id, - FileMetadata::new(&*location_path, &iso_file_path).await, - ) - }) - .collect::>(); - - let mut msg_stream = pin!(( - extraction_futures, - stream::once(interrupter.into_future()).map(StreamMessage::Interrupt) - ) - .merge()); - - while let Some(msg) = msg_stream.next().await { - match msg { - StreamMessage::Processed(file_path_pub_id, res) => { - let file_path = file_paths_by_id - .remove(&file_path_pub_id) - .expect("file_path must be here"); - - match res { - Ok(FileMetadata { cas_id, kind, .. 
}) => { - identified_files.insert( - file_path_pub_id, - IdentifiedFile { - file_path, - cas_id, - kind, - }, - ); - } - Err(e) => { - handle_non_critical_errors( - location.id, - file_path_pub_id, - &e, - errors, - ); - } - } - - if file_paths_by_id.is_empty() { - // All files have been processed so we can end this merged stream and don't keep waiting an - // interrupt signal - break; - } - } - - StreamMessage::Interrupt(kind) => { - *extract_metadata_time += start_time.elapsed(); - return Ok(match kind { - InterruptionKind::Pause => ExecStatus::Paused, - InterruptionKind::Cancel => ExecStatus::Canceled, - }); - } - } - } - } - - Ok(ExecStatus::Done( - Output { - identified_files: mem::take(identified_files), - extract_metadata_time: *extract_metadata_time + start_time.elapsed(), - errors: mem::take(errors), - } - .into_output(), - )) - } -} - -fn handle_non_critical_errors( - location_id: location::id::Type, - file_path_pub_id: Uuid, - e: &FileIOError, - errors: &mut Vec, -) { - error!("Failed to extract file metadata : {e:#?}"); - - let formatted_error = format!(""); - - #[cfg(target_os = "windows")] - { - // Handle case where file is on-demand (NTFS only) - if e.source.raw_os_error().map_or(false, |code| code == 362) { - errors.push( - file_identifier::NonCriticalError::FailedToExtractMetadataFromOnDemandFile( - formatted_error, - ) - .into(), - ); - } else { - errors.push( - file_identifier::NonCriticalError::FailedToExtractFileMetadata(formatted_error) - .into(), - ); - } - } - - #[cfg(not(target_os = "windows"))] - { - errors.push( - file_identifier::NonCriticalError::FailedToExtractFileMetadata(formatted_error).into(), - ); - } -} - -fn try_iso_file_path_extraction( - location_id: location::id::Type, - file_path_pub_id: Uuid, - file_path: &file_path_for_file_identifier::Data, - location_path: Arc, - errors: &mut Vec, -) -> Option<(Uuid, IsolatedFilePathData<'static>, Arc)> { - IsolatedFilePathData::try_from((location_id, file_path)) - .map(IsolatedFilePathData::to_owned) - .map(|iso_file_path| (file_path_pub_id, iso_file_path, location_path)) - .map_err(|e| { - error!("Failed to extract isolated file path data: {e:#?}"); - errors.push( - file_identifier::NonCriticalError::FailedToExtractIsolatedFilePathData(format!( - "" - )) - .into(), - ); - }) - .ok() -} - -impl SerializableTask for ExtractFileMetadataTask { - type SerializeError = rmp_serde::encode::Error; - - type DeserializeError = rmp_serde::decode::Error; - - type DeserializeCtx = (); - - async fn serialize(self) -> Result, Self::SerializeError> { - rmp_serde::to_vec_named(&self) - } - - async fn deserialize( - data: &[u8], - (): Self::DeserializeCtx, - ) -> Result { - rmp_serde::from_slice(data) - } -} diff --git a/core/crates/heavy-lifting/src/file_identifier/tasks/identifier.rs b/core/crates/heavy-lifting/src/file_identifier/tasks/identifier.rs new file mode 100644 index 000000000..74785d9c4 --- /dev/null +++ b/core/crates/heavy-lifting/src/file_identifier/tasks/identifier.rs @@ -0,0 +1,508 @@ +use crate::{ + file_identifier::{self, FileMetadata}, + Error, NonCriticalError, +}; + +use sd_core_file_path_helper::IsolatedFilePathData; +use sd_core_prisma_helpers::{file_path_for_file_identifier, CasId, FilePathPubId}; +use sd_core_sync::Manager as SyncManager; + +use sd_file_ext::kind::ObjectKind; +use sd_prisma::{ + prisma::{file_path, location, PrismaClient}, + prisma_sync, +}; +use sd_sync::OperationFactory; +use sd_task_system::{ + ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput, SerializableTask, Task, 
TaskId, +}; +use sd_utils::{error::FileIOError, msgpack}; + +use std::{ + collections::HashMap, future::IntoFuture, mem, path::PathBuf, pin::pin, sync::Arc, + time::Duration, +}; + +use futures::stream::{self, FuturesUnordered, StreamExt}; +use futures_concurrency::{future::TryJoin, stream::Merge}; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use tracing::{error, instrument, trace, Level}; + +use super::{create_objects_and_update_file_paths, FilePathToCreateOrLinkObject}; + +#[derive(Debug, Serialize, Deserialize)] +struct IdentifiedFile { + file_path: file_path_for_file_identifier::Data, + cas_id: CasId<'static>, + kind: ObjectKind, +} + +impl IdentifiedFile { + pub fn new( + file_path: file_path_for_file_identifier::Data, + cas_id: impl Into>, + kind: ObjectKind, + ) -> Self { + Self { + file_path, + cas_id: cas_id.into(), + kind, + } + } +} + +#[derive(Debug)] +pub struct Identifier { + // Task control + id: TaskId, + with_priority: bool, + + // Received input args + location: Arc, + location_path: Arc, + file_paths_by_id: HashMap, + + // Inner state + identified_files: HashMap, + file_paths_without_cas_id: Vec, + + // Out collector + output: Output, + + // Dependencies + db: Arc, + sync: Arc, +} + +/// Output from the `[Identifier]` task +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct Output { + /// To send to frontend for priority reporting of new objects + pub file_path_ids_with_new_object: Vec, + + /// Files that need to be aggregate between many identifier tasks to be processed by the + /// object processor tasks + pub file_paths_by_cas_id: HashMap, Vec>, + + /// Collected metric about time elapsed extracting metadata from file system + pub extract_metadata_time: Duration, + + /// Collected metric about time spent saving objects on disk + pub save_db_time: Duration, + + /// Total number of objects already created as they didn't have `cas_id`, like directories or empty files + pub created_objects_count: u64, + + /// Total number of files that we were able to identify + pub total_identified_files: u64, + + /// Non critical errors that happened during the task execution + pub errors: Vec, +} + +#[async_trait::async_trait] +impl Task for Identifier { + fn id(&self) -> TaskId { + self.id + } + + fn with_priority(&self) -> bool { + self.with_priority + } + + #[instrument( + skip(self, interrupter), + fields( + task_id = %self.id, + location_id = %self.location.id, + location_path = %self.location_path.display(), + files_count = %self.file_paths_by_id.len(), + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above + async fn run(&mut self, interrupter: &Interrupter) -> Result { + // `Processed` is larger than `Interrupt`, but it's much more common + // so we ignore the size difference to optimize for usage + #[allow(clippy::large_enum_variant)] + enum StreamMessage { + Processed(FilePathPubId, Result), + Interrupt(InterruptionKind), + } + + let Self { + location, + location_path, + file_paths_by_id, + file_paths_without_cas_id, + identified_files, + output, + .. 
+ } = self; + + if !file_paths_by_id.is_empty() { + let start_time = Instant::now(); + + let extraction_futures = file_paths_by_id + .iter() + .filter_map(|(file_path_id, file_path)| { + try_iso_file_path_extraction( + location.id, + file_path_id.clone(), + file_path, + Arc::clone(location_path), + &mut output.errors, + ) + }) + .map(|(file_path_id, iso_file_path, location_path)| async move { + StreamMessage::Processed( + file_path_id, + FileMetadata::new(&*location_path, &iso_file_path).await, + ) + }) + .collect::>(); + + let mut msg_stream = pin!(( + extraction_futures, + stream::once(interrupter.into_future()).map(StreamMessage::Interrupt) + ) + .merge()); + + while let Some(msg) = msg_stream.next().await { + match msg { + StreamMessage::Processed(file_path_pub_id, res) => { + let file_path = file_paths_by_id + .remove(&file_path_pub_id) + .expect("file_path must be here"); + + trace!( + files_remaining = file_paths_by_id.len(), + %file_path_pub_id, + "Processed file;", + ); + + match res { + Ok(FileMetadata { + cas_id: Some(cas_id), + kind, + .. + }) => { + identified_files.insert( + file_path_pub_id, + IdentifiedFile::new(file_path, cas_id, kind), + ); + } + Ok(FileMetadata { + cas_id: None, kind, .. + }) => { + let file_path_for_file_identifier::Data { + id, + pub_id, + date_created, + .. + } = file_path; + file_paths_without_cas_id.push(FilePathToCreateOrLinkObject { + id, + file_path_pub_id: pub_id.into(), + kind, + created_at: date_created, + }); + } + Err(e) => { + handle_non_critical_errors( + file_path_pub_id, + &e, + &mut output.errors, + ); + } + } + + if file_paths_by_id.is_empty() { + trace!("All files have been processed"); + // All files have been processed so we can end this merged stream + // and don't keep waiting an interrupt signal + break; + } + } + + StreamMessage::Interrupt(kind) => { + trace!(?kind, "Interrupted;"); + output.extract_metadata_time += start_time.elapsed(); + return Ok(match kind { + InterruptionKind::Pause => ExecStatus::Paused, + InterruptionKind::Cancel => ExecStatus::Canceled, + }); + } + } + } + + output.extract_metadata_time = start_time.elapsed(); + + output.total_identified_files = + identified_files.len() as u64 + file_paths_without_cas_id.len() as u64; + + trace!( + identified_files_count = identified_files.len(), + "All files have been processed, saving cas_ids to db...;" + ); + let start_time = Instant::now(); + // Assign cas_id to each file path + let ((), file_path_ids_with_new_object) = ( + assign_cas_id_to_file_paths(identified_files, &self.db, &self.sync), + create_objects_and_update_file_paths( + file_paths_without_cas_id.drain(..), + &self.db, + &self.sync, + ), + ) + .try_join() + .await?; + + output.save_db_time = start_time.elapsed(); + output.created_objects_count = file_path_ids_with_new_object.len() as u64; + output.file_path_ids_with_new_object = + file_path_ids_with_new_object.into_keys().collect(); + + output.file_paths_by_cas_id = identified_files.drain().fold( + HashMap::new(), + |mut map, + ( + file_path_pub_id, + IdentifiedFile { + cas_id, + kind, + file_path: + file_path_for_file_identifier::Data { + id, date_created, .. 
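The identifier task above merges the stream of per-file FileMetadata futures with a one-element stream built from the interrupter, so a pause or cancel request is observed between file extractions and the task can return Paused or Canceled after recording the elapsed time. A reduced sketch of that race-work-against-interrupt shape, using a oneshot channel in place of the task system's Interrupter (hypothetical stand-in, not the real API):

use futures::stream::{self, StreamExt};

enum Msg {
    Processed(u32),
    Interrupt,
}

#[tokio::main]
async fn main() {
    let work = stream::iter(0u32..5).map(Msg::Processed);

    let (tx, rx) = tokio::sync::oneshot::channel::<()>();
    // Resolves once a pause/cancel signal arrives (or the sender is dropped).
    let interrupt = stream::once(async move {
        let _ = rx.await;
        Msg::Interrupt
    });
    // Simulate an interruption request arriving while work is still in flight.
    drop(tx);

    let mut merged = Box::pin(stream::select(work, interrupt));
    while let Some(msg) = merged.next().await {
        match msg {
            Msg::Processed(n) => println!("processed file {n}"),
            Msg::Interrupt => {
                println!("interrupted; recording elapsed time and pausing");
                break;
            }
        }
    }
}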
+ }, + }, + )| { + map.entry(cas_id) + .or_insert_with(|| Vec::with_capacity(1)) + .push(FilePathToCreateOrLinkObject { + id, + file_path_pub_id, + kind, + created_at: date_created, + }); + + map + }, + ); + + trace!(save_db_time = ?output.save_db_time, "Cas_ids saved to db;"); + } + + Ok(ExecStatus::Done(mem::take(output).into_output())) + } +} + +impl Identifier { + #[must_use] + pub fn new( + location: Arc, + location_path: Arc, + file_paths: Vec, + with_priority: bool, + db: Arc, + sync: Arc, + ) -> Self { + Self { + id: TaskId::new_v4(), + location, + location_path, + identified_files: HashMap::with_capacity(file_paths.len()), + file_paths_without_cas_id: Vec::with_capacity(file_paths.len()), + file_paths_by_id: file_paths + .into_iter() + .map(|file_path| (file_path.pub_id.as_slice().into(), file_path)) + .collect(), + output: Output::default(), + with_priority, + db, + sync, + } + } +} + +#[instrument(skip_all, err, fields(identified_files_count = identified_files.len()))] +async fn assign_cas_id_to_file_paths( + identified_files: &HashMap, + db: &PrismaClient, + sync: &SyncManager, +) -> Result<(), file_identifier::Error> { + // Assign cas_id to each file path + sync.write_ops( + db, + identified_files + .iter() + .map(|(pub_id, IdentifiedFile { cas_id, .. })| { + ( + sync.shared_update( + prisma_sync::file_path::SyncId { + pub_id: pub_id.to_db(), + }, + file_path::cas_id::NAME, + msgpack!(cas_id), + ), + db.file_path() + .update( + file_path::pub_id::equals(pub_id.to_db()), + vec![file_path::cas_id::set(cas_id.into())], + ) + // We don't need any data here, just the id avoids receiving the entire object + // as we can't pass an empty select macro call + .select(file_path::select!({ id })), + ) + }) + .unzip::<_, _, _, Vec<_>>(), + ) + .await?; + + Ok(()) +} + +#[instrument(skip(errors))] +fn handle_non_critical_errors( + file_path_pub_id: FilePathPubId, + e: &FileIOError, + errors: &mut Vec, +) { + let formatted_error = format!(""); + + #[cfg(target_os = "windows")] + { + // Handle case where file is on-demand (NTFS only) + if e.source.raw_os_error().map_or(false, |code| code == 362) { + errors.push( + file_identifier::NonCriticalFileIdentifierError::FailedToExtractMetadataFromOnDemandFile( + formatted_error, + ) + .into(), + ); + } else { + errors.push( + file_identifier::NonCriticalFileIdentifierError::FailedToExtractFileMetadata( + formatted_error, + ) + .into(), + ); + } + } + + #[cfg(not(target_os = "windows"))] + { + errors.push( + file_identifier::NonCriticalFileIdentifierError::FailedToExtractFileMetadata( + formatted_error, + ) + .into(), + ); + } +} + +#[instrument( + skip(location_id, file_path, location_path, errors), + fields( + file_path_id = file_path.id, + materialized_path = ?file_path.materialized_path, + name = ?file_path.name, + extension = ?file_path.extension, + ) +)] +fn try_iso_file_path_extraction( + location_id: location::id::Type, + file_path_pub_id: FilePathPubId, + file_path: &file_path_for_file_identifier::Data, + location_path: Arc, + errors: &mut Vec, +) -> Option<(FilePathPubId, IsolatedFilePathData<'static>, Arc)> { + IsolatedFilePathData::try_from((location_id, file_path)) + .map(IsolatedFilePathData::to_owned) + .map_err(|e| { + error!(?e, "Failed to extract isolated file path data;"); + errors.push( + file_identifier::NonCriticalFileIdentifierError::FailedToExtractIsolatedFilePathData(format!( + "" + )) + .into(), + ); + }) + .map(|iso_file_path| (file_path_pub_id, iso_file_path, location_path)) + .ok() +} + +#[derive(Serialize, Deserialize)] 
+struct SaveState { + id: TaskId, + location: Arc, + location_path: Arc, + file_paths_by_id: HashMap, + identified_files: HashMap, + file_paths_without_cas_id: Vec, + output: Output, + with_priority: bool, +} + +impl SerializableTask for Identifier { + type SerializeError = rmp_serde::encode::Error; + + type DeserializeError = rmp_serde::decode::Error; + + type DeserializeCtx = (Arc, Arc); + + async fn serialize(self) -> Result, Self::SerializeError> { + let Self { + id, + location, + location_path, + file_paths_by_id, + identified_files, + file_paths_without_cas_id, + output, + with_priority, + .. + } = self; + rmp_serde::to_vec_named(&SaveState { + id, + location, + location_path, + file_paths_by_id, + identified_files, + file_paths_without_cas_id, + output, + with_priority, + }) + } + + async fn deserialize( + data: &[u8], + (db, sync): Self::DeserializeCtx, + ) -> Result { + rmp_serde::from_slice::(data).map( + |SaveState { + id, + location, + location_path, + file_paths_by_id, + identified_files, + file_paths_without_cas_id, + output, + with_priority, + }| Self { + id, + with_priority, + location, + location_path, + file_paths_by_id, + identified_files, + file_paths_without_cas_id, + output, + db, + sync, + }, + ) + } +} diff --git a/core/crates/heavy-lifting/src/file_identifier/tasks/mod.rs b/core/crates/heavy-lifting/src/file_identifier/tasks/mod.rs index c06fc8ad0..f74a03b4a 100644 --- a/core/crates/heavy-lifting/src/file_identifier/tasks/mod.rs +++ b/core/crates/heavy-lifting/src/file_identifier/tasks/mod.rs @@ -1,18 +1,171 @@ -use sd_core_prisma_helpers::file_path_for_file_identifier; +use crate::file_identifier; + +use sd_core_prisma_helpers::{file_path_id, FilePathPubId, ObjectPubId}; +use sd_core_sync::Manager as SyncManager; use sd_file_ext::kind::ObjectKind; +use sd_prisma::{ + prisma::{file_path, object, PrismaClient}, + prisma_sync, +}; +use sd_sync::{CRDTOperation, OperationFactory}; +use sd_utils::msgpack; +use std::collections::{HashMap, HashSet}; + +use chrono::{DateTime, FixedOffset}; +use prisma_client_rust::Select; use serde::{Deserialize, Serialize}; +use tracing::{instrument, trace, Level}; -pub mod extract_file_metadata; +pub mod identifier; pub mod object_processor; -pub use extract_file_metadata::ExtractFileMetadataTask; -pub use object_processor::ObjectProcessorTask; +pub use identifier::Identifier; +pub use object_processor::ObjectProcessor; +/// This object has all needed data to create a new `object` for a `file_path` or link an existing one. 
#[derive(Debug, Serialize, Deserialize)] -pub(super) struct IdentifiedFile { - pub(super) file_path: file_path_for_file_identifier::Data, - pub(super) cas_id: Option, - pub(super) kind: ObjectKind, +pub(super) struct FilePathToCreateOrLinkObject { + id: file_path::id::Type, + file_path_pub_id: FilePathPubId, + kind: ObjectKind, + created_at: Option>, +} + +#[instrument(skip(sync, db))] +fn connect_file_path_to_object<'db>( + file_path_pub_id: &FilePathPubId, + object_pub_id: &ObjectPubId, + db: &'db PrismaClient, + sync: &SyncManager, +) -> (CRDTOperation, Select<'db, file_path_id::Data>) { + trace!("Connecting"); + + ( + sync.shared_update( + prisma_sync::file_path::SyncId { + pub_id: file_path_pub_id.to_db(), + }, + file_path::object::NAME, + msgpack!(prisma_sync::object::SyncId { + pub_id: object_pub_id.to_db(), + }), + ), + db.file_path() + .update( + file_path::pub_id::equals(file_path_pub_id.to_db()), + vec![file_path::object::connect(object::pub_id::equals( + object_pub_id.to_db(), + ))], + ) + // selecting just id to avoid fetching the whole object + .select(file_path_id::select()), + ) +} + +#[instrument(skip_all, ret(level = Level::TRACE), err)] +async fn create_objects_and_update_file_paths( + files_and_kinds: impl IntoIterator + Send, + db: &PrismaClient, + sync: &SyncManager, +) -> Result, file_identifier::Error> { + trace!("Preparing objects"); + let (object_create_args, file_path_args) = files_and_kinds + .into_iter() + .map( + |FilePathToCreateOrLinkObject { + id, + file_path_pub_id, + kind, + created_at, + }| { + let object_pub_id = ObjectPubId::new(); + + let kind = kind as i32; + + let (sync_params, db_params) = [ + ( + (object::date_created::NAME, msgpack!(created_at)), + object::date_created::set(created_at), + ), + ( + (object::kind::NAME, msgpack!(kind)), + object::kind::set(Some(kind)), + ), + ] + .into_iter() + .unzip::<_, _, Vec<_>, Vec<_>>(); + + ( + ( + sync.shared_create( + prisma_sync::object::SyncId { + pub_id: object_pub_id.to_db(), + }, + sync_params, + ), + object::create_unchecked(object_pub_id.to_db(), db_params), + ), + ( + (id, object_pub_id.clone()), + connect_file_path_to_object(&file_path_pub_id, &object_pub_id, db, sync), + ), + ) + }, + ) + .unzip::<_, _, Vec<_>, Vec<_>>(); + + let (mut object_pub_id_by_file_path_id, file_path_update_args) = file_path_args + .into_iter() + .unzip::<_, _, HashMap<_, _>, Vec<_>>( + ); + + trace!( + new_objects_count = object_create_args.len(), + "Creating new Objects!;", + ); + + // create new object records with assembled values + let created_objects_count = sync + .write_ops(db, { + let (sync, db_params) = object_create_args + .into_iter() + .unzip::<_, _, Vec<_>, Vec<_>>(); + + ( + sync.into_iter().flatten().collect(), + db.object().create_many(db_params), + ) + }) + .await?; + + trace!(%created_objects_count, "Created new Objects;"); + + if created_objects_count > 0 { + trace!("Updating file paths with created objects"); + + let updated_file_path_ids = sync + .write_ops( + db, + file_path_update_args + .into_iter() + .unzip::<_, _, Vec<_>, Vec<_>>(), + ) + .await + .map(|file_paths| { + file_paths + .into_iter() + .map(|file_path_id::Data { id }| id) + .collect::>() + })?; + + object_pub_id_by_file_path_id + .retain(|file_path_id, _| updated_file_path_ids.contains(file_path_id)); + + Ok(object_pub_id_by_file_path_id) + } else { + trace!("No objects created, skipping file path updates"); + Ok(HashMap::new()) + } } diff --git a/core/crates/heavy-lifting/src/file_identifier/tasks/object_processor.rs 
b/core/crates/heavy-lifting/src/file_identifier/tasks/object_processor.rs index bdc826ddc..458cb5654 100644 --- a/core/crates/heavy-lifting/src/file_identifier/tasks/object_processor.rs +++ b/core/crates/heavy-lifting/src/file_identifier/tasks/object_processor.rs @@ -1,98 +1,76 @@ use crate::{file_identifier, Error}; -use sd_core_prisma_helpers::{ - file_path_for_file_identifier, file_path_pub_id, object_for_file_identifier, -}; +use sd_core_prisma_helpers::{file_path_id, object_for_file_identifier, CasId, ObjectPubId}; use sd_core_sync::Manager as SyncManager; -use sd_prisma::{ - prisma::{file_path, object, PrismaClient}, - prisma_sync, -}; -use sd_sync::{CRDTOperation, OperationFactory}; +use sd_prisma::prisma::{file_path, object, PrismaClient}; use sd_task_system::{ check_interruption, ExecStatus, Interrupter, IntoAnyTaskOutput, SerializableTask, Task, TaskId, }; -use sd_utils::{msgpack, uuid_to_bytes}; -use std::{ - collections::{HashMap, HashSet}, - mem, - sync::Arc, - time::Duration, -}; +use std::{collections::HashMap, mem, sync::Arc, time::Duration}; -use prisma_client_rust::Select; use serde::{Deserialize, Serialize}; use tokio::time::Instant; -use tracing::{debug, trace}; -use uuid::Uuid; +use tracing::{instrument, trace, Level}; -use super::IdentifiedFile; +use super::{ + connect_file_path_to_object, create_objects_and_update_file_paths, FilePathToCreateOrLinkObject, +}; #[derive(Debug)] -pub struct ObjectProcessorTask { +pub struct ObjectProcessor { + // Task control id: TaskId, + with_priority: bool, + + // Received input args + file_paths_by_cas_id: HashMap, Vec>, + + // Inner state + stage: Stage, + + // Out collector + output: Output, + + // Dependencies db: Arc, sync: Arc, - identified_files: HashMap, - output: Output, - stage: Stage, - with_priority: bool, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct SaveState { - id: TaskId, - identified_files: HashMap, - output: Output, - stage: Stage, - with_priority: bool, -} - -#[derive(Debug, Serialize, Deserialize, Default)] -pub struct Output { - pub file_path_ids_with_new_object: Vec, - pub assign_cas_ids_time: Duration, - pub fetch_existing_objects_time: Duration, - pub assign_to_existing_object_time: Duration, - pub create_object_time: Duration, - pub created_objects_count: u64, - pub linked_objects_count: u64, } #[derive(Debug, Serialize, Deserialize)] enum Stage { Starting, - FetchExistingObjects, AssignFilePathsToExistingObjects { - existing_objects_by_cas_id: HashMap, + existing_objects_by_cas_id: HashMap, ObjectPubId>, }, CreateObjects, } -impl ObjectProcessorTask { - #[must_use] - pub fn new( - identified_files: HashMap, - db: Arc, - sync: Arc, - with_priority: bool, - ) -> Self { - Self { - id: TaskId::new_v4(), - db, - sync, - identified_files, - stage: Stage::Starting, - output: Output::default(), - with_priority, - } - } +/// Output from the `[ObjectProcessor]` task +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct Output { + /// To send to frontend for priority reporting of new objects + pub file_path_ids_with_new_object: Vec, + + /// Time elapsed fetching existing `objects` from db to be linked to `file_paths` + pub fetch_existing_objects_time: Duration, + + /// Time spent linking `file_paths` to already existing `objects` + pub assign_to_existing_object_time: Duration, + + /// Time spent creating new `objects` + pub create_object_time: Duration, + + /// Number of new `objects` created + pub created_objects_count: u64, + + /// Number of `objects` that were linked to `file_paths` + pub 
linked_objects_count: u64, } #[async_trait::async_trait] -impl Task for ObjectProcessorTask { +impl Task for ObjectProcessor { fn id(&self) -> TaskId { self.id } @@ -101,16 +79,25 @@ impl Task for ObjectProcessorTask { self.with_priority } + #[instrument( + skip(self, interrupter), + fields( + task_id = %self.id, + cas_ids_count = %self.file_paths_by_cas_id.len(), + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above async fn run(&mut self, interrupter: &Interrupter) -> Result { let Self { db, sync, - identified_files, + file_paths_by_cas_id, stage, output: Output { file_path_ids_with_new_object, - assign_cas_ids_time, fetch_existing_objects_time, assign_to_existing_object_time, create_object_time, @@ -123,17 +110,17 @@ impl Task for ObjectProcessorTask { loop { match stage { Stage::Starting => { - let start = Instant::now(); - assign_cas_id_to_file_paths(identified_files, db, sync).await?; - *assign_cas_ids_time = start.elapsed(); - *stage = Stage::FetchExistingObjects; - } - - Stage::FetchExistingObjects => { + trace!("Starting object processor task"); let start = Instant::now(); let existing_objects_by_cas_id = - fetch_existing_objects_by_cas_id(identified_files, db).await?; + fetch_existing_objects_by_cas_id(file_paths_by_cas_id.keys(), db).await?; *fetch_existing_objects_time = start.elapsed(); + + trace!( + elapsed_time = ?fetch_existing_objects_time, + existing_objects_count = existing_objects_by_cas_id.len(), + "Fetched existing Objects;", + ); *stage = Stage::AssignFilePathsToExistingObjects { existing_objects_by_cas_id, }; @@ -142,48 +129,53 @@ impl Task for ObjectProcessorTask { Stage::AssignFilePathsToExistingObjects { existing_objects_by_cas_id, } => { + trace!( + existing_objects_to_link = existing_objects_by_cas_id.len(), + "Assigning file paths to existing Objects;", + ); let start = Instant::now(); - let assigned_file_path_pub_ids = assign_existing_objects_to_file_paths( - identified_files, + let more_file_path_ids_with_new_object = assign_existing_objects_to_file_paths( + file_paths_by_cas_id, existing_objects_by_cas_id, db, sync, ) .await?; *assign_to_existing_object_time = start.elapsed(); - *linked_objects_count = assigned_file_path_pub_ids.len() as u64; + file_path_ids_with_new_object.extend(more_file_path_ids_with_new_object); + *linked_objects_count += file_path_ids_with_new_object.len() as u64; - debug!( - "Found {} existing Objects, linked file paths to them", - existing_objects_by_cas_id.len() + trace!( + existing_objects_to_link = existing_objects_by_cas_id.len(), + %linked_objects_count, + "Found existing Objects, linked file paths to them;", ); - for file_path_pub_id::Data { pub_id } in assigned_file_path_pub_ids { - let pub_id = Uuid::from_slice(&pub_id).expect("uuid bytes are invalid"); - trace!("Assigned file path to existing object"); - - identified_files - .remove(&pub_id) - .expect("file_path must be here"); - } - *stage = Stage::CreateObjects; - if identified_files.is_empty() { + if file_paths_by_cas_id.is_empty() { + trace!("No more objects to be created, finishing task"); // No objects to be created, we're good to finish already break; } } Stage::CreateObjects => { + trace!( + creating_count = file_paths_by_cas_id.len(), + "Creating new Objects;" + ); let start = Instant::now(); - *created_objects_count = create_objects(identified_files, db, sync).await?; + let (more_file_paths_with_new_object, more_linked_objects_count) = + 
assign_objects_to_duplicated_orphans(file_paths_by_cas_id, db, sync) + .await?; *create_object_time = start.elapsed(); + file_path_ids_with_new_object.extend(more_file_paths_with_new_object); + *linked_objects_count += more_linked_objects_count; - *file_path_ids_with_new_object = identified_files - .values() - .map(|IdentifiedFile { file_path, .. }| file_path.id) - .collect(); + *created_objects_count = file_path_ids_with_new_object.len() as u64; + + trace!(%created_objects_count, ?create_object_time, "Created new Objects;"); break; } @@ -196,225 +188,188 @@ impl Task for ObjectProcessorTask { } } -async fn assign_cas_id_to_file_paths( - identified_files: &HashMap, - db: &PrismaClient, - sync: &SyncManager, -) -> Result<(), file_identifier::Error> { - // Assign cas_id to each file path - sync.write_ops( - db, - identified_files - .iter() - .map(|(pub_id, IdentifiedFile { cas_id, .. })| { - ( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: uuid_to_bytes(*pub_id), - }, - file_path::cas_id::NAME, - msgpack!(cas_id), - ), - db.file_path() - .update( - file_path::pub_id::equals(uuid_to_bytes(*pub_id)), - vec![file_path::cas_id::set(cas_id.clone())], - ) - // We don't need any data here, just the id avoids receiving the entire object - // as we can't pass an empty select macro call - .select(file_path::select!({ id })), - ) - }) - .unzip::<_, _, _, Vec<_>>(), - ) - .await?; - - Ok(()) +impl ObjectProcessor { + #[must_use] + pub fn new( + file_paths_by_cas_id: HashMap, Vec>, + db: Arc, + sync: Arc, + with_priority: bool, + ) -> Self { + Self { + id: TaskId::new_v4(), + db, + sync, + file_paths_by_cas_id, + stage: Stage::Starting, + output: Output::default(), + with_priority, + } + } } -async fn fetch_existing_objects_by_cas_id( - identified_files: &HashMap, +/// Retrieves objects that are already connected to file paths with the same cas_id +#[instrument(skip_all, err)] +async fn fetch_existing_objects_by_cas_id<'cas_id, Iter>( + cas_ids: Iter, db: &PrismaClient, -) -> Result, file_identifier::Error> { - // Retrieves objects that are already connected to file paths with the same id - db.object() - .find_many(vec![object::file_paths::some(vec![ - file_path::cas_id::in_vec( - identified_files - .values() - .filter_map(|IdentifiedFile { cas_id, .. 
}| cas_id.as_ref()) - .cloned() - .collect::>() +) -> Result, ObjectPubId>, file_identifier::Error> +where + Iter: IntoIterator> + Send, + Iter::IntoIter: Send, +{ + async fn inner( + stringed_cas_ids: Vec, + db: &PrismaClient, + ) -> Result, ObjectPubId>, file_identifier::Error> { + db.object() + .find_many(vec![object::file_paths::some(vec![ + file_path::cas_id::in_vec(stringed_cas_ids), + file_path::object_id::not(None), + ])]) + .select(object_for_file_identifier::select()) + .exec() + .await + .map_err(Into::into) + .map(|objects| { + objects .into_iter() - .collect(), - ), - ])]) - .select(object_for_file_identifier::select()) - .exec() - .await - .map_err(Into::into) - .map(|objects| { - objects - .into_iter() - .filter_map(|object| { - object - .file_paths - .first() - .and_then(|file_path| file_path.cas_id.clone()) - .map(|cas_id| (cas_id, object)) - }) - .collect() - }) + .filter_map(|object_for_file_identifier::Data { pub_id, file_paths }| { + file_paths + .first() + .and_then(|file_path| { + file_path + .cas_id + .as_ref() + .map(CasId::from) + .map(CasId::into_owned) + }) + .map(|cas_id| (cas_id, pub_id.into())) + }) + .collect() + }) + } + + let stringed_cas_ids = cas_ids.into_iter().map(Into::into).collect::>(); + + trace!( + cas_ids_count = stringed_cas_ids.len(), + "Fetching existing objects by cas_ids;", + ); + + inner(stringed_cas_ids, db).await } +/// Attempt to associate each file path with an object that has been +/// connected to file paths with the same cas_id +#[instrument(skip_all, err, fields(identified_files_count = file_paths_by_cas_id.len()))] async fn assign_existing_objects_to_file_paths( - identified_files: &HashMap, - objects_by_cas_id: &HashMap, + file_paths_by_cas_id: &mut HashMap, Vec>, + objects_by_cas_id: &HashMap, ObjectPubId>, db: &PrismaClient, sync: &SyncManager, -) -> Result, file_identifier::Error> { - // Attempt to associate each file path with an object that has been - // connected to file paths with the same cas_id +) -> Result, file_identifier::Error> { sync.write_ops( db, - identified_files + objects_by_cas_id .iter() - .filter_map(|(pub_id, IdentifiedFile { cas_id, .. })| { - objects_by_cas_id - // Filtering out files without cas_id due to being empty - .get(cas_id.as_ref()?) - .map(|object| (*pub_id, object)) - }) - .map(|(pub_id, object)| { - connect_file_path_to_object( - pub_id, - // SAFETY: This pub_id is generated by the uuid lib, but we have to store bytes in sqlite - Uuid::from_slice(&object.pub_id).expect("uuid bytes are invalid"), - sync, - db, - ) + .flat_map(|(cas_id, object_pub_id)| { + file_paths_by_cas_id + .remove(cas_id) + .map(|file_paths| { + file_paths.into_iter().map( + |FilePathToCreateOrLinkObject { + file_path_pub_id, .. 
+ }| { + connect_file_path_to_object( + &file_path_pub_id, + object_pub_id, + db, + sync, + ) + }, + ) + }) + .expect("must be here") }) .unzip::<_, _, Vec<_>, Vec<_>>(), ) .await + .map(|file_paths| { + file_paths + .into_iter() + .map(|file_path_id::Data { id }| id) + .collect() + }) .map_err(Into::into) } -fn connect_file_path_to_object<'db>( - file_path_pub_id: Uuid, - object_pub_id: Uuid, - sync: &SyncManager, - db: &'db PrismaClient, -) -> (CRDTOperation, Select<'db, file_path_pub_id::Data>) { - trace!("Connecting to "); - - let vec_id = object_pub_id.as_bytes().to_vec(); - - ( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: uuid_to_bytes(file_path_pub_id), - }, - file_path::object::NAME, - msgpack!(prisma_sync::object::SyncId { - pub_id: vec_id.clone() - }), - ), - db.file_path() - .update( - file_path::pub_id::equals(uuid_to_bytes(file_path_pub_id)), - vec![file_path::object::connect(object::pub_id::equals(vec_id))], - ) - .select(file_path_pub_id::select()), - ) -} - -async fn create_objects( - identified_files: &HashMap, +async fn assign_objects_to_duplicated_orphans( + file_paths_by_cas_id: &mut HashMap, Vec>, db: &PrismaClient, sync: &SyncManager, -) -> Result { - trace!("Creating {} new Objects", identified_files.len(),); +) -> Result<(Vec, u64), file_identifier::Error> { + // at least 1 file path per cas_id + let mut selected_file_paths = Vec::with_capacity(file_paths_by_cas_id.len()); + let mut cas_ids_by_file_path_id = HashMap::with_capacity(file_paths_by_cas_id.len()); - let (object_create_args, file_path_update_args) = identified_files - .iter() - .map( - |( - file_path_pub_id, - IdentifiedFile { - file_path: file_path_for_file_identifier::Data { date_created, .. }, - kind, - .. - }, - )| { - let object_pub_id = Uuid::new_v4(); + file_paths_by_cas_id.retain(|cas_id, file_paths| { + let file_path = file_paths.pop().expect("file_paths can't be empty"); + let has_more_file_paths = !file_paths.is_empty(); - let kind = *kind as i32; + if has_more_file_paths { + cas_ids_by_file_path_id.insert(file_path.id, cas_id.clone()); + } + selected_file_paths.push(file_path); - let (sync_params, db_params) = [ - ( - (object::date_created::NAME, msgpack!(date_created)), - object::date_created::set(*date_created), - ), - ( - (object::kind::NAME, msgpack!(kind)), - object::kind::set(Some(kind)), - ), - ] - .into_iter() - .unzip::<_, _, Vec<_>, Vec<_>>(); + has_more_file_paths + }); + let (mut file_paths_with_new_object, objects_by_cas_id) = + create_objects_and_update_file_paths(selected_file_paths, db, sync) + .await? 
+ .into_iter() + .map(|(file_path_id, object_pub_id)| { ( - ( - sync.shared_create( - prisma_sync::object::SyncId { - pub_id: uuid_to_bytes(object_pub_id), - }, - sync_params, - ), - object::create_unchecked(uuid_to_bytes(object_pub_id), db_params), - ), - connect_file_path_to_object(*file_path_pub_id, object_pub_id, sync, db), + file_path_id, + cas_ids_by_file_path_id + .remove(&file_path_id) + .map(|cas_id| (cas_id, object_pub_id)), ) - }, - ) - .unzip::<_, _, Vec<_>, Vec<_>>(); + }) + .unzip::<_, _, Vec<_>, Vec<_>>(); - // create new object records with assembled values - let total_created_files = sync - .write_ops(db, { - let (sync, db_params) = object_create_args - .into_iter() - .unzip::<_, _, Vec<_>, Vec<_>>(); + let more_file_paths_ids_with_new_object = assign_existing_objects_to_file_paths( + file_paths_by_cas_id, + &objects_by_cas_id.into_iter().flatten().collect(), + db, + sync, + ) + .await?; - ( - sync.into_iter().flatten().collect(), - db.object().create_many(db_params), - ) - }) - .await?; + // Sanity check + assert!( + file_paths_by_cas_id.is_empty(), + "We MUST have processed all pending `file_paths` by now" + ); - trace!("Created {total_created_files} new Objects"); + let linked_objects_count = more_file_paths_ids_with_new_object.len() as u64; - if total_created_files > 0 { - trace!("Updating file paths with created objects"); + file_paths_with_new_object.extend(more_file_paths_ids_with_new_object); - sync.write_ops( - db, - file_path_update_args - .into_iter() - .unzip::<_, _, Vec<_>, Vec<_>>(), - ) - .await?; - - trace!("Updated file paths with created objects"); - } - - #[allow(clippy::cast_sign_loss)] // SAFETY: We're sure the value is positive - Ok(total_created_files as u64) + Ok((file_paths_with_new_object, linked_objects_count)) } -impl SerializableTask for ObjectProcessorTask { +#[derive(Debug, Serialize, Deserialize)] +pub struct SaveState { + id: TaskId, + file_paths_by_cas_id: HashMap, Vec>, + stage: Stage, + output: Output, + with_priority: bool, +} + +impl SerializableTask for ObjectProcessor { type SerializeError = rmp_serde::encode::Error; type DeserializeError = rmp_serde::decode::Error; @@ -424,18 +379,18 @@ impl SerializableTask for ObjectProcessorTask { async fn serialize(self) -> Result, Self::SerializeError> { let Self { id, - identified_files, - output, + file_paths_by_cas_id, stage, + output, with_priority, .. 
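assign_objects_to_duplicated_orphans above pops one file path per cas_id, creates objects for those representatives via create_objects_and_update_file_paths, and then links the remaining duplicates to the objects it just created through assign_existing_objects_to_file_paths. The selection step in isolation, with plain integers standing in for file paths (illustrative only):

use std::collections::HashMap;

// Pop one representative per group; groups that still have members are kept
// so their remaining entries can be linked to the freshly created object.
fn split_representatives(groups: &mut HashMap<u32, Vec<i32>>) -> Vec<(u32, i32)> {
    let mut representatives = Vec::with_capacity(groups.len());
    groups.retain(|&group_key, members| {
        let representative = members.pop().expect("groups are never empty");
        representatives.push((group_key, representative));
        !members.is_empty()
    });
    representatives
}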
} = self; rmp_serde::to_vec_named(&SaveState { id, - identified_files, - output, + file_paths_by_cas_id, stage, + output, with_priority, }) } @@ -447,18 +402,18 @@ impl SerializableTask for ObjectProcessorTask { rmp_serde::from_slice(data).map( |SaveState { id, - identified_files, - output, + file_paths_by_cas_id, stage, + output, with_priority, }| Self { id, + with_priority, + file_paths_by_cas_id, + stage, + output, db, sync, - identified_files, - output, - stage, - with_priority, }, ) } diff --git a/core/crates/heavy-lifting/src/indexer/job.rs b/core/crates/heavy-lifting/src/indexer/job.rs index f2ad3f6e5..22546950e 100644 --- a/core/crates/heavy-lifting/src/indexer/job.rs +++ b/core/crates/heavy-lifting/src/indexer/job.rs @@ -2,14 +2,14 @@ use crate::{ indexer, job_system::{ job::{ - Job, JobName, JobReturn, JobTaskDispatcher, OuterContext, ProgressUpdate, ReturnStatus, + Job, JobContext, JobName, JobReturn, JobTaskDispatcher, ProgressUpdate, ReturnStatus, }, report::ReportOutputMetadata, utils::cancel_pending_tasks, - SerializableJob, SerializedTasks, + DispatcherError, JobErrorOrDispatcherError, SerializableJob, SerializedTasks, }, utils::sub_path::get_full_path_from_sub_path, - Error, LocationScanState, NonCriticalError, + Error, LocationScanState, NonCriticalError, OuterContext, }; use sd_core_file_path_helper::IsolatedFilePathData; @@ -21,10 +21,10 @@ use sd_task_system::{ AnyTaskOutput, IntoTask, SerializableTask, Task, TaskDispatcher, TaskHandle, TaskId, TaskOutput, TaskStatus, }; -use sd_utils::db::maybe_missing; +use sd_utils::{db::maybe_missing, u64_to_frontend}; use std::{ - collections::{HashMap, HashSet}, + collections::{HashMap, HashSet, VecDeque}, hash::{Hash, Hasher}, mem, path::PathBuf, @@ -38,33 +38,43 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use serde_json::json; use tokio::time::Instant; -use tracing::warn; +use tracing::{debug, instrument, trace, warn, Level}; use super::{ remove_non_existing_file_paths, reverse_update_directories_sizes, tasks::{ - saver::{SaveTask, SaveTaskOutput}, - updater::{UpdateTask, UpdateTaskOutput}, - walker::{WalkDirTask, WalkTaskOutput, WalkedEntry}, + self, saver, updater, + walker::{self, WalkedEntry}, }, update_directory_sizes, update_location_size, IsoFilePathFactory, WalkerDBProxy, BATCH_SIZE, }; #[derive(Debug)] pub struct Indexer { + // Received arguments location: location_with_indexer_rules::Data, sub_path: Option, - metadata: Metadata, + // Derived from received arguments iso_file_path_factory: IsoFilePathFactory, indexer_ruler: IndexerRuler, walker_root_path: Option>, + + // Inner state ancestors_needing_indexing: HashSet, ancestors_already_indexed: HashSet>, iso_paths_and_sizes: HashMap, u64>, + // Optimizations + processing_first_directory: bool, + to_create_buffer: VecDeque, + to_update_buffer: VecDeque, + + // Run data + metadata: Metadata, errors: Vec, + // On shutdown data pending_tasks_on_resume: Vec>, tasks_for_shutdown: Vec>>, } @@ -72,15 +82,15 @@ pub struct Indexer { impl Job for Indexer { const NAME: JobName = JobName::Indexer; - async fn resume_tasks( + async fn resume_tasks( &mut self, dispatcher: &JobTaskDispatcher, - ctx: &impl OuterContext, + ctx: &impl JobContext, SerializedTasks(serialized_tasks): SerializedTasks, ) -> Result<(), Error> { let location_id = self.location.id; - self.pending_tasks_on_resume = dispatcher + if let Ok(tasks) = dispatcher .dispatch_many_boxed( rmp_serde::from_slice::)>>(&serialized_tasks) .map_err(indexer::Error::from)? 
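Both task types now persist a plain SaveState that deliberately excludes the db and sync handles, and SerializableTask::deserialize receives those handles back through the (db, sync) deserialize context. The same shape with a dummy dependency, using rmp_serde directly (a sketch of the pattern, not the real trait):

use serde::{Deserialize, Serialize};
use std::sync::Arc;

// Hypothetical runtime dependency that must not be serialized.
struct Db;

struct Worker {
    progress: u64,
    db: Arc<Db>,
}

#[derive(Serialize, Deserialize)]
struct SaveState {
    progress: u64,
}

impl Worker {
    fn save(&self) -> Result<Vec<u8>, rmp_serde::encode::Error> {
        rmp_serde::to_vec_named(&SaveState { progress: self.progress })
    }

    // The deserialize context re-injects the live handles, like (db, sync) above.
    fn restore(bytes: &[u8], db: Arc<Db>) -> Result<Self, rmp_serde::decode::Error> {
        rmp_serde::from_slice::<SaveState>(bytes)
            .map(|SaveState { progress }| Self { progress, db })
    }
}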
@@ -90,7 +100,7 @@ impl Job for Indexer { let iso_file_path_factory = self.iso_file_path_factory.clone(); async move { match task_kind { - TaskKind::Walk => WalkDirTask::deserialize( + TaskKind::Walk => tasks::Walker::deserialize( &task_bytes, ( indexer_ruler.clone(), @@ -99,19 +109,18 @@ impl Job for Indexer { db: Arc::clone(ctx.db()), }, iso_file_path_factory.clone(), - dispatcher.clone(), ), ) .await .map(IntoTask::into_task), - TaskKind::Save => SaveTask::deserialize( + TaskKind::Save => tasks::Saver::deserialize( &task_bytes, (Arc::clone(ctx.db()), Arc::clone(ctx.sync())), ) .await .map(IntoTask::into_task), - TaskKind::Update => UpdateTask::deserialize( + TaskKind::Update => tasks::Updater::deserialize( &task_bytes, (Arc::clone(ctx.db()), Arc::clone(ctx.sync())), ) @@ -125,20 +134,61 @@ impl Job for Indexer { .await .map_err(indexer::Error::from)?, ) - .await; + .await + { + self.pending_tasks_on_resume = tasks; + } else { + warn!("Failed to dispatch tasks to resume as job was already canceled"); + } Ok(()) } - async fn run( + #[instrument( + skip_all, + fields( + location_id = self.location.id, + location_path = ?self.location.path, + sub_path = ?self.sub_path.as_ref().map(|path| path.display()), + ), + ret(level = Level::TRACE), + err, + )] + async fn run( mut self, dispatcher: JobTaskDispatcher, - ctx: Ctx, + ctx: impl JobContext, ) -> Result { let mut pending_running_tasks = FuturesUnordered::new(); - self.init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) - .await?; + match self + .init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) + .await + { + Ok(()) => { /* Everything is awesome! */ } + Err(JobErrorOrDispatcherError::JobError(e)) => { + return Err(e.into()); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(_))) => { + return Ok(self.cancel_job(&mut pending_running_tasks).await); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown(tasks))) => { + self.tasks_for_shutdown.extend(tasks); + + if pending_running_tasks.is_empty() { + assert_eq!( + self.tasks_for_shutdown.len() as u64, + self.metadata.total_tasks - self.metadata.completed_tasks, + "Shutting down a job without collecting all pending tasks" + ); + // If no task managed to be dispatched, we can just shutdown + // otherwise we have to process handles below and wait for them to be shutdown too + return Ok(ReturnStatus::Shutdown( + SerializableJob::::serialize(self).await, + )); + } + } + } if let Some(res) = self .process_handles(&mut pending_running_tasks, &ctx, &dispatcher) @@ -147,47 +197,29 @@ impl Job for Indexer { return res; } - if !self.tasks_for_shutdown.is_empty() { - return Ok(ReturnStatus::Shutdown( - SerializableJob::::serialize(self).await, - )); + if let Some(res) = self + .dispatch_last_save_and_update_tasks(&mut pending_running_tasks, &ctx, &dispatcher) + .await + { + return res; } - if !self.ancestors_needing_indexing.is_empty() { - let save_tasks = self - .ancestors_needing_indexing - .drain() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| { - let chunked_saves = chunk.collect::>(); - self.metadata.total_paths += chunked_saves.len() as u64; - self.metadata.total_save_steps += 1; + if let Some(res) = self + .index_pending_ancestors(&mut pending_running_tasks, &ctx, &dispatcher) + .await + { + return res; + } - SaveTask::new_deep( - self.location.id, - self.location.pub_id.clone(), - chunked_saves, - Arc::clone(ctx.db()), - Arc::clone(ctx.sync()), - ) - }) - .collect::>(); - - 
pending_running_tasks.extend(dispatcher.dispatch_many(save_tasks).await); - - if let Some(res) = self - .process_handles(&mut pending_running_tasks, &ctx, &dispatcher) - .await - { - return res; - } - - if !self.tasks_for_shutdown.is_empty() { - return Ok(ReturnStatus::Shutdown( - SerializableJob::::serialize(self).await, - )); - } + if !self.tasks_for_shutdown.is_empty() { + assert_eq!( + self.tasks_for_shutdown.len() as u64, + self.metadata.total_tasks - self.metadata.completed_tasks, + "Shutting down a job without collecting all pending tasks" + ); + return Ok(ReturnStatus::Shutdown( + SerializableJob::::serialize(self).await, + )); } // From here onward, job will not be interrupted anymore @@ -223,7 +255,7 @@ impl Job for Indexer { update_location_size(location.id, ctx.db(), &ctx).await?; - metadata.db_write_time += start_size_update_time.elapsed(); + metadata.mean_db_write_time += start_size_update_time.elapsed(); } if metadata.removed_count > 0 { @@ -283,6 +315,12 @@ impl Indexer { location, sub_path, metadata: Metadata::default(), + + processing_first_directory: true, + + to_create_buffer: VecDeque::new(), + to_update_buffer: VecDeque::new(), + errors: Vec::new(), pending_tasks_on_resume: Vec::new(), @@ -295,43 +333,41 @@ impl Indexer { /// # Panics /// Will panic if another task type is added in the job, but this function wasn't updated to handle it /// - async fn process_task_output( + async fn process_task_output( &mut self, task_id: TaskId, any_task_output: Box, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Result>, indexer::Error> { + ) -> Result>, JobErrorOrDispatcherError> { self.metadata.completed_tasks += 1; - ctx.progress(vec![ProgressUpdate::CompletedTaskCount( - self.metadata.completed_tasks, - )]); - - if any_task_output.is::() { + if any_task_output.is::>() { return self .process_walk_output( *any_task_output - .downcast::() + .downcast::>() .expect("just checked"), ctx, dispatcher, ) .await; - } else if any_task_output.is::() { + } else if any_task_output.is::() { self.process_save_output( *any_task_output - .downcast::() + .downcast::() .expect("just checked"), ctx, - ); - } else if any_task_output.is::() { + ) + .await; + } else if any_task_output.is::() { self.process_update_output( *any_task_output - .downcast::() + .downcast::() .expect("just checked"), ctx, - ); + ) + .await; } else { unreachable!("Unexpected task output type: "); } @@ -339,9 +375,22 @@ impl Indexer { Ok(Vec::new()) } - async fn process_walk_output( + #[instrument( + skip_all, + fields( + to_create_count = to_create.len(), + to_update_count = to_update.len(), + to_remove_count = to_remove.len(), + accepted_ancestors_count = accepted_ancestors.len(), + directory_iso_file_path = %directory_iso_file_path.as_ref().display(), + more_walker_tasks_count = keep_walking_tasks.len(), + %total_size, + ?scan_time, + ) + )] + async fn process_walk_output( &mut self, - WalkTaskOutput { + walker::Output { to_create, to_update, to_remove, @@ -349,13 +398,19 @@ impl Indexer { errors, directory_iso_file_path, total_size, - mut handles, + keep_walking_tasks, scan_time, - }: WalkTaskOutput, - ctx: &impl OuterContext, + .. 
+ }: walker::Output, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Result>, indexer::Error> { - self.metadata.scan_read_time += scan_time; + ) -> Result>, JobErrorOrDispatcherError> { + self.metadata.mean_scan_read_time += scan_time; + #[allow(clippy::cast_possible_truncation)] + // SAFETY: we know that `keep_walking_tasks.len()` is a valid u32 as we wouldn't dispatch more than `u32::MAX` tasks + { + self.metadata.total_walk_tasks += keep_walking_tasks.len() as u32; + } let (to_create_count, to_update_count) = (to_create.len(), to_update.len()); @@ -398,138 +453,148 @@ impl Indexer { .map(|WalkedEntry { iso_file_path, .. }| iso_file_path.clone()), ); - self.errors.extend(errors); + if !errors.is_empty() { + warn!(?errors, "Non critical errors while indexing;"); + self.errors.extend(errors); + } - let db_delete_time = Instant::now(); - self.metadata.removed_count += - remove_non_existing_file_paths(to_remove, ctx.db(), ctx.sync()).await?; - self.metadata.db_write_time += db_delete_time.elapsed(); - - let save_tasks = to_create - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| { - let chunked_saves = chunk.collect::>(); - self.metadata.total_paths += chunked_saves.len() as u64; - self.metadata.total_save_steps += 1; - - SaveTask::new_deep( - self.location.id, - self.location.pub_id.clone(), - chunked_saves, - Arc::clone(ctx.db()), - Arc::clone(ctx.sync()), - ) - }) - .collect::>(); - - let update_tasks = to_update - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| { - let chunked_updates = chunk.collect::>(); - self.metadata.total_updated_paths += chunked_updates.len() as u64; - self.metadata.total_update_steps += 1; - - UpdateTask::new_deep( - chunked_updates, - Arc::clone(ctx.db()), - Arc::clone(ctx.sync()), - ) - }) - .collect::>(); - - handles.extend(dispatcher.dispatch_many(save_tasks).await); - handles.extend(dispatcher.dispatch_many(update_tasks).await); - - self.metadata.total_tasks += handles.len() as u64; + if !to_remove.is_empty() { + let db_delete_time = Instant::now(); + self.metadata.removed_count += + remove_non_existing_file_paths(to_remove, ctx.db(), ctx.sync()).await?; + self.metadata.mean_db_write_time += db_delete_time.elapsed(); + } + let (save_tasks, update_tasks) = + self.prepare_save_and_update_tasks(to_create, to_update, ctx); ctx.progress(vec![ - ProgressUpdate::TaskCount(handles.len() as u64), + ProgressUpdate::TaskCount(self.metadata.total_tasks), + ProgressUpdate::CompletedTaskCount(self.metadata.completed_tasks), ProgressUpdate::message(format!( "Found {to_create_count} new files and {to_update_count} to update" )), - ]); + ]) + .await; - Ok(handles) + self.metadata.total_tasks += + (keep_walking_tasks.len() + save_tasks.len() + update_tasks.len()) as u64; + + debug!( + "Dispatching more ({}W/{}S/{}U) tasks, completed ({}/{});", + keep_walking_tasks.len(), + save_tasks.len(), + update_tasks.len(), + self.metadata.completed_tasks, + self.metadata.total_tasks + ); + + dispatcher + .dispatch_many_boxed( + keep_walking_tasks + .into_iter() + .map(IntoTask::into_task) + .chain(save_tasks.into_iter().map(IntoTask::into_task)) + .chain(update_tasks.into_iter().map(IntoTask::into_task)), + ) + .await + .map_err(Into::into) } - fn process_save_output( + #[instrument(skip(self, ctx))] + async fn process_save_output( &mut self, - SaveTaskOutput { + saver::Output { saved_count, save_duration, - }: SaveTaskOutput, - ctx: &impl OuterContext, + }: saver::Output, + ctx: &impl JobContext, ) { self.metadata.indexed_count += 
saved_count; - self.metadata.db_write_time += save_duration; + self.metadata.mean_db_write_time += save_duration; - ctx.progress_msg(format!("Saved {saved_count} files")); + ctx.progress(vec![ + ProgressUpdate::CompletedTaskCount(self.metadata.completed_tasks), + ProgressUpdate::message(format!("Saved {} files", self.metadata.indexed_count)), + ]) + .await; + + debug!( + "Processed save task in the indexer ({}/{});", + self.metadata.completed_tasks, self.metadata.total_tasks + ); } - fn process_update_output( + #[instrument(skip(self, ctx))] + async fn process_update_output( &mut self, - UpdateTaskOutput { + updater::Output { updated_count, update_duration, - }: UpdateTaskOutput, - ctx: &impl OuterContext, + }: updater::Output, + ctx: &impl JobContext, ) { self.metadata.updated_count += updated_count; - self.metadata.db_write_time += update_duration; + self.metadata.mean_db_write_time += update_duration; - ctx.progress_msg(format!("Updated {updated_count} files")); + ctx.progress(vec![ + ProgressUpdate::CompletedTaskCount(self.metadata.completed_tasks), + ProgressUpdate::message(format!("Updated {} files", self.metadata.updated_count)), + ]) + .await; + + debug!( + "Processed update task in the indexer ({}/{});", + self.metadata.completed_tasks, self.metadata.total_tasks + ); } - async fn process_handles( + async fn process_handles( &mut self, pending_running_tasks: &mut FuturesUnordered>, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, ) -> Option> { while let Some(task) = pending_running_tasks.next().await { match task { Ok(TaskStatus::Done((task_id, TaskOutput::Out(out)))) => { - let more_handles = match self + match self .process_task_output(task_id, out, ctx, dispatcher) .await { - Ok(more_handles) => more_handles, - Err(e) => { - cancel_pending_tasks(&*pending_running_tasks).await; + Ok(more_handles) => pending_running_tasks.extend(more_handles), + Err(JobErrorOrDispatcherError::JobError(e)) => { + cancel_pending_tasks(pending_running_tasks).await; return Some(Err(e.into())); } - }; + Err(JobErrorOrDispatcherError::Dispatcher( + DispatcherError::JobCanceled(_), + )) => return Some(Ok(self.cancel_job(pending_running_tasks).await)), - pending_running_tasks.extend(more_handles); + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown( + tasks, + ))) => self.tasks_for_shutdown.extend(tasks), + }; } Ok(TaskStatus::Done((task_id, TaskOutput::Empty))) => { - warn!("Task returned an empty output"); + warn!(%task_id, "Task returned an empty output"); } - Ok(TaskStatus::Shutdown(task)) => { - self.tasks_for_shutdown.push(task); - } + Ok(TaskStatus::Shutdown(task)) => self.tasks_for_shutdown.push(task), Ok(TaskStatus::Error(e)) => { - cancel_pending_tasks(&*pending_running_tasks).await; + cancel_pending_tasks(pending_running_tasks).await; return Some(Err(e)); } Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion) => { - cancel_pending_tasks(&*pending_running_tasks).await; - - return Some(Ok(ReturnStatus::Canceled)); + return Some(Ok(self.cancel_job(pending_running_tasks).await)); } Err(e) => { - cancel_pending_tasks(&*pending_running_tasks).await; + cancel_pending_tasks(pending_running_tasks).await; return Some(Err(e.into())); } @@ -539,18 +604,18 @@ impl Indexer { None } - async fn init_or_resume( + async fn init_or_resume( &mut self, pending_running_tasks: &mut FuturesUnordered>, - ctx: &impl OuterContext, + ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Result<(), indexer::Error> { + ) -> Result<(), JobErrorOrDispatcherError> 
{ // if we don't have any pending task, then this is a fresh job - if self.pending_tasks_on_resume.is_empty() { + let updates = if self.pending_tasks_on_resume.is_empty() { let walker_root_path = Arc::new( - get_full_path_from_sub_path( + get_full_path_from_sub_path::( self.location.id, - &self.sub_path, + self.sub_path.as_ref(), &*self.iso_file_path_factory.location_path, ctx.db(), ) @@ -559,7 +624,7 @@ impl Indexer { pending_running_tasks.push( dispatcher - .dispatch(WalkDirTask::new_deep( + .dispatch(tasks::Walker::new_deep( walker_root_path.as_ref(), Arc::clone(&walker_root_path), self.indexer_ruler.clone(), @@ -568,52 +633,334 @@ impl Indexer { location_id: self.location.id, db: Arc::clone(ctx.db()), }, - dispatcher.clone(), )?) - .await, + .await?, ); + self.metadata.total_tasks = 1; + self.metadata.total_walk_tasks = 1; + + let updates = vec![ + ProgressUpdate::TaskCount(self.metadata.total_tasks), + ProgressUpdate::Message(format!("Indexing {}", walker_root_path.display())), + ]; + self.walker_root_path = Some(walker_root_path); + + updates } else { pending_running_tasks.extend(mem::take(&mut self.pending_tasks_on_resume)); - } + + vec![ + ProgressUpdate::TaskCount(self.metadata.total_tasks), + ProgressUpdate::Message("Resuming tasks".to_string()), + ] + }; + + ctx.progress(updates).await; Ok(()) } + + async fn dispatch_last_save_and_update_tasks( + &mut self, + pending_running_tasks: &mut FuturesUnordered>, + ctx: &impl JobContext, + dispatcher: &JobTaskDispatcher, + ) -> Option> { + if !self.to_create_buffer.is_empty() || !self.to_update_buffer.is_empty() { + let mut tasks = Vec::with_capacity(2); + + if !self.to_create_buffer.is_empty() { + assert!( + self.to_create_buffer.len() <= BATCH_SIZE, + "last save task must be less than BATCH_SIZE paths" + ); + + self.metadata.total_tasks += 1; + self.metadata.total_paths += self.to_create_buffer.len() as u64; + self.metadata.total_save_tasks += 1; + + tasks.push( + tasks::Saver::new_deep( + self.location.id, + self.location.pub_id.clone(), + self.to_create_buffer.drain(..).collect(), + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + ) + .into_task(), + ); + } + + if !self.to_update_buffer.is_empty() { + assert!( + self.to_update_buffer.len() <= BATCH_SIZE, + "last update task must be less than BATCH_SIZE paths" + ); + + self.metadata.total_tasks += 1; + self.metadata.total_updated_paths += self.to_update_buffer.len() as u64; + self.metadata.total_update_tasks += 1; + + tasks.push( + tasks::Updater::new_deep( + self.to_update_buffer.drain(..).collect(), + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + ) + .into_task(), + ); + } + + ctx.progress(vec![ProgressUpdate::TaskCount(self.metadata.total_tasks)]) + .await; + + match dispatcher.dispatch_many_boxed(tasks).await { + Ok(task_handles) => pending_running_tasks.extend(task_handles), + Err(DispatcherError::JobCanceled(_)) => { + return Some(Ok(self.cancel_job(pending_running_tasks).await)); + } + Err(DispatcherError::Shutdown(tasks)) => { + self.tasks_for_shutdown.extend(tasks); + } + } + + self.process_handles(pending_running_tasks, ctx, dispatcher) + .await + } else { + None + } + } + + async fn index_pending_ancestors( + &mut self, + pending_running_tasks: &mut FuturesUnordered>, + ctx: &impl JobContext, + dispatcher: &JobTaskDispatcher, + ) -> Option> { + if self.ancestors_needing_indexing.is_empty() { + return None; + } + + let save_tasks = self + .ancestors_needing_indexing + .drain() + .chunks(BATCH_SIZE) + .into_iter() + .map(|chunk| { + let chunked_saves = 
chunk.collect::>(); + + self.metadata.total_paths += chunked_saves.len() as u64; + self.metadata.total_save_tasks += 1; + + tasks::Saver::new_deep( + self.location.id, + self.location.pub_id.clone(), + chunked_saves, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + ) + }) + .collect::>(); + + self.metadata.total_tasks += save_tasks.len() as u64; + + match dispatcher.dispatch_many(save_tasks).await { + Ok(task_handles) => pending_running_tasks.extend(task_handles), + Err(DispatcherError::JobCanceled(_)) => { + return Some(Ok(self.cancel_job(pending_running_tasks).await)); + } + Err(DispatcherError::Shutdown(tasks)) => { + self.tasks_for_shutdown.extend(tasks); + } + } + + self.process_handles(pending_running_tasks, ctx, dispatcher) + .await + } + + fn prepare_save_and_update_tasks( + &mut self, + to_create: Vec, + to_update: Vec, + ctx: &impl JobContext, + ) -> (Vec, Vec) { + if self.processing_first_directory { + // If we are processing the first directory, we dispatch shallow tasks with higher priority + // this way we provide a faster feedback loop to the user + self.processing_first_directory = false; + + let save_tasks = to_create + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(|chunk| { + let chunked_saves = chunk.collect::>(); + + self.metadata.total_paths += chunked_saves.len() as u64; + self.metadata.total_save_tasks += 1; + + tasks::Saver::new_shallow( + self.location.id, + self.location.pub_id.clone(), + chunked_saves, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + ) + }) + .collect::>(); + + let update_tasks = to_update + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(|chunk| { + let chunked_updates = chunk.collect::>(); + + self.metadata.total_updated_paths += chunked_updates.len() as u64; + self.metadata.total_update_tasks += 1; + + tasks::Updater::new_shallow( + chunked_updates, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + ) + }) + .collect::>(); + + (save_tasks, update_tasks) + } else { + self.to_create_buffer.extend(to_create); + + let save_tasks = if self.to_create_buffer.len() > BATCH_SIZE { + let chunks_count = self.to_create_buffer.len() / BATCH_SIZE; + let mut save_tasks = Vec::with_capacity(chunks_count); + + for _ in 0..chunks_count { + let chunked_saves = self + .to_create_buffer + .drain(..BATCH_SIZE) + .collect::>(); + + self.metadata.total_paths += chunked_saves.len() as u64; + self.metadata.total_save_tasks += 1; + + save_tasks.push(tasks::Saver::new_deep( + self.location.id, + self.location.pub_id.clone(), + chunked_saves, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + )); + } + save_tasks + } else { + trace!("Not enough entries to dispatch a new saver task;"); + vec![] + }; + + self.to_update_buffer.extend(to_update); + + let update_tasks = if self.to_update_buffer.len() > BATCH_SIZE { + let chunks_count = self.to_update_buffer.len() / BATCH_SIZE; + let mut update_tasks = Vec::with_capacity(chunks_count); + + for _ in 0..chunks_count { + let chunked_updates = self + .to_update_buffer + .drain(..BATCH_SIZE) + .collect::>(); + + self.metadata.total_updated_paths += chunked_updates.len() as u64; + self.metadata.total_update_tasks += 1; + + update_tasks.push(tasks::Updater::new_deep( + chunked_updates, + Arc::clone(ctx.db()), + Arc::clone(ctx.sync()), + )); + } + update_tasks + } else { + trace!("Not enough entries to dispatch a new updater task;"); + vec![] + }; + + (save_tasks, update_tasks) + } + } + + async fn cancel_job( + &mut self, + pending_running_tasks: &mut FuturesUnordered>, + ) -> ReturnStatus { + 
cancel_pending_tasks(pending_running_tasks).await; + + ReturnStatus::Canceled( + JobReturn::builder() + .with_metadata(mem::take(&mut self.metadata)) + .with_non_critical_errors(mem::take(&mut self.errors)) + .build(), + ) + } } #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct Metadata { - db_write_time: Duration, - scan_read_time: Duration, + mean_db_write_time: Duration, + mean_scan_read_time: Duration, total_tasks: u64, completed_tasks: u64, total_paths: u64, total_updated_paths: u64, - total_save_steps: u64, - total_update_steps: u64, + total_walk_tasks: u32, + total_save_tasks: u32, + total_update_tasks: u32, indexed_count: u64, updated_count: u64, removed_count: u64, } -impl From for ReportOutputMetadata { - fn from(value: Metadata) -> Self { - Self::Metrics(HashMap::from([ - ("db_write_time".into(), json!(value.db_write_time)), - ("scan_read_time".into(), json!(value.scan_read_time)), - ("total_tasks".into(), json!(value.total_tasks)), - ("total_paths".into(), json!(value.total_paths)), - ( - "total_updated_paths".into(), - json!(value.total_updated_paths), - ), - ("total_save_steps".into(), json!(value.total_save_steps)), - ("total_update_steps".into(), json!(value.total_update_steps)), - ("indexed_count".into(), json!(value.indexed_count)), - ("updated_count".into(), json!(value.updated_count)), - ("removed_count".into(), json!(value.removed_count)), - ])) +impl From for Vec { + fn from( + Metadata { + mut mean_db_write_time, + mut mean_scan_read_time, + total_tasks, + completed_tasks, + total_paths, + total_updated_paths, + total_walk_tasks, + total_save_tasks, + total_update_tasks, + indexed_count, + updated_count, + removed_count, + }: Metadata, + ) -> Self { + mean_scan_read_time /= u32::max(total_walk_tasks, 1); // To avoid division by zero + mean_db_write_time /= total_save_tasks + total_update_tasks + 1; // +1 to update directories sizes + + vec![ + ReportOutputMetadata::Indexer { + total_paths: u64_to_frontend(total_paths), + }, + ReportOutputMetadata::Metrics(HashMap::from([ + ("mean_scan_read_time".into(), json!(mean_scan_read_time)), + ("mean_db_write_time".into(), json!(mean_db_write_time)), + ("total_tasks".into(), json!(total_tasks)), + ("completed_tasks".into(), json!(completed_tasks)), + ("total_paths".into(), json!(total_paths)), + ("total_updated_paths".into(), json!(total_updated_paths)), + ("total_walk_tasks".into(), json!(total_walk_tasks)), + ("total_save_tasks".into(), json!(total_save_tasks)), + ("total_update_tasks".into(), json!(total_update_tasks)), + ("indexed_count".into(), json!(indexed_count)), + ("updated_count".into(), json!(updated_count)), + ("removed_count".into(), json!(removed_count)), + ])), + ] } } @@ -628,21 +975,26 @@ enum TaskKind { struct SaveState { location: location_with_indexer_rules::Data, sub_path: Option, - metadata: Metadata, iso_file_path_factory: IsoFilePathFactory, - indexer_ruler_bytes: Vec, + indexer_ruler: IndexerRuler, walker_root_path: Option>, + ancestors_needing_indexing: HashSet, ancestors_already_indexed: HashSet>, - paths_and_sizes: HashMap, u64>, + iso_paths_and_sizes: HashMap, u64>, + processing_first_directory: bool, + to_create_buffer: VecDeque, + to_update_buffer: VecDeque, + + metadata: Metadata, errors: Vec, tasks_for_shutdown_bytes: Option, } -impl SerializableJob for Indexer { +impl SerializableJob for Indexer { async fn serialize(self) -> Result>, rmp_serde::encode::Error> { let Self { location, @@ -653,81 +1005,90 @@ impl SerializableJob for Indexer { walker_root_path, 
ancestors_needing_indexing, ancestors_already_indexed, - iso_paths_and_sizes: paths_and_sizes, + iso_paths_and_sizes, + processing_first_directory, + to_create_buffer, + to_update_buffer, errors, tasks_for_shutdown, .. } = self; + let serialized_tasks = tasks_for_shutdown + .into_iter() + .map(|task| async move { + if task.is::>() { + task.downcast::>() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::Walk, bytes)) + } else if task.is::() { + task.downcast::() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::Save, bytes)) + } else if task.is::() { + task.downcast::() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::Update, bytes)) + } else { + unreachable!("Unexpected task type") + } + }) + .collect::>() + .try_join() + .await?; + + let tasks_for_shutdown_bytes = if serialized_tasks.is_empty() { + None + } else { + Some(SerializedTasks(rmp_serde::to_vec_named(&serialized_tasks)?)) + }; + rmp_serde::to_vec_named(&SaveState { location, sub_path, - metadata, iso_file_path_factory, - indexer_ruler_bytes: indexer_ruler.serialize().await?, + indexer_ruler, walker_root_path, ancestors_needing_indexing, ancestors_already_indexed, - paths_and_sizes, - tasks_for_shutdown_bytes: Some(SerializedTasks(rmp_serde::to_vec_named( - &tasks_for_shutdown - .into_iter() - .map(|task| async move { - if task - .is::>( - ) { - task - .downcast::>( - ) - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::Walk, bytes)) - } else if task.is::() { - task.downcast::() - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::Save, bytes)) - } else if task.is::() { - task.downcast::() - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::Update, bytes)) - } else { - unreachable!("Unexpected task type") - } - }) - .collect::>() - .try_join() - .await?, - )?)), + iso_paths_and_sizes, + processing_first_directory, + to_create_buffer, + to_update_buffer, + metadata, errors, + tasks_for_shutdown_bytes, }) .map(Some) } async fn deserialize( serialized_job: &[u8], - _: &Ctx, + _: &OuterCtx, ) -> Result)>, rmp_serde::decode::Error> { let SaveState { location, sub_path, - metadata, iso_file_path_factory, - indexer_ruler_bytes, + indexer_ruler, walker_root_path, ancestors_needing_indexing, ancestors_already_indexed, - paths_and_sizes, + iso_paths_and_sizes, + processing_first_directory, + to_create_buffer, + to_update_buffer, + metadata, errors, tasks_for_shutdown_bytes, } = rmp_serde::from_slice::(serialized_job)?; - let indexer_ruler = IndexerRuler::deserialize(&indexer_ruler_bytes)?; - Ok(Some(( Self { location, @@ -738,7 +1099,10 @@ impl SerializableJob for Indexer { walker_root_path, ancestors_needing_indexing, ancestors_already_indexed, - iso_paths_and_sizes: paths_and_sizes, + iso_paths_and_sizes, + processing_first_directory, + to_create_buffer, + to_update_buffer, errors, pending_tasks_on_resume: Vec::new(), tasks_for_shutdown: Vec::new(), diff --git a/core/crates/heavy-lifting/src/indexer/mod.rs b/core/crates/heavy-lifting/src/indexer/mod.rs index 78b9d3827..3d7ad5eda 100644 --- a/core/crates/heavy-lifting/src/indexer/mod.rs +++ b/core/crates/heavy-lifting/src/indexer/mod.rs @@ -1,7 +1,6 @@ use crate::{utils::sub_path, OuterContext}; use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData}; -use sd_core_indexer_rules::IndexerRuleError; use sd_core_prisma_helpers::{ file_path_pub_and_cas_ids, file_path_to_isolate_with_pub_id, file_path_walker, }; @@ -27,11 +26,11 @@ 
use std::{ }; use itertools::Itertools; -use prisma_client_rust::{operator::or, Select}; +use prisma_client_rust::{operator::or, QueryError, Select}; use rspc::ErrorCode; use serde::{Deserialize, Serialize}; use specta::Type; -use tracing::warn; +use tracing::{instrument, warn}; pub mod job; mod shallow; @@ -53,8 +52,8 @@ pub enum Error { SubPath(#[from] sub_path::Error), // Internal Errors - #[error("database Error: {0}")] - Database(#[from] prisma_client_rust::QueryError), + #[error("database error: {0}")] + Database(#[from] QueryError), #[error(transparent)] FileIO(#[from] FileIOError), #[error(transparent)] @@ -68,27 +67,28 @@ pub enum Error { // Mixed errors #[error(transparent)] - Rules(#[from] IndexerRuleError), + Rules(#[from] sd_core_indexer_rules::Error), } impl From for rspc::Error { - fn from(err: Error) -> Self { - match err { + fn from(e: Error) -> Self { + match e { Error::IndexerRuleNotFound(_) => { - Self::with_cause(ErrorCode::NotFound, err.to_string(), err) + Self::with_cause(ErrorCode::NotFound, e.to_string(), e) } Error::SubPath(sub_path_err) => sub_path_err.into(), Error::Rules(rule_err) => rule_err.into(), - _ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => Self::with_cause(ErrorCode::InternalServerError, e.to_string(), e), } } } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] -pub enum NonCriticalError { +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NonCriticalIndexerError { #[error("failed to read directory entry: {0}")] FailedDirectoryEntry(String), #[error("failed to fetch metadata: {0}")] @@ -153,10 +153,12 @@ async fn update_directory_sizes( file_path::size_in_bytes_bytes::NAME, msgpack!(size_bytes), ), - db.file_path().update( - file_path::pub_id::equals(file_path.pub_id), - vec![file_path::size_in_bytes_bytes::set(Some(size_bytes))], - ), + db.file_path() + .update( + file_path::pub_id::equals(file_path.pub_id), + vec![file_path::size_in_bytes_bytes::set(Some(size_bytes))], + ) + .select(file_path::select!({ id })), )) }) .collect::, Error>>()? 
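One of the hunks above (in core/crates/heavy-lifting/src/indexer/mod.rs) renames the indexer's `NonCriticalError` to `NonCriticalIndexerError` and adds `Clone` plus `#[serde(rename_all = "snake_case")]`, so serialized variant names come out in snake_case instead of PascalCase. A minimal sketch of that effect, assuming only `thiserror`, `serde`, and `serde_json` as illustrative dependencies and a trimmed-down enum (the real type also derives specta's `Type` and has more variants):

    use serde::{Deserialize, Serialize};
    use thiserror::Error;

    // Trimmed-down stand-in for the indexer's non-critical error enum.
    #[derive(Error, Debug, Clone, Serialize, Deserialize, PartialEq)]
    #[serde(rename_all = "snake_case")]
    enum NonCriticalIndexerError {
        #[error("failed to read directory entry: {0}")]
        FailedDirectoryEntry(String),
        #[error("failed to fetch metadata: {0}")]
        Metadata(String),
    }

    fn main() {
        let e = NonCriticalIndexerError::FailedDirectoryEntry("permission denied".into());

        // `thiserror` drives the Display output used in logs and error chains.
        assert_eq!(e.to_string(), "failed to read directory entry: permission denied");

        // `rename_all = "snake_case"` controls the externally tagged variant key on the wire.
        let json = serde_json::to_string(&e).unwrap();
        assert_eq!(json, r#"{"failed_directory_entry":"permission denied"}"#);

        // The same rename applies on deserialization, so values round-trip.
        let back: NonCriticalIndexerError = serde_json::from_str(&json).unwrap();
        assert_eq!(back, e);
    }

The added `Clone` derive is what lets these non-critical errors be buffered in job state and reported more than once without moving them out of the job's error list.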
@@ -240,8 +242,16 @@ async fn remove_non_existing_file_paths( .map_err(Into::into) } +#[instrument( + skip(base_path, location_path, db, sync, errors), + fields( + base_path = %base_path.as_ref().display(), + location_path = %location_path.as_ref().display(), + ), + err, +)] #[allow(clippy::missing_panics_doc)] // Can't actually panic as we only deal with directories -async fn reverse_update_directories_sizes( +pub async fn reverse_update_directories_sizes( base_path: impl AsRef + Send, location_id: location::id::Type, location_path: impl AsRef + Send, @@ -278,7 +288,7 @@ async fn reverse_update_directories_sizes( IsolatedFilePathData::try_from(file_path) .map_err(|e| { errors.push( - NonCriticalError::MissingFilePathData(format!( + NonCriticalIndexerError::MissingFilePathData(format!( "Found a file_path missing data: , error: {e:#?}", from_bytes_to_uuid(&pub_id) )) @@ -328,7 +338,7 @@ async fn reverse_update_directories_sizes( ), )) } else { - warn!("Got a missing ancestor for a file_path in the database, maybe we have a corruption"); + warn!("Got a missing ancestor for a file_path in the database, ignoring..."); None } }) @@ -345,8 +355,9 @@ async fn compute_sizes( pub_id_by_ancestor_materialized_path: &mut HashMap, db: &PrismaClient, errors: &mut Vec, -) -> Result<(), Error> { - db.file_path() +) -> Result<(), QueryError> { + for file_path in db + .file_path() .find_many(vec![ file_path::location_id::equals(Some(location_id)), file_path::materialized_path::in_vec(materialized_paths), @@ -354,30 +365,29 @@ async fn compute_sizes( .select(file_path::select!({ pub_id materialized_path size_in_bytes_bytes })) .exec() .await? - .into_iter() - .for_each(|file_path| { - if let Some(materialized_path) = file_path.materialized_path { - if let Some((_, size)) = - pub_id_by_ancestor_materialized_path.get_mut(&materialized_path) - { - *size += file_path.size_in_bytes_bytes.map_or_else( - || { - warn!("Got a directory missing its size in bytes"); - 0 - }, - |size_in_bytes_bytes| size_in_bytes_from_db(&size_in_bytes_bytes), - ); - } - } else { - errors.push( - NonCriticalError::MissingFilePathData(format!( + { + if let Some(materialized_path) = file_path.materialized_path { + if let Some((_, size)) = + pub_id_by_ancestor_materialized_path.get_mut(&materialized_path) + { + *size += file_path.size_in_bytes_bytes.map_or_else( + || { + warn!("Got a directory missing its size in bytes"); + 0 + }, + |size_in_bytes_bytes| size_in_bytes_from_db(&size_in_bytes_bytes), + ); + } + } else { + errors.push( + NonCriticalIndexerError::MissingFilePathData(format!( "Corrupt database possessing a file_path entry without materialized_path: ", from_bytes_to_uuid(&file_path.pub_id) )) - .into(), - ); - } - }); + .into(), + ); + } + } Ok(()) } @@ -433,57 +443,76 @@ impl walker::WalkerDBProxy for WalkerDBProxy { async fn fetch_file_paths_to_remove( &self, parent_iso_file_path: &IsolatedFilePathData<'_>, + mut existing_inodes: HashSet>, unique_location_id_materialized_path_name_extension_params: Vec, - ) -> Result, NonCriticalError> { + ) -> Result, NonCriticalIndexerError> { // NOTE: This batch size can be increased if we wish to trade memory for more performance const BATCH_SIZE: i64 = 1000; - let founds_ids = self - .db - ._batch( - unique_location_id_materialized_path_name_extension_params - .into_iter() - .chunks(200) - .into_iter() - .map(|unique_params| { - self.db - .file_path() - .find_many(vec![or(unique_params.collect())]) - .select(file_path::select!({ id })) - }) - .collect::>(), - ) - .await - 
.map(|founds_chunk| { - founds_chunk - .into_iter() - .flat_map(|file_paths| file_paths.into_iter().map(|file_path| file_path.id)) - .collect::>() - }) - .map_err(|e| NonCriticalError::FetchAlreadyExistingFilePathIds(e.to_string()))?; + let founds_ids = { + let found_chunks = self + .db + ._batch( + unique_location_id_materialized_path_name_extension_params + .into_iter() + .chunks(200) + .into_iter() + .map(|unique_params| { + self.db + .file_path() + .find_many(vec![or(unique_params.collect())]) + .select(file_path::select!({ id inode })) + }) + .collect::>(), + ) + .await + .map_err(|e| { + NonCriticalIndexerError::FetchAlreadyExistingFilePathIds(e.to_string()) + })?; + + found_chunks + .into_iter() + .flatten() + .map(|file_path| { + if let Some(inode) = file_path.inode { + existing_inodes.remove(&inode); + } + file_path.id + }) + .collect::>() + }; let mut to_remove = vec![]; let mut cursor = 1; loop { + let materialized_path_param = file_path::materialized_path::equals(Some( + parent_iso_file_path + .materialized_path_for_children() + .expect("the received isolated file path must be from a directory"), + )); + let found = self .db .file_path() .find_many(vec![ file_path::location_id::equals(Some(self.location_id)), - file_path::materialized_path::equals(Some( - parent_iso_file_path - .materialized_path_for_children() - .expect("the received isolated file path must be from a directory"), - )), + if existing_inodes.is_empty() { + materialized_path_param + } else { + or(vec![ + materialized_path_param, + file_path::inode::in_vec(existing_inodes.iter().cloned().collect()), + ]) + }, ]) .order_by(file_path::id::order(SortOrder::Asc)) .take(BATCH_SIZE) .cursor(file_path::id::equals(cursor)) - .select(file_path_pub_and_cas_ids::select()) + .select(file_path::select!({ id pub_id cas_id inode })) .exec() .await - .map_err(|e| NonCriticalError::FetchFilePathsToRemove(e.to_string()))?; + .map_err(|e| NonCriticalIndexerError::FetchFilePathsToRemove(e.to_string()))?; #[allow(clippy::cast_possible_truncation)] // Safe because we are using a constant let should_stop = found.len() < BATCH_SIZE as usize; @@ -494,11 +523,17 @@ impl walker::WalkerDBProxy for WalkerDBProxy { break; } - to_remove.extend( - found - .into_iter() - .filter(|file_path| !founds_ids.contains(&file_path.id)), - ); + to_remove.extend(found.into_iter().filter_map(|file_path| { + if let Some(inode) = file_path.inode { + existing_inodes.remove(&inode); + } + + (!founds_ids.contains(&file_path.id)).then_some(file_path_pub_and_cas_ids::Data { + id: file_path.id, + pub_id: file_path.pub_id, + cas_id: file_path.cas_id, + }) + })); if should_stop { break; diff --git a/core/crates/heavy-lifting/src/indexer/shallow.rs b/core/crates/heavy-lifting/src/indexer/shallow.rs index 085b6f1a5..c57993840 100644 --- a/core/crates/heavy-lifting/src/indexer/shallow.rs +++ b/core/crates/heavy-lifting/src/indexer/shallow.rs @@ -18,25 +18,32 @@ use std::{ use futures_concurrency::future::TryJoin; use itertools::Itertools; -use tracing::{debug, warn}; +use tracing::{debug, instrument, warn}; use super::{ remove_non_existing_file_paths, reverse_update_directories_sizes, tasks::{ - saver::{SaveTask, SaveTaskOutput}, - updater::{UpdateTask, UpdateTaskOutput}, - walker::{ToWalkEntry, WalkDirTask, WalkTaskOutput, WalkedEntry}, + self, saver, updater, + walker::{self, ToWalkEntry, WalkedEntry}, }, update_directory_sizes, update_location_size, IsoFilePathFactory, WalkerDBProxy, BATCH_SIZE, }; +#[instrument( + skip_all, + fields( + location_id = location.id, + 
location_path = ?location.path, + sub_path = %sub_path.as_ref().display() + ) + err, +)] pub async fn shallow( location: location_with_indexer_rules::Data, sub_path: impl AsRef + Send, - dispatcher: BaseTaskDispatcher, - ctx: impl OuterContext, + dispatcher: &BaseTaskDispatcher, + ctx: &impl OuterContext, ) -> Result, Error> { - let sub_path = sub_path.as_ref(); let db = ctx.db(); let sync = ctx.sync(); @@ -46,15 +53,20 @@ pub async fn shallow( .map_err(indexer::Error::from)?; let to_walk_path = Arc::new( - get_full_path_from_sub_path(location.id, &Some(sub_path), &*location_path, db) - .await - .map_err(indexer::Error::from)?, + get_full_path_from_sub_path::( + location.id, + Some(sub_path.as_ref()), + &*location_path, + db, + ) + .await?, ); - let Some(WalkTaskOutput { + let Some(walker::Output { to_create, to_update, to_remove, + non_indexed_paths, mut errors, directory_iso_file_path, total_size, @@ -64,13 +76,16 @@ pub async fn shallow( Arc::clone(&location_path), Arc::clone(&to_walk_path), Arc::clone(db), - &dispatcher, + dispatcher, ) .await? else { return Ok(vec![]); }; + // TODO use non_indexed_paths here in the future, sending it to frontend, showing then alongside the indexed files from db + debug!(non_indexed_paths_count = non_indexed_paths.len()); + let removed_count = remove_non_existing_file_paths(to_remove, db, sync).await?; let Some(Metadata { @@ -82,7 +97,7 @@ pub async fn shallow( to_update, Arc::clone(db), Arc::clone(sync), - &dispatcher, + dispatcher, ) .await? else { @@ -109,7 +124,7 @@ pub async fn shallow( .await?; } - update_location_size(location.id, db, &ctx).await?; + update_location_size(location.id, db, ctx).await?; } if indexed_count > 0 || removed_count > 0 { @@ -119,15 +134,19 @@ pub async fn shallow( Ok(errors) } +#[instrument( + skip_all, + fields(to_walk_path = %to_walk_path.display()) +)] async fn walk( location: &location_with_indexer_rules::Data, location_path: Arc, to_walk_path: Arc, db: Arc, dispatcher: &BaseTaskDispatcher, -) -> Result, Error> { - match dispatcher - .dispatch(WalkDirTask::new_shallow( +) -> Result>, Error> { + let Ok(task_handle) = dispatcher + .dispatch(tasks::Walker::new_shallow( ToWalkEntry::from(&*to_walk_path), to_walk_path, location @@ -147,11 +166,15 @@ async fn walk( }, )?) .await - .await? - { + else { + debug!("Task system is shutting down while a shallow indexer was in progress"); + return Ok(None); + }; + + match task_handle.await? 
{ sd_task_system::TaskStatus::Done((_, TaskOutput::Out(data))) => Ok(Some( *data - .downcast::() + .downcast::>() .expect("we just dispatched this task"), )), sd_task_system::TaskStatus::Done((_, TaskOutput::Empty)) => { @@ -188,7 +211,7 @@ async fn save_and_update( .chunks(BATCH_SIZE) .into_iter() .map(|chunk| { - SaveTask::new_shallow( + tasks::Saver::new_shallow( location.id, location.pub_id.clone(), chunk.collect::>(), @@ -203,7 +226,7 @@ async fn save_and_update( .chunks(BATCH_SIZE) .into_iter() .map(|chunk| { - UpdateTask::new_shallow( + tasks::Updater::new_shallow( chunk.collect::>(), Arc::clone(&db), Arc::clone(&sync), @@ -218,25 +241,28 @@ async fn save_and_update( updated_count: 0, }; - for task_status in dispatcher - .dispatch_many_boxed(save_and_update_tasks) - .await + let Ok(tasks_handles) = dispatcher.dispatch_many_boxed(save_and_update_tasks).await else { + debug!("Task system is shutting down while a shallow indexer was in progress"); + return Ok(None); + }; + + for task_status in tasks_handles .into_iter() - .map(CancelTaskOnDrop) + .map(CancelTaskOnDrop::new) .collect::>() .try_join() .await? { match task_status { sd_task_system::TaskStatus::Done((_, TaskOutput::Out(data))) => { - if data.is::() { + if data.is::() { metadata.indexed_count += data - .downcast::() + .downcast::() .expect("just checked") .saved_count; } else { metadata.updated_count += data - .downcast::() + .downcast::() .expect("just checked") .updated_count; } diff --git a/core/crates/heavy-lifting/src/indexer/tasks/mod.rs b/core/crates/heavy-lifting/src/indexer/tasks/mod.rs index eacba8f11..0d2bf5f18 100644 --- a/core/crates/heavy-lifting/src/indexer/tasks/mod.rs +++ b/core/crates/heavy-lifting/src/indexer/tasks/mod.rs @@ -1,3 +1,7 @@ pub mod saver; pub mod updater; pub mod walker; + +pub use saver::Saver; +pub use updater::Updater; +pub use walker::Walker; diff --git a/core/crates/heavy-lifting/src/indexer/tasks/saver.rs b/core/crates/heavy-lifting/src/indexer/tasks/saver.rs index d9fbef278..3bdb113b5 100644 --- a/core/crates/heavy-lifting/src/indexer/tasks/saver.rs +++ b/core/crates/heavy-lifting/src/indexer/tasks/saver.rs @@ -16,22 +16,165 @@ use std::{sync::Arc, time::Duration}; use chrono::Utc; use serde::{Deserialize, Serialize}; use tokio::time::Instant; -use tracing::trace; +use tracing::{instrument, trace, Level}; use super::walker::WalkedEntry; #[derive(Debug)] -pub struct SaveTask { +pub struct Saver { + // Task control id: TaskId, + is_shallow: bool, + + // Received input args location_id: location::id::Type, location_pub_id: location::pub_id::Type, walked_entries: Vec, + + // Dependencies db: Arc, sync: Arc, - is_shallow: bool, } -impl SaveTask { +/// [`Save`] Task output +#[derive(Debug)] +pub struct Output { + /// Number of records inserted on database + pub saved_count: u64, + /// Time spent saving records + pub save_duration: Duration, +} + +#[async_trait::async_trait] +impl Task for Saver { + fn id(&self) -> TaskId { + self.id + } + + fn with_priority(&self) -> bool { + // If we're running in shallow mode, then we want priority + self.is_shallow + } + + #[instrument( + skip_all, + fields( + task_id = %self.id, + location_id = %self.location_id, + to_save_count = %self.walked_entries.len(), + is_shallow = self.is_shallow, + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above + async fn run(&mut self, _: &Interrupter) -> Result { + use file_path::{ + create_unchecked, date_created, date_indexed, date_modified, 
extension, hidden, inode, + is_dir, location, location_id, materialized_path, name, size_in_bytes_bytes, + }; + + let start_time = Instant::now(); + + let Self { + location_id, + location_pub_id, + walked_entries, + db, + sync, + .. + } = self; + + let (sync_stuff, paths): (Vec<_>, Vec<_>) = walked_entries + .drain(..) + .map( + |WalkedEntry { + pub_id, + maybe_object_id, + iso_file_path, + metadata, + }| { + let IsolatedFilePathDataParts { + materialized_path, + is_dir, + name, + extension, + .. + } = iso_file_path.to_parts(); + + assert!( + maybe_object_id.is_none(), + "Object ID must be None as this tasks only created \ + new file_paths and they were not identified yet" + ); + + let (sync_params, db_params): (Vec<_>, Vec<_>) = [ + ( + ( + location::NAME, + msgpack!(prisma_sync::location::SyncId { + pub_id: location_pub_id.clone() + }), + ), + location_id::set(Some(*location_id)), + ), + sync_db_entry!(materialized_path.to_string(), materialized_path), + sync_db_entry!(name.to_string(), name), + sync_db_entry!(is_dir, is_dir), + sync_db_entry!(extension.to_string(), extension), + sync_db_entry!( + metadata.size_in_bytes.to_be_bytes().to_vec(), + size_in_bytes_bytes + ), + sync_db_entry!(inode_to_db(metadata.inode), inode), + sync_db_entry!(metadata.created_at.into(), date_created), + sync_db_entry!(metadata.modified_at.into(), date_modified), + sync_db_entry!(Utc::now().into(), date_indexed), + sync_db_entry!(metadata.hidden, hidden), + ] + .into_iter() + .unzip(); + + ( + sync.shared_create( + prisma_sync::file_path::SyncId { + pub_id: pub_id.to_db(), + }, + sync_params, + ), + create_unchecked(pub_id.into(), db_params), + ) + }, + ) + .unzip(); + + #[allow(clippy::cast_sign_loss)] + let saved_count = sync + .write_ops( + db, + ( + sync_stuff.into_iter().flatten().collect(), + db.file_path().create_many(paths).skip_duplicates(), + ), + ) + .await + .map_err(indexer::Error::from)? as u64; + + let save_duration = start_time.elapsed(); + + trace!(saved_count, "Inserted records;"); + + Ok(ExecStatus::Done( + Output { + saved_count, + save_duration, + } + .into_output(), + )) + } +} + +impl Saver { #[must_use] pub fn new_deep( location_id: location::id::Type, @@ -72,15 +215,16 @@ impl SaveTask { } #[derive(Debug, Serialize, Deserialize)] -struct SaveTaskSaveState { +struct SaveState { id: TaskId, + is_shallow: bool, + location_id: location::id::Type, location_pub_id: location::pub_id::Type, walked_entries: Vec, - is_shallow: bool, } -impl SerializableTask for SaveTask { +impl SerializableTask for Saver { type SerializeError = rmp_serde::encode::Error; type DeserializeError = rmp_serde::decode::Error; @@ -90,18 +234,18 @@ impl SerializableTask for SaveTask { async fn serialize(self) -> Result, Self::SerializeError> { let Self { id, + is_shallow, location_id, location_pub_id, walked_entries, - is_shallow, .. 
} = self; - rmp_serde::to_vec_named(&SaveTaskSaveState { + rmp_serde::to_vec_named(&SaveState { id, + is_shallow, location_id, location_pub_id, walked_entries, - is_shallow, }) } @@ -110,131 +254,21 @@ impl SerializableTask for SaveTask { (db, sync): Self::DeserializeCtx, ) -> Result { rmp_serde::from_slice(data).map( - |SaveTaskSaveState { + |SaveState { id, + is_shallow, location_id, location_pub_id, walked_entries, - is_shallow, }| Self { id, + is_shallow, location_id, location_pub_id, walked_entries, db, sync, - is_shallow, }, ) } } - -#[derive(Debug)] -pub struct SaveTaskOutput { - pub saved_count: u64, - pub save_duration: Duration, -} - -#[async_trait::async_trait] -impl Task for SaveTask { - fn id(&self) -> TaskId { - self.id - } - - fn with_priority(&self) -> bool { - // If we're running in shallow mode, then we want priority - self.is_shallow - } - - async fn run(&mut self, _: &Interrupter) -> Result { - use file_path::{ - create_unchecked, date_created, date_indexed, date_modified, extension, hidden, inode, - is_dir, location, location_id, materialized_path, name, size_in_bytes_bytes, - }; - - let start_time = Instant::now(); - - let Self { - location_id, - location_pub_id, - walked_entries, - db, - sync, - .. - } = self; - - let (sync_stuff, paths): (Vec<_>, Vec<_>) = walked_entries - .drain(..) - .map(|entry| { - let IsolatedFilePathDataParts { - materialized_path, - is_dir, - name, - extension, - .. - } = entry.iso_file_path.to_parts(); - - let pub_id = sd_utils::uuid_to_bytes(entry.pub_id); - - let (sync_params, db_params): (Vec<_>, Vec<_>) = [ - ( - ( - location::NAME, - msgpack!(prisma_sync::location::SyncId { - pub_id: location_pub_id.clone() - }), - ), - location_id::set(Some(*location_id)), - ), - sync_db_entry!(materialized_path.to_string(), materialized_path), - sync_db_entry!(name.to_string(), name), - sync_db_entry!(is_dir, is_dir), - sync_db_entry!(extension.to_string(), extension), - sync_db_entry!( - entry.metadata.size_in_bytes.to_be_bytes().to_vec(), - size_in_bytes_bytes - ), - sync_db_entry!(inode_to_db(entry.metadata.inode), inode), - sync_db_entry!(entry.metadata.created_at.into(), date_created), - sync_db_entry!(entry.metadata.modified_at.into(), date_modified), - sync_db_entry!(Utc::now().into(), date_indexed), - sync_db_entry!(entry.metadata.hidden, hidden), - ] - .into_iter() - .unzip(); - - ( - sync.shared_create( - prisma_sync::file_path::SyncId { - pub_id: sd_utils::uuid_to_bytes(entry.pub_id), - }, - sync_params, - ), - create_unchecked(pub_id, db_params), - ) - }) - .unzip(); - - #[allow(clippy::cast_sign_loss)] - let saved_count = sync - .write_ops( - db, - ( - sync_stuff.into_iter().flatten().collect(), - db.file_path().create_many(paths).skip_duplicates(), - ), - ) - .await - .map_err(indexer::Error::from)? 
as u64; - - trace!("Inserted {saved_count} records"); - - Ok(ExecStatus::Done( - SaveTaskOutput { - saved_count, - save_duration: start_time.elapsed(), - } - .into_output(), - )) - } -} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/updater.rs b/core/crates/heavy-lifting/src/indexer/tasks/updater.rs index e547ec8ac..47c552814 100644 --- a/core/crates/heavy-lifting/src/indexer/tasks/updater.rs +++ b/core/crates/heavy-lifting/src/indexer/tasks/updater.rs @@ -17,21 +17,169 @@ use std::{collections::HashSet, sync::Arc, time::Duration}; use serde::{Deserialize, Serialize}; use tokio::time::Instant; -use tracing::trace; +use tracing::{instrument, trace, Level}; use super::walker::WalkedEntry; #[derive(Debug)] -pub struct UpdateTask { +pub struct Updater { + // Task control id: TaskId, + is_shallow: bool, + + // Received input args walked_entries: Vec, + + // Inner state object_ids_that_should_be_unlinked: HashSet, + + // Dependencies db: Arc, sync: Arc, - is_shallow: bool, } -impl UpdateTask { +/// [`Update`] Task output +#[derive(Debug)] +pub struct Output { + /// Number of records updated on database + pub updated_count: u64, + /// Time spent updating records + pub update_duration: Duration, +} + +#[async_trait::async_trait] +impl Task for Updater { + fn id(&self) -> TaskId { + self.id + } + + fn with_priority(&self) -> bool { + // If we're running in shallow mode, then we want priority + self.is_shallow + } + + #[instrument( + skip_all, + fields( + task_id = %self.id, + to_update_count = %self.walked_entries.len(), + is_shallow = self.is_shallow, + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above + async fn run(&mut self, interrupter: &Interrupter) -> Result { + use file_path::{ + cas_id, date_created, date_modified, hidden, inode, is_dir, object, object_id, + size_in_bytes_bytes, + }; + + let start_time = Instant::now(); + + let Self { + walked_entries, + db, + sync, + object_ids_that_should_be_unlinked, + .. + } = self; + + fetch_objects_ids_to_unlink(walked_entries, object_ids_that_should_be_unlinked, db).await?; + + check_interruption!(interrupter); + + let (sync_stuff, paths_to_update) = walked_entries + .drain(..) + .map( + |WalkedEntry { + pub_id, + maybe_object_id, + iso_file_path, + metadata, + }| { + let IsolatedFilePathDataParts { is_dir, .. 
} = &iso_file_path.to_parts(); + + let should_unlink_object = maybe_object_id.map_or(false, |object_id| { + object_ids_that_should_be_unlinked.contains(&object_id) + }); + + let (sync_params, db_params) = chain_optional_iter( + [ + ((cas_id::NAME, msgpack!(nil)), cas_id::set(None)), + sync_db_entry!(*is_dir, is_dir), + sync_db_entry!( + metadata.size_in_bytes.to_be_bytes().to_vec(), + size_in_bytes_bytes + ), + sync_db_entry!(inode_to_db(metadata.inode), inode), + { + let v = metadata.created_at.into(); + sync_db_entry!(v, date_created) + }, + { + let v = metadata.modified_at.into(); + sync_db_entry!(v, date_modified) + }, + sync_db_entry!(metadata.hidden, hidden), + ], + [ + // As this file was updated while Spacedrive was offline, we mark the object_id and cas_id as null + // So this file_path will be updated at file identifier job + should_unlink_object.then_some(( + (object_id::NAME, msgpack!(nil)), + object::disconnect(), + )), + ], + ) + .into_iter() + .unzip::<_, _, Vec<_>, Vec<_>>(); + + ( + sync_params + .into_iter() + .map(|(field, value)| { + sync.shared_update( + prisma_sync::file_path::SyncId { + pub_id: pub_id.to_db(), + }, + field, + value, + ) + }) + .collect::>(), + db.file_path() + .update(file_path::pub_id::equals(pub_id.into()), db_params) + // selecting id to avoid fetching whole object from database + .select(file_path::select!({ id })), + ) + }, + ) + .unzip::<_, _, Vec<_>, Vec<_>>(); + + let updated = sync + .write_ops( + db, + (sync_stuff.into_iter().flatten().collect(), paths_to_update), + ) + .await + .map_err(indexer::Error::from)?; + + let update_duration = start_time.elapsed(); + + trace!(?updated, "Updated records;"); + + Ok(ExecStatus::Done( + Output { + updated_count: updated.len() as u64, + update_duration, + } + .into_output(), + )) + } +} + +impl Updater { #[must_use] pub fn new_deep( walked_entries: Vec, @@ -65,177 +213,6 @@ impl UpdateTask { } } -#[derive(Debug, Serialize, Deserialize)] -struct UpdateTaskSaveState { - id: TaskId, - walked_entries: Vec, - object_ids_that_should_be_unlinked: HashSet, - is_shallow: bool, -} - -impl SerializableTask for UpdateTask { - type SerializeError = rmp_serde::encode::Error; - - type DeserializeError = rmp_serde::decode::Error; - - type DeserializeCtx = (Arc, Arc); - - async fn serialize(self) -> Result, Self::SerializeError> { - let Self { - id, - walked_entries, - object_ids_that_should_be_unlinked, - is_shallow, - .. 
- } = self; - - rmp_serde::to_vec_named(&UpdateTaskSaveState { - id, - walked_entries, - object_ids_that_should_be_unlinked, - is_shallow, - }) - } - - async fn deserialize( - data: &[u8], - (db, sync): Self::DeserializeCtx, - ) -> Result { - rmp_serde::from_slice(data).map( - |UpdateTaskSaveState { - id, - walked_entries, - object_ids_that_should_be_unlinked, - is_shallow, - }| Self { - id, - walked_entries, - object_ids_that_should_be_unlinked, - db, - sync, - is_shallow, - }, - ) - } -} - -#[derive(Debug)] -pub struct UpdateTaskOutput { - pub updated_count: u64, - pub update_duration: Duration, -} - -#[async_trait::async_trait] -impl Task for UpdateTask { - fn id(&self) -> TaskId { - self.id - } - - fn with_priority(&self) -> bool { - // If we're running in shallow mode, then we want priority - self.is_shallow - } - - async fn run(&mut self, interrupter: &Interrupter) -> Result { - use file_path::{ - cas_id, date_created, date_modified, hidden, inode, is_dir, object, object_id, - size_in_bytes_bytes, - }; - - let start_time = Instant::now(); - - let Self { - walked_entries, - db, - sync, - object_ids_that_should_be_unlinked, - .. - } = self; - - fetch_objects_ids_to_unlink(walked_entries, object_ids_that_should_be_unlinked, db).await?; - - check_interruption!(interrupter); - - let (sync_stuff, paths_to_update) = walked_entries - .drain(..) - .map(|entry| { - let IsolatedFilePathDataParts { is_dir, .. } = &entry.iso_file_path.to_parts(); - - let pub_id = sd_utils::uuid_to_bytes(entry.pub_id); - - let should_unlink_object = entry.maybe_object_id.map_or(false, |object_id| { - object_ids_that_should_be_unlinked.contains(&object_id) - }); - - let (sync_params, db_params) = chain_optional_iter( - [ - ((cas_id::NAME, msgpack!(nil)), cas_id::set(None)), - sync_db_entry!(*is_dir, is_dir), - sync_db_entry!( - entry.metadata.size_in_bytes.to_be_bytes().to_vec(), - size_in_bytes_bytes - ), - sync_db_entry!(inode_to_db(entry.metadata.inode), inode), - { - let v = entry.metadata.created_at.into(); - sync_db_entry!(v, date_created) - }, - { - let v = entry.metadata.modified_at.into(); - sync_db_entry!(v, date_modified) - }, - sync_db_entry!(entry.metadata.hidden, hidden), - ], - [ - // As this file was updated while Spacedrive was offline, we mark the object_id and cas_id as null - // So this file_path will be updated at file identifier job - should_unlink_object - .then_some(((object_id::NAME, msgpack!(nil)), object::disconnect())), - ], - ) - .into_iter() - .unzip::<_, _, Vec<_>, Vec<_>>(); - - ( - sync_params - .into_iter() - .map(|(field, value)| { - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: pub_id.clone(), - }, - field, - value, - ) - }) - .collect::>(), - db.file_path() - .update(file_path::pub_id::equals(pub_id), db_params) - .select(file_path::select!({ id })), - ) - }) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - let updated = sync - .write_ops( - db, - (sync_stuff.into_iter().flatten().collect(), paths_to_update), - ) - .await - .map_err(indexer::Error::from)?; - - trace!("Updated {updated:?} records"); - - Ok(ExecStatus::Done( - UpdateTaskOutput { - updated_count: updated.len() as u64, - update_duration: start_time.elapsed(), - } - .into_output(), - )) - } -} - async fn fetch_objects_ids_to_unlink( walked_entries: &[WalkedEntry], object_ids_that_should_be_unlinked: &mut HashSet, @@ -269,3 +246,59 @@ async fn fetch_objects_ids_to_unlink( Ok(()) } + +#[derive(Debug, Serialize, Deserialize)] +struct SaveState { + id: TaskId, + is_shallow: bool, + + walked_entries: Vec, + + 
object_ids_that_should_be_unlinked: HashSet, +} + +impl SerializableTask for Updater { + type SerializeError = rmp_serde::encode::Error; + + type DeserializeError = rmp_serde::decode::Error; + + type DeserializeCtx = (Arc, Arc); + + async fn serialize(self) -> Result, Self::SerializeError> { + let Self { + id, + walked_entries, + object_ids_that_should_be_unlinked, + is_shallow, + .. + } = self; + + rmp_serde::to_vec_named(&SaveState { + id, + is_shallow, + walked_entries, + object_ids_that_should_be_unlinked, + }) + } + + async fn deserialize( + data: &[u8], + (db, sync): Self::DeserializeCtx, + ) -> Result { + rmp_serde::from_slice(data).map( + |SaveState { + id, + is_shallow, + walked_entries, + object_ids_that_should_be_unlinked, + }| Self { + id, + is_shallow, + walked_entries, + object_ids_that_should_be_unlinked, + db, + sync, + }, + ) + } +} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker.rs deleted file mode 100644 index f99bc2fab..000000000 --- a/core/crates/heavy-lifting/src/indexer/tasks/walker.rs +++ /dev/null @@ -1,1612 +0,0 @@ -use crate::{indexer, Error, NonCriticalError}; - -use sd_core_file_path_helper::{FilePathError, FilePathMetadata, IsolatedFilePathData}; -use sd_core_indexer_rules::{ - seed::{GitIgnoreRules, GITIGNORE}, - IndexerRuler, MetadataForIndexerRules, RuleKind, -}; -use sd_core_prisma_helpers::{file_path_pub_and_cas_ids, file_path_walker}; - -use sd_prisma::prisma::file_path; -use sd_task_system::{ - check_interruption, BaseTaskDispatcher, ExecStatus, Interrupter, IntoAnyTaskOutput, - SerializableTask, Task, TaskDispatcher, TaskHandle, TaskId, -}; -use sd_utils::{db::inode_from_db, error::FileIOError}; - -use std::{ - collections::{hash_map::Entry, HashMap, HashSet}, - fmt, - fs::Metadata, - future::Future, - hash::{Hash, Hasher}, - mem, - path::{Path, PathBuf}, - sync::Arc, - time::Duration, -}; - -use chrono::{DateTime, Duration as ChronoDuration, FixedOffset, Utc}; -use futures_concurrency::future::Join; -use serde::{Deserialize, Serialize}; -use tokio::{fs, time::Instant}; -use tokio_stream::{wrappers::ReadDirStream, StreamExt}; -use tracing::trace; -use uuid::Uuid; - -/// `WalkedEntry` represents a single path in the filesystem -#[derive(Debug, Serialize, Deserialize)] -pub struct WalkedEntry { - pub pub_id: Uuid, - pub maybe_object_id: file_path::object_id::Type, - pub iso_file_path: IsolatedFilePathData<'static>, - pub metadata: FilePathMetadata, -} - -impl PartialEq for WalkedEntry { - fn eq(&self, other: &Self) -> bool { - self.iso_file_path == other.iso_file_path - } -} - -impl Eq for WalkedEntry {} - -impl Hash for WalkedEntry { - fn hash(&self, state: &mut H) { - self.iso_file_path.hash(state); - } -} - -#[derive(Debug, Serialize, Deserialize)] -struct WalkingEntry { - iso_file_path: IsolatedFilePathData<'static>, - metadata: FilePathMetadata, -} - -impl From for WalkedEntry { - fn from( - WalkingEntry { - iso_file_path, - metadata, - }: WalkingEntry, - ) -> Self { - Self { - pub_id: Uuid::new_v4(), - maybe_object_id: None, - iso_file_path, - metadata, - } - } -} - -impl From<(Uuid, file_path::object_id::Type, WalkingEntry)> for WalkedEntry { - fn from( - ( - pub_id, - maybe_object_id, - WalkingEntry { - iso_file_path, - metadata, - }, - ): (Uuid, file_path::object_id::Type, WalkingEntry), - ) -> Self { - Self { - pub_id, - maybe_object_id, - iso_file_path, - metadata, - } - } -} - -pub trait IsoFilePathFactory: Clone + Send + Sync + fmt::Debug + 'static { - fn build( - 
&self, - path: impl AsRef, - is_dir: bool, - ) -> Result, FilePathError>; -} - -pub trait WalkerDBProxy: Clone + Send + Sync + fmt::Debug + 'static { - fn fetch_file_paths( - &self, - found_paths: Vec, - ) -> impl Future, indexer::Error>> + Send; - - fn fetch_file_paths_to_remove( - &self, - parent_iso_file_path: &IsolatedFilePathData<'_>, - unique_location_id_materialized_path_name_extension_params: Vec, - ) -> impl Future, indexer::NonCriticalError>> - + Send; -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ToWalkEntry { - path: PathBuf, - parent_dir_accepted_by_its_children: Option, -} - -impl> From
for ToWalkEntry { - fn from(path: P) -> Self { - Self { - path: path.as_ref().into(), - parent_dir_accepted_by_its_children: None, - } - } -} - -#[derive(Debug)] -pub struct WalkTaskOutput { - pub to_create: Vec, - pub to_update: Vec, - pub to_remove: Vec, - pub accepted_ancestors: HashSet, - pub errors: Vec, - pub directory_iso_file_path: IsolatedFilePathData<'static>, - pub total_size: u64, - pub handles: Vec>, - pub scan_time: Duration, -} - -#[derive(Debug, Serialize, Deserialize)] -struct InnerMetadata { - pub is_dir: bool, - pub is_symlink: bool, - pub inode: u64, - pub size_in_bytes: u64, - pub hidden: bool, - pub created_at: DateTime, - pub modified_at: DateTime, -} - -impl InnerMetadata { - fn new(path: impl AsRef, metadata: &Metadata) -> Result { - let FilePathMetadata { - inode, - size_in_bytes, - created_at, - modified_at, - hidden, - } = FilePathMetadata::from_path(path, metadata) - .map_err(|e| indexer::NonCriticalError::FilePathMetadata(e.to_string()))?; - - Ok(Self { - is_dir: metadata.is_dir(), - is_symlink: metadata.is_symlink(), - inode, - size_in_bytes, - hidden, - created_at, - modified_at, - }) - } -} - -impl MetadataForIndexerRules for InnerMetadata { - fn is_dir(&self) -> bool { - self.is_dir - } -} - -impl From for FilePathMetadata { - fn from(metadata: InnerMetadata) -> Self { - Self { - inode: metadata.inode, - size_in_bytes: metadata.size_in_bytes, - hidden: metadata.hidden, - created_at: metadata.created_at, - modified_at: metadata.modified_at, - } - } -} - -#[derive(Debug)] -enum WalkerStage { - Start, - Walking { - read_dir_stream: ReadDirStream, - found_paths: Vec, - }, - CollectingMetadata { - found_paths: Vec, - }, - CheckingIndexerRules { - paths_and_metadatas: HashMap, - }, - ProcessingRulesResults { - paths_metadatas_and_acceptance: - HashMap>)>, - }, - GatheringFilePathsToRemove { - accepted_paths: HashMap, - maybe_to_keep_walking: Option>, - accepted_ancestors: HashSet, - }, - Finalize { - walking_entries: Vec, - accepted_ancestors: HashSet, - to_remove_entries: Vec, - maybe_to_keep_walking: Option>, - }, -} - -#[derive(Debug, Serialize, Deserialize)] -struct WalkDirSaveState { - id: TaskId, - entry: ToWalkEntry, - root: Arc, - entry_iso_file_path: IsolatedFilePathData<'static>, - stage: WalkerStageSaveState, - errors: Vec, - scan_time: Duration, - is_shallow: bool, -} - -#[derive(Debug, Serialize, Deserialize)] -enum WalkerStageSaveState { - Start, - CollectingMetadata { - found_paths: Vec, - }, - CheckingIndexerRules { - paths_and_metadatas: HashMap, - }, - ProcessingRulesResults { - paths_metadatas_and_acceptance: - HashMap>)>, - }, - GatheringFilePathsToRemove { - accepted_paths: HashMap, - maybe_to_keep_walking: Option>, - accepted_ancestors: HashSet, - }, - Finalize { - walking_entries: Vec, - accepted_ancestors: HashSet, - to_remove_entries: Vec, - maybe_to_keep_walking: Option>, - }, -} - -impl From for WalkerStageSaveState { - fn from(stage: WalkerStage) -> Self { - match stage { - // We can't store the current state of `ReadDirStream` so we start again from the beginning - WalkerStage::Start | WalkerStage::Walking { .. 
} => Self::Start, - WalkerStage::CollectingMetadata { found_paths } => { - Self::CollectingMetadata { found_paths } - } - WalkerStage::CheckingIndexerRules { - paths_and_metadatas, - } => Self::CheckingIndexerRules { - paths_and_metadatas, - }, - WalkerStage::ProcessingRulesResults { - paths_metadatas_and_acceptance, - } => Self::ProcessingRulesResults { - paths_metadatas_and_acceptance, - }, - WalkerStage::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - } => Self::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - }, - WalkerStage::Finalize { - walking_entries, - accepted_ancestors, - to_remove_entries, - maybe_to_keep_walking, - } => Self::Finalize { - walking_entries, - accepted_ancestors, - to_remove_entries, - maybe_to_keep_walking, - }, - } - } -} - -impl From for WalkerStage { - fn from(value: WalkerStageSaveState) -> Self { - match value { - WalkerStageSaveState::Start => Self::Start, - WalkerStageSaveState::CollectingMetadata { found_paths } => { - Self::CollectingMetadata { found_paths } - } - WalkerStageSaveState::CheckingIndexerRules { - paths_and_metadatas, - } => Self::CheckingIndexerRules { - paths_and_metadatas, - }, - WalkerStageSaveState::ProcessingRulesResults { - paths_metadatas_and_acceptance, - } => Self::ProcessingRulesResults { - paths_metadatas_and_acceptance, - }, - WalkerStageSaveState::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - } => Self::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - }, - WalkerStageSaveState::Finalize { - walking_entries, - accepted_ancestors, - to_remove_entries, - maybe_to_keep_walking, - } => Self::Finalize { - walking_entries, - accepted_ancestors, - to_remove_entries, - maybe_to_keep_walking, - }, - } - } -} - -#[derive(Debug)] -pub struct WalkDirTask> -where - DBProxy: WalkerDBProxy, - IsoPathFactory: IsoFilePathFactory, - Dispatcher: TaskDispatcher, -{ - id: TaskId, - entry: ToWalkEntry, - root: Arc, - entry_iso_file_path: IsolatedFilePathData<'static>, - indexer_ruler: IndexerRuler, - iso_file_path_factory: IsoPathFactory, - db_proxy: DBProxy, - stage: WalkerStage, - maybe_dispatcher: Option, - errors: Vec, - scan_time: Duration, - is_shallow: bool, -} - -impl WalkDirTask -where - DBProxy: WalkerDBProxy, - IsoPathFactory: IsoFilePathFactory, - Dispatcher: TaskDispatcher, -{ - pub fn new_deep( - entry: impl Into + Send, - root: Arc, - indexer_ruler: IndexerRuler, - iso_file_path_factory: IsoPathFactory, - db_proxy: DBProxy, - dispatcher: Dispatcher, - ) -> Result { - let entry = entry.into(); - Ok(Self { - id: TaskId::new_v4(), - root, - indexer_ruler, - entry_iso_file_path: iso_file_path_factory.build(&entry.path, true)?, - iso_file_path_factory, - db_proxy, - stage: WalkerStage::Start, - entry, - maybe_dispatcher: Some(dispatcher), - is_shallow: false, - errors: Vec::new(), - scan_time: Duration::ZERO, - }) - } -} - -impl WalkDirTask> -where - DBProxy: WalkerDBProxy, - IsoPathFactory: IsoFilePathFactory, -{ - pub fn new_shallow( - entry: impl Into + Send, - root: Arc, - indexer_ruler: IndexerRuler, - iso_file_path_factory: IsoPathFactory, - db_proxy: DBProxy, - ) -> Result { - let entry = entry.into(); - Ok(Self { - id: TaskId::new_v4(), - root, - indexer_ruler, - entry_iso_file_path: iso_file_path_factory.build(&entry.path, true)?, - iso_file_path_factory, - db_proxy, - stage: WalkerStage::Start, - entry, - maybe_dispatcher: None, - 
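// Illustrative sketch of the conversion above, with simplified types: a stage that holds a live
// `ReadDirStream` cannot be serialized, so it collapses back to `Start` on save and the directory
// is simply re-read when the task resumes.
enum Stage {
    Start,
    Walking { cursor: std::vec::IntoIter<String> }, // stand-in for ReadDirStream
    Finalize { found: Vec<String> },
}

enum StageSaveState {
    Start,
    Finalize { found: Vec<String> },
}

impl From<Stage> for StageSaveState {
    fn from(stage: Stage) -> Self {
        match stage {
            // Live OS resources can't be persisted, so walking restarts from the beginning
            Stage::Start | Stage::Walking { .. } => Self::Start,
            Stage::Finalize { found } => Self::Finalize { found },
        }
    }
}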
is_shallow: true, - errors: Vec::new(), - scan_time: Duration::ZERO, - }) - } -} - -impl SerializableTask - for WalkDirTask -where - DBProxy: WalkerDBProxy, - IsoPathFactory: IsoFilePathFactory, - Dispatcher: TaskDispatcher, -{ - type SerializeError = rmp_serde::encode::Error; - type DeserializeError = rmp_serde::decode::Error; - type DeserializeCtx = (IndexerRuler, DBProxy, IsoPathFactory, Dispatcher); - - async fn serialize(self) -> Result, Self::SerializeError> { - let Self { - id, - entry, - root, - entry_iso_file_path, - stage, - errors, - scan_time, - is_shallow, - .. - } = self; - rmp_serde::to_vec_named(&WalkDirSaveState { - id, - entry, - root, - entry_iso_file_path, - stage: stage.into(), - errors, - scan_time, - is_shallow, - }) - } - - async fn deserialize( - data: &[u8], - (indexer_ruler, db_proxy, iso_file_path_factory, dispatcher): Self::DeserializeCtx, - ) -> Result { - rmp_serde::from_slice(data).map( - |WalkDirSaveState { - id, - entry, - root, - entry_iso_file_path, - stage, - errors, - scan_time, - is_shallow, - }| Self { - id, - entry, - root, - entry_iso_file_path, - indexer_ruler, - iso_file_path_factory, - db_proxy, - stage: stage.into(), - maybe_dispatcher: is_shallow.then_some(dispatcher), - errors, - scan_time, - is_shallow, - }, - ) - } -} - -#[async_trait::async_trait] -impl Task - for WalkDirTask -where - DBProxy: WalkerDBProxy, - IsoPathFactory: IsoFilePathFactory, - Dispatcher: TaskDispatcher, -{ - fn id(&self) -> TaskId { - self.id - } - - fn with_priority(&self) -> bool { - // If we're running in shallow mode, then we want priority - self.is_shallow - } - - #[allow(clippy::too_many_lines)] - async fn run(&mut self, interrupter: &Interrupter) -> Result { - let Self { - root, - entry: ToWalkEntry { - path, - parent_dir_accepted_by_its_children, - }, - entry_iso_file_path, - iso_file_path_factory, - indexer_ruler, - db_proxy, - stage, - maybe_dispatcher, - errors, - scan_time, - .. 
- } = self; - - let start_time = Instant::now(); - - let (to_create, to_update, total_size, to_remove, accepted_ancestors, handles) = loop { - match stage { - WalkerStage::Start => { - if indexer_ruler.has_system(&GITIGNORE).await { - if let Some(rules) = - GitIgnoreRules::get_rules_if_in_git_repo(root.as_ref(), path).await - { - indexer_ruler.extend(rules.map(Into::into)).await; - } - } - - *stage = WalkerStage::Walking { - read_dir_stream: ReadDirStream::new(fs::read_dir(&path).await.map_err( - |e| { - indexer::Error::FileIO( - (&path, e, "Failed to open directory to read its entries") - .into(), - ) - }, - )?), - found_paths: Vec::new(), - }; - } - - WalkerStage::Walking { - read_dir_stream, - found_paths, - } => { - while let Some(res) = read_dir_stream.next().await { - match res { - Ok(dir_entry) => { - found_paths.push(dir_entry.path()); - } - Err(e) => { - errors.push(NonCriticalError::Indexer( - indexer::NonCriticalError::FailedDirectoryEntry( - FileIOError::from((&path, e)).to_string(), - ), - )); - } - } - - check_interruption!(interrupter, start_time, scan_time); - } - - *stage = WalkerStage::CollectingMetadata { - found_paths: mem::take(found_paths), - }; - - check_interruption!(interrupter, start_time, scan_time); - } - - WalkerStage::CollectingMetadata { found_paths } => { - *stage = WalkerStage::CheckingIndexerRules { - paths_and_metadatas: collect_metadata(found_paths, errors).await, - }; - - check_interruption!(interrupter, start_time, scan_time); - } - - WalkerStage::CheckingIndexerRules { - paths_and_metadatas, - } => { - *stage = WalkerStage::ProcessingRulesResults { - paths_metadatas_and_acceptance: apply_indexer_rules( - paths_and_metadatas, - indexer_ruler, - errors, - ) - .await, - }; - - check_interruption!(interrupter, start_time, scan_time); - } - - WalkerStage::ProcessingRulesResults { - paths_metadatas_and_acceptance, - } => { - let mut maybe_to_keep_walking = maybe_dispatcher.is_some().then(Vec::new); - let (accepted_paths, accepted_ancestors) = process_rules_results( - root, - iso_file_path_factory, - *parent_dir_accepted_by_its_children, - paths_metadatas_and_acceptance, - &mut maybe_to_keep_walking, - errors, - ) - .await; - - *stage = WalkerStage::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - }; - - check_interruption!(interrupter, start_time, scan_time); - } - - WalkerStage::GatheringFilePathsToRemove { - accepted_paths, - maybe_to_keep_walking, - accepted_ancestors, - } => { - let (walking_entries, to_remove_entries) = gather_file_paths_to_remove( - accepted_paths, - entry_iso_file_path, - iso_file_path_factory, - db_proxy, - errors, - ) - .await; - - *stage = WalkerStage::Finalize { - walking_entries, - to_remove_entries, - maybe_to_keep_walking: mem::take(maybe_to_keep_walking), - accepted_ancestors: mem::take(accepted_ancestors), - }; - - check_interruption!(interrupter, start_time, scan_time); - } - - // From this points onwards, we will not allow to be interrupted anymore - WalkerStage::Finalize { - walking_entries, - to_remove_entries, - maybe_to_keep_walking, - accepted_ancestors, - } => { - let (to_create, to_update, total_size) = - segregate_creates_and_updates(walking_entries, db_proxy).await?; - - let handles = keep_walking( - root, - indexer_ruler, - iso_file_path_factory, - db_proxy, - maybe_to_keep_walking, - maybe_dispatcher, - errors, - ) - .await; - - break ( - to_create, - to_update, - total_size, - mem::take(to_remove_entries), - mem::take(accepted_ancestors), - handles, - ); - } - } - 
}; - - *scan_time += start_time.elapsed(); - - // Taking out some data as the task is finally complete - Ok(ExecStatus::Done( - WalkTaskOutput { - to_create, - to_update, - to_remove, - accepted_ancestors, - errors: mem::take(errors), - directory_iso_file_path: mem::take(entry_iso_file_path), - total_size, - handles, - scan_time: *scan_time, - } - .into_output(), - )) - } -} - -async fn segregate_creates_and_updates( - walking_entries: &mut Vec, - db_proxy: &impl WalkerDBProxy, -) -> Result<(Vec, Vec, u64), Error> { - if walking_entries.is_empty() { - Ok((vec![], vec![], 0)) - } else { - let iso_paths_already_in_db = db_proxy - .fetch_file_paths( - walking_entries - .iter() - .map(|entry| file_path::WhereParam::from(&entry.iso_file_path)) - .collect(), - ) - .await? - .into_iter() - .flat_map(|file_path| { - IsolatedFilePathData::try_from(file_path.clone()) - .map(|iso_file_path| (iso_file_path, file_path)) - }) - .collect::>(); - - Ok(walking_entries.drain(..).fold( - (Vec::new(), Vec::new(), 0), - |(mut to_create, mut to_update, mut total_size), entry| { - let WalkingEntry{iso_file_path, metadata} = &entry; - - total_size += metadata.size_in_bytes; - - if let Some(file_path) = iso_paths_already_in_db.get(iso_file_path) { - if let (Some(inode), Some(date_modified)) = ( - &file_path.inode, - &file_path.date_modified, - ) { - if ( - inode_from_db(&inode[0..8]) != metadata.inode - // Datetimes stored in DB loses a bit of precision, so we need to check against a delta - // instead of using != operator - || DateTime::::from(metadata.modified_at) - *date_modified - > ChronoDuration::milliseconds(1) || file_path.hidden.is_none() || metadata.hidden != file_path.hidden.unwrap_or_default() - ) - // We ignore the size of directories because it is not reliable, we need to - // calculate it ourselves later - && !( - iso_file_path.to_parts().is_dir - && metadata.size_in_bytes - != file_path - .size_in_bytes_bytes - .as_ref() - .map(|size_in_bytes_bytes| { - u64::from_be_bytes([ - size_in_bytes_bytes[0], - size_in_bytes_bytes[1], - size_in_bytes_bytes[2], - size_in_bytes_bytes[3], - size_in_bytes_bytes[4], - size_in_bytes_bytes[5], - size_in_bytes_bytes[6], - size_in_bytes_bytes[7], - ]) - }) - .unwrap_or_default() - ) { - to_update.push( - WalkedEntry::from((sd_utils::from_bytes_to_uuid(&file_path.pub_id), file_path.object_id, entry)), - ); - } - } - } else { - to_create.push(WalkedEntry::from(entry)); - } - - (to_create, to_update, total_size) - } - )) - } -} - -async fn keep_walking( - root: &Arc, - indexer_ruler: &IndexerRuler, - iso_file_path_factory: &impl IsoFilePathFactory, - db_proxy: &impl WalkerDBProxy, - maybe_to_keep_walking: &mut Option>, - dispatcher: &Option>, - errors: &mut Vec, -) -> Vec> { - if let (Some(dispatcher), Some(to_keep_walking)) = (dispatcher, maybe_to_keep_walking) { - dispatcher - .dispatch_many( - to_keep_walking - .drain(..) - .map(|entry| { - WalkDirTask::new_deep( - entry, - Arc::clone(root), - indexer_ruler.clone(), - iso_file_path_factory.clone(), - db_proxy.clone(), - dispatcher.clone(), - ) - .map_err(|e| indexer::NonCriticalError::DispatchKeepWalking(e.to_string())) - }) - .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()), - ) - .await - } else { - Vec::new() - } -} - -async fn collect_metadata( - found_paths: &mut Vec, - errors: &mut Vec, -) -> HashMap { - found_paths - .drain(..) 
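// Illustrative sketch of the update check above, with simplified arguments: datetimes stored in
// the DB lose sub-millisecond precision, so equality is tested against a 1ms delta instead of
// `!=`, and the on-disk size is compared against the 8 big-endian bytes kept in the DB, except
// for directories, whose sizes are unreliable and computed later.
use chrono::{DateTime, Duration, Utc};

fn size_from_db(size_in_bytes_bytes: &[u8]) -> u64 {
    // Assumes the column always stores exactly 8 bytes, as the surrounding code does.
    let mut buf = [0u8; 8];
    buf.copy_from_slice(&size_in_bytes_bytes[..8]);
    u64::from_be_bytes(buf)
}

fn needs_update(
    fs_modified: DateTime<Utc>,
    db_modified: DateTime<Utc>,
    fs_size: u64,
    db_size_bytes: &[u8],
    is_dir: bool,
) -> bool {
    let modified_changed = fs_modified - db_modified > Duration::milliseconds(1);
    let size_changed = !is_dir && fs_size != size_from_db(db_size_bytes);
    modified_changed || size_changed
}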
- .map(|current_path| async move { - fs::metadata(¤t_path) - .await - .map_err(|e| { - indexer::NonCriticalError::Metadata( - FileIOError::from((¤t_path, e)).to_string(), - ) - }) - .and_then(|metadata| { - InnerMetadata::new(¤t_path, &metadata) - .map(|metadata| (current_path, metadata)) - }) - }) - .collect::>() - .join() - .await - .into_iter() - .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) - .collect() -} - -async fn apply_indexer_rules( - paths_and_metadatas: &mut HashMap, - indexer_ruler: &IndexerRuler, - errors: &mut Vec, -) -> HashMap>)> { - paths_and_metadatas - .drain() - // TODO: Hard ignoring symlinks for now, but this should be configurable - .filter(|(_, metadata)| !metadata.is_symlink) - .map(|(current_path, metadata)| async { - indexer_ruler - .apply_all(¤t_path, &metadata) - .await - .map(|acceptance_per_rule_kind| { - (current_path, (metadata, acceptance_per_rule_kind)) - }) - .map_err(|e| indexer::NonCriticalError::IndexerRule(e.to_string())) - }) - .collect::>() - .join() - .await - .into_iter() - .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) - .collect() -} - -async fn process_rules_results( - root: &Arc, - iso_file_path_factory: &impl IsoFilePathFactory, - parent_dir_accepted_by_its_children: Option, - paths_metadatas_and_acceptance: &mut HashMap< - PathBuf, - (InnerMetadata, HashMap>), - >, - maybe_to_keep_walking: &mut Option>, - errors: &mut Vec, -) -> (HashMap, HashSet) { - let root = root.as_ref(); - - let (accepted, accepted_ancestors) = paths_metadatas_and_acceptance.drain().fold( - (HashMap::new(), HashMap::new()), - |(mut accepted, mut accepted_ancestors), - (current_path, (metadata, acceptance_per_rule_kind))| { - // Accept by children has three states, - // None if we don't now yet or if this check doesn't apply - // Some(true) if this check applies and it passes - // Some(false) if this check applies and it was rejected - // and we pass the current parent state to its children - let mut accept_by_children_dir = parent_dir_accepted_by_its_children; - - if rejected_by_reject_glob(&acceptance_per_rule_kind) { - trace!( - "Path {} rejected by `RuleKind::RejectFilesByGlob`", - current_path.display() - ); - - return (accepted, accepted_ancestors); - } - - let is_dir = metadata.is_dir(); - - if is_dir - && process_and_maybe_reject_by_directory_rules( - ¤t_path, - &acceptance_per_rule_kind, - &mut accept_by_children_dir, - maybe_to_keep_walking, - ) { - trace!( - "Path {} rejected by rule `RuleKind::RejectIfChildrenDirectoriesArePresent`", - current_path.display(), - ); - return (accepted, accepted_ancestors); - } - - if rejected_by_accept_glob(&acceptance_per_rule_kind) { - trace!( - "Path {} reject because it didn't passed in any AcceptFilesByGlob rules", - current_path.display() - ); - return (accepted, accepted_ancestors); - } - - if accept_by_children_dir.unwrap_or(true) { - accept_ancestors( - current_path, - metadata, - root, - &mut accepted, - iso_file_path_factory, - &mut accepted_ancestors, - errors, - ); - } - - (accepted, accepted_ancestors) - }, - ); - - ( - accepted, - accepted_ancestors - .into_iter() - .map(|(ancestor_iso_file_path, ancestor_path)| async move { - fs::metadata(&ancestor_path) - .await - .map_err(|e| { - indexer::NonCriticalError::Metadata( - FileIOError::from((&ancestor_path, e)).to_string(), - ) - }) - .and_then(|metadata| { - FilePathMetadata::from_path(&ancestor_path, &metadata) - .map(|metadata| { - WalkingEntry { - iso_file_path: ancestor_iso_file_path, - metadata, - } - .into() - }) - 
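// Illustrative sketch of the concurrent metadata pass, with a simplified error type: each found
// path becomes a future, the futures are awaited together through `futures_concurrency`'s `Join`
// (the same `.collect::<Vec<_>>().join().await` shape as above), and failures are downgraded to
// entries in a non-critical error list instead of aborting the walk.
use futures_concurrency::future::Join;
use std::path::PathBuf;

async fn collect_sizes(paths: Vec<PathBuf>, errors: &mut Vec<String>) -> Vec<(PathBuf, u64)> {
    paths
        .into_iter()
        .map(|path| async move {
            tokio::fs::metadata(&path)
                .await
                .map(|metadata| metadata.len())
                .map_err(|e| format!("{}: {e}", path.display()))
                .map(|size| (path, size))
        })
        .collect::<Vec<_>>()
        .join()
        .await
        .into_iter()
        .filter_map(|res| res.map_err(|e| errors.push(e)).ok())
        .collect()
}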
.map_err(|e| indexer::NonCriticalError::FilePathMetadata(e.to_string())) - }) - }) - .collect::>() - .join() - .await - .into_iter() - .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) - .collect(), - ) -} - -fn process_and_maybe_reject_by_directory_rules( - current_path: &Path, - acceptance_per_rule_kind: &HashMap>, - accept_by_children_dir: &mut Option, - maybe_to_keep_walking: &mut Option>, -) -> bool { - // If it is a directory, first we check if we must reject it and its children entirely - if rejected_by_children_directories(acceptance_per_rule_kind) { - return true; - } - - // Then we check if we must accept it and its children - if let Some(accepted_by_children_rules) = - acceptance_per_rule_kind.get(&RuleKind::AcceptIfChildrenDirectoriesArePresent) - { - if accepted_by_children_rules.iter().any(|accept| *accept) { - *accept_by_children_dir = Some(true); - } - - // If it wasn't accepted then we mark as rejected - if accept_by_children_dir.is_none() { - trace!( - "Path {} rejected because it didn't passed in any AcceptIfChildrenDirectoriesArePresent rule", - current_path.display() - ); - *accept_by_children_dir = Some(false); - } - } - - // Then we mark this directory to maybe be walked in too - if let Some(ref mut to_keep_walking) = maybe_to_keep_walking { - to_keep_walking.push(ToWalkEntry { - path: current_path.to_path_buf(), - parent_dir_accepted_by_its_children: *accept_by_children_dir, - }); - } - - false -} - -fn accept_ancestors( - current_path: PathBuf, - metadata: InnerMetadata, - root: &Path, - accepted: &mut HashMap, - iso_file_path_factory: &impl IsoFilePathFactory, - accepted_ancestors: &mut HashMap, PathBuf>, - errors: &mut Vec, -) { - // If the ancestors directories wasn't indexed before, now we do - for ancestor in current_path - .ancestors() - .skip(1) // Skip the current directory as it was already indexed - .take_while(|&ancestor| ancestor != root) - { - if let Ok(iso_file_path) = iso_file_path_factory - .build(ancestor, true) - .map_err(|e| errors.push(indexer::NonCriticalError::IsoFilePath(e.to_string()).into())) - { - match accepted_ancestors.entry(iso_file_path) { - Entry::Occupied(_) => { - // If we already accepted this ancestor, then it will contain - // also all if its ancestors too, so we can stop here - break; - } - Entry::Vacant(entry) => { - trace!("Accepted ancestor {}", ancestor.display()); - entry.insert(ancestor.to_path_buf()); - } - } - } - } - - accepted.insert(current_path, metadata); -} - -fn rejected_by_accept_glob(acceptance_per_rule_kind: &HashMap>) -> bool { - acceptance_per_rule_kind - .get(&RuleKind::AcceptFilesByGlob) - .map_or(false, |accept_rules| { - accept_rules.iter().all(|accept| !accept) - }) -} - -fn rejected_by_children_directories( - acceptance_per_rule_kind: &HashMap>, -) -> bool { - acceptance_per_rule_kind - .get(&RuleKind::RejectIfChildrenDirectoriesArePresent) - .map_or(false, |reject_results| { - reject_results.iter().any(|reject| !reject) - }) -} - -fn rejected_by_reject_glob(acceptance_per_rule_kind: &HashMap>) -> bool { - acceptance_per_rule_kind - .get(&RuleKind::RejectFilesByGlob) - .map_or(false, |reject_results| { - reject_results.iter().any(|reject| !reject) - }) -} - -async fn gather_file_paths_to_remove( - accepted_paths: &mut HashMap, - entry_iso_file_path: &IsolatedFilePathData<'_>, - iso_file_path_factory: &impl IsoFilePathFactory, - db_proxy: &impl WalkerDBProxy, - errors: &mut Vec, -) -> (Vec, Vec) { - let (walking, to_delete_params) = accepted_paths - .drain() - .filter_map(|(path, metadata)| 
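// Illustrative sketch of the ancestor bookkeeping above, tracking paths only: every accepted
// entry also registers its parent directories up to the location root, and the loop stops early
// once an ancestor is already known, because that ancestor's own ancestors were registered
// together with it.
use std::collections::HashSet;
use std::path::{Path, PathBuf};

fn accept_with_ancestors(accepted: &mut HashSet<PathBuf>, root: &Path, path: &Path) {
    for ancestor in path.ancestors().skip(1).take_while(|&a| a != root) {
        // `insert` returns false when the ancestor was already present
        if !accepted.insert(ancestor.to_path_buf()) {
            break;
        }
    }
    accepted.insert(path.to_path_buf());
}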
{ - iso_file_path_factory - .build(&path, metadata.is_dir()) - .map(|iso_file_path| { - let params = file_path::WhereParam::from(&iso_file_path); - - ( - WalkingEntry { - iso_file_path, - metadata: FilePathMetadata::from(metadata), - }, - params, - ) - }) - .map_err(|e| { - errors.push(indexer::NonCriticalError::IsoFilePath(e.to_string()).into()); - }) - .ok() - }) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - // We continue the function even if we fail to fetch `file_path`s to remove, - // the DB will have old `file_path`s but at least this is better than - // don't adding the newly indexed paths - let to_remove_entries = db_proxy - .fetch_file_paths_to_remove(entry_iso_file_path, to_delete_params) - .await - .map_err(|e| errors.push(e.into())) - .unwrap_or_default(); - - (walking, to_remove_entries) -} - -#[cfg(test)] -mod tests { - use super::*; - - use sd_core_indexer_rules::{IndexerRule, RulePerKind}; - use sd_task_system::{TaskOutput, TaskStatus, TaskSystem}; - - use chrono::Utc; - use futures_concurrency::future::FutureGroup; - use globset::{Glob, GlobSetBuilder}; - use lending_stream::{LendingStream, StreamExt}; - use tempfile::{tempdir, TempDir}; - use tokio::{fs, io::AsyncWriteExt}; - use tracing::debug; - use tracing_test::traced_test; - - #[derive(Debug, Clone)] - struct DummyIsoPathFactory { - root_path: Arc, - } - - impl IsoFilePathFactory for DummyIsoPathFactory { - fn build( - &self, - path: impl AsRef, - is_dir: bool, - ) -> Result, FilePathError> { - IsolatedFilePathData::new(0, self.root_path.as_ref(), path, is_dir).map_err(Into::into) - } - } - - #[derive(Debug, Clone)] - struct DummyDBProxy; - - impl WalkerDBProxy for DummyDBProxy { - async fn fetch_file_paths( - &self, - _: Vec, - ) -> Result, indexer::Error> { - Ok(vec![]) - } - - async fn fetch_file_paths_to_remove( - &self, - _: &IsolatedFilePathData<'_>, - _: Vec, - ) -> Result, indexer::NonCriticalError> { - Ok(vec![]) - } - } - - fn new_indexer_rule( - name: impl Into, - default: bool, - rules: Vec, - ) -> IndexerRule { - IndexerRule { - id: None, - name: name.into(), - default, - rules, - date_created: Utc::now(), - date_modified: Utc::now(), - } - } - - async fn prepare_location() -> TempDir { - // root - // |__ rust_project - // | |__ .git - // | | |__ - // | |__ .gitignore - // | |__ ignorable.file - // | |__ Cargo.toml - // | |__ src - // | | |__ main.rs - // | |__ target - // | | |__ debug - // | | |__ main - // | |__ partial - // | | |__ ignoreme - // | | |__ readme - // | |__ inner - // | |__ node_project - // | |__ .git - // | | |__ - // | |__ .gitignore - // | |__ ignorable.file - // | |__ package.json - // | |__ src - // | | |__ App.tsx - // | |__ node_modules - // | |__ react - // | |__ package.json - // |__ photos - // |__ photo1.png - // |__ photo2.jpg - // |__ photo3.jpeg - // |__ text.txt - - let root = tempdir().unwrap(); - let root_path = root.path(); - let rust_project = root_path.join("rust_project"); - let inner_project = root_path.join("inner"); - let node_project = inner_project.join("node_project"); - let photos = root_path.join("photos"); - - fs::create_dir(&rust_project).await.unwrap(); - fs::create_dir(&inner_project).await.unwrap(); - fs::create_dir(&node_project).await.unwrap(); - fs::create_dir(&photos).await.unwrap(); - - // Inner directory partially ignored by git - let partial_dir = rust_project.join("partial"); - fs::create_dir(&partial_dir).await.unwrap(); - fs::File::create(partial_dir.join("ignoreme")) - .await - .unwrap(); - 
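// Illustrative sketch of the "degrade, don't abort" handling above, with simplified types: a
// failure to fetch the `file_path`s to remove is recorded as a non-critical error and an empty
// list is used instead, since stale rows are preferable to dropping the freshly indexed paths.
fn fetch_or_empty(result: Result<Vec<String>, String>, errors: &mut Vec<String>) -> Vec<String> {
    result.map_err(|e| errors.push(e)).unwrap_or_default()
}

fn main() {
    let mut errors = Vec::new();
    let to_remove = fetch_or_empty(Err("db timeout".into()), &mut errors);
    assert!(to_remove.is_empty());
    assert_eq!(errors.len(), 1);
}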
fs::File::create(partial_dir.join("readme")).await.unwrap(); - - // Making rust and node projects a git repository - fs::create_dir(rust_project.join(".git")).await.unwrap(); - let gitignore = rust_project.join(".gitignore"); - let mut file = fs::File::create(gitignore).await.unwrap(); - file.write_all(b"*.file\n/target\npartial/ignoreme") - .await - .unwrap(); - fs::create_dir(node_project.join(".git")).await.unwrap(); - let gitignore = node_project.join(".gitignore"); - let mut file = fs::File::create(gitignore).await.unwrap(); - file.write_all(b"ignorable.file").await.unwrap(); - - // Populating rust project - fs::File::create(rust_project.join("Cargo.toml")) - .await - .unwrap(); - fs::File::create(rust_project.join("ignorable.file")) - .await - .unwrap(); - let rust_src_dir = rust_project.join("src"); - fs::create_dir(&rust_src_dir).await.unwrap(); - fs::File::create(rust_src_dir.join("main.rs")) - .await - .unwrap(); - let rust_target_dir = rust_project.join("target"); - fs::create_dir(&rust_target_dir).await.unwrap(); - let rust_build_dir = rust_target_dir.join("debug"); - fs::create_dir(&rust_build_dir).await.unwrap(); - fs::File::create(rust_build_dir.join("main")).await.unwrap(); - - // Populating node project - fs::File::create(node_project.join("package.json")) - .await - .unwrap(); - fs::File::create(node_project.join("ignorable.file")) - .await - .unwrap(); - let node_src_dir = node_project.join("src"); - fs::create_dir(&node_src_dir).await.unwrap(); - fs::File::create(node_src_dir.join("App.tsx")) - .await - .unwrap(); - let node_modules = node_project.join("node_modules"); - fs::create_dir(&node_modules).await.unwrap(); - let node_modules_dep = node_modules.join("react"); - fs::create_dir(&node_modules_dep).await.unwrap(); - fs::File::create(node_modules_dep.join("package.json")) - .await - .unwrap(); - - // Photos directory - for photo in ["photo1.png", "photo2.jpg", "photo3.jpeg", "text.txt"] { - fs::File::create(photos.join(photo)).await.unwrap(); - } - - root - } - - async fn run_test( - root_path: &Path, - indexer_ruler: IndexerRuler, - expected: HashSet, - ) { - let system = TaskSystem::new(); - - let handle = system - .dispatch( - WalkDirTask::new_deep( - root_path.to_path_buf(), - Arc::new(root_path.to_path_buf()), - indexer_ruler, - DummyIsoPathFactory { - root_path: Arc::new(root_path.to_path_buf()), - }, - DummyDBProxy, - system.get_dispatcher(), - ) - .unwrap(), - ) - .await; - - let mut group = FutureGroup::new(); - - group.insert(handle); - - let mut group = group.lend_mut(); - - let mut actual_set = HashSet::new(); - - let mut ancestors = HashSet::new(); - - while let Some((group, task_result)) = group.next().await { - let TaskStatus::Done((_task_id, TaskOutput::Out(output))) = task_result.unwrap() else { - panic!("unexpected task output") - }; - - let WalkTaskOutput { - to_create, - accepted_ancestors, - errors, - handles, - .. 
- } = *output.downcast::().unwrap(); - - assert!(errors.is_empty(), "errors: {errors:#?}"); - - actual_set.extend(to_create); - ancestors.extend(accepted_ancestors); - - for handle in handles { - group.insert(handle); - } - } - - for actual in &actual_set { - ancestors.remove(actual); - } - - if !ancestors.is_empty() { - debug!("Adding ancestors to actual: {:#?}", ancestors); - actual_set.extend(ancestors); - } - - assert_eq!( - actual_set, - expected, - "Expected \\ Actual: {:#?};\n Actual \\ Expected: {:#?}", - expected.difference(&actual_set), - actual_set.difference(&expected) - ); - } - - #[tokio::test] - #[traced_test] - async fn test_walk_without_rules() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: 
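// Illustrative sketch of why these set comparisons work even though every expected entry gets a
// random `pub_id`: `WalkedEntry` compares and hashes only by its isolated file path, so two
// entries for the same path are considered equal regardless of their identifiers. Simplified
// stand-in type below.
use std::collections::HashSet;
use std::hash::{Hash, Hasher};

#[derive(Debug)]
struct Entry {
    pub_id: u128,     // random per construction, ignored by Eq/Hash
    iso_path: String, // the entry's identity
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.iso_path == other.iso_path
    }
}

impl Eq for Entry {}

impl Hash for Entry {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.iso_path.hash(state);
    }
}

fn main() {
    let a = Entry { pub_id: 1, iso_path: "photos/photo1.png".into() };
    let b = Entry { pub_id: 2, iso_path: "photos/photo1.png".into() };
    let set: HashSet<_> = [a].into_iter().collect();
    assert!(set.contains(&b)); // same path, different pub_id, still counted as present
}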
f(root_path.join("photos/photo2.jpg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/text.txt"), false), metadata }, - ] - .into_iter() - .collect::>(); - - run_test(root_path, IndexerRuler::default(), expected).await; - } - - #[tokio::test] - #[traced_test] - async fn test_only_photos() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo2.jpg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, - ] - .into_iter() - .collect::>(); - - run_test( - root_path, - IndexerRuler::new(vec![new_indexer_rule( - "only photos", - false, - vec![RulePerKind::AcceptFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{*.png,*.jpg,*.jpeg}").unwrap()) - .build() - .unwrap(), - )], - )]), - expected, - ) - .await; - } - - #[tokio::test] - #[traced_test] - async fn test_git_repos() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - WalkedEntry { pub_id, 
maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, - ] - .into_iter() - .collect::>(); - - run_test( - root_path, - IndexerRuler::new(vec![new_indexer_rule( - "git repos", - false, - vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( - HashSet::from([".git".to_string()]), - )], - )]), - expected, - ) - .await; - } - - #[tokio::test] - #[traced_test] - async fn git_repos_without_deps_or_build_dirs() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - ] - .into_iter() - .collect::>(); - - run_test( - root_path, - IndexerRuler::new(vec![ - new_indexer_rule( - "git repos", - false, - vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( - HashSet::from([".git".into()]), - )], - ), - 
new_indexer_rule( - "reject node_modules", - false, - vec![RulePerKind::RejectFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{**/node_modules/*,**/node_modules}").unwrap()) - .build() - .unwrap(), - )], - ), - new_indexer_rule( - "reject rust build dir", - false, - vec![RulePerKind::RejectFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{**/target/*,**/target}").unwrap()) - .build() - .unwrap(), - )], - ), - ]), - expected, - ) - .await; - } -} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker/entry.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker/entry.rs new file mode 100644 index 000000000..4e460f972 --- /dev/null +++ b/core/crates/heavy-lifting/src/indexer/tasks/walker/entry.rs @@ -0,0 +1,93 @@ +use sd_core_file_path_helper::{FilePathMetadata, IsolatedFilePathData}; + +use sd_core_prisma_helpers::FilePathPubId; +use sd_prisma::prisma::file_path; + +use std::{ + hash::{Hash, Hasher}, + path::{Path, PathBuf}, +}; + +use serde::{Deserialize, Serialize}; + +/// `WalkedEntry` represents a single path in the filesystem +#[derive(Debug, Serialize, Deserialize)] +pub struct WalkedEntry { + pub pub_id: FilePathPubId, + pub maybe_object_id: file_path::object_id::Type, + pub iso_file_path: IsolatedFilePathData<'static>, + pub metadata: FilePathMetadata, +} + +impl PartialEq for WalkedEntry { + fn eq(&self, other: &Self) -> bool { + self.iso_file_path == other.iso_file_path + } +} + +impl Eq for WalkedEntry {} + +impl Hash for WalkedEntry { + fn hash(&self, state: &mut H) { + self.iso_file_path.hash(state); + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct WalkingEntry { + pub(super) iso_file_path: IsolatedFilePathData<'static>, + pub(super) metadata: FilePathMetadata, +} + +impl From for WalkedEntry { + fn from( + WalkingEntry { + iso_file_path, + metadata, + }: WalkingEntry, + ) -> Self { + Self { + pub_id: FilePathPubId::new(), + maybe_object_id: None, + iso_file_path, + metadata, + } + } +} + +impl> From<(PubId, file_path::object_id::Type, WalkingEntry)> + for WalkedEntry +{ + fn from( + ( + pub_id, + maybe_object_id, + WalkingEntry { + iso_file_path, + metadata, + }, + ): (PubId, file_path::object_id::Type, WalkingEntry), + ) -> Self { + Self { + pub_id: pub_id.into(), + maybe_object_id, + iso_file_path, + metadata, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ToWalkEntry { + pub(super) path: PathBuf, + pub(super) parent_dir_accepted_by_its_children: Option, +} + +impl> From
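// Illustrative sketch of how the reject rule above behaves, using `globset` directly: the
// `{**/node_modules/*,**/node_modules}` pattern matches both the `node_modules` directory itself
// and everything nested inside it, while ordinary source files pass through.
use globset::{Glob, GlobSetBuilder};

fn main() {
    let reject = GlobSetBuilder::new()
        .add(Glob::new("{**/node_modules/*,**/node_modules}").unwrap())
        .build()
        .unwrap();

    assert!(reject.is_match("inner/node_project/node_modules"));
    assert!(reject.is_match("inner/node_project/node_modules/react/package.json"));
    assert!(!reject.is_match("inner/node_project/src/App.tsx"));
}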
for ToWalkEntry { + fn from(path: P) -> Self { + Self { + path: path.as_ref().into(), + parent_dir_accepted_by_its_children: None, + } + } +} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker/metadata.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker/metadata.rs new file mode 100644 index 000000000..31ad443fc --- /dev/null +++ b/core/crates/heavy-lifting/src/indexer/tasks/walker/metadata.rs @@ -0,0 +1,64 @@ +use crate::indexer; + +use sd_core_file_path_helper::FilePathMetadata; +use sd_core_indexer_rules::MetadataForIndexerRules; + +use std::{fs::Metadata, path::Path}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct InnerMetadata { + pub is_dir: bool, + pub is_symlink: bool, + pub inode: u64, + pub size_in_bytes: u64, + pub hidden: bool, + pub created_at: DateTime, + pub modified_at: DateTime, +} + +impl InnerMetadata { + pub fn new( + path: impl AsRef, + metadata: &Metadata, + ) -> Result { + let FilePathMetadata { + inode, + size_in_bytes, + created_at, + modified_at, + hidden, + } = FilePathMetadata::from_path(path, metadata) + .map_err(|e| indexer::NonCriticalIndexerError::FilePathMetadata(e.to_string()))?; + + Ok(Self { + is_dir: metadata.is_dir(), + is_symlink: metadata.is_symlink(), + inode, + size_in_bytes, + hidden, + created_at, + modified_at, + }) + } +} + +impl MetadataForIndexerRules for InnerMetadata { + fn is_dir(&self) -> bool { + self.is_dir + } +} + +impl From for FilePathMetadata { + fn from(metadata: InnerMetadata) -> Self { + Self { + inode: metadata.inode, + size_in_bytes: metadata.size_in_bytes, + hidden: metadata.hidden, + created_at: metadata.created_at, + modified_at: metadata.modified_at, + } + } +} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker/mod.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker/mod.rs new file mode 100644 index 000000000..5fef9e689 --- /dev/null +++ b/core/crates/heavy-lifting/src/indexer/tasks/walker/mod.rs @@ -0,0 +1,1176 @@ +use crate::{ + indexer::{ + self, + tasks::walker::rules::{apply_indexer_rules, process_rules_results}, + }, + Error, NonCriticalError, +}; + +use sd_core_file_path_helper::{FilePathError, FilePathMetadata, IsolatedFilePathData}; +use sd_core_indexer_rules::{ + seed::{GitIgnoreRules, GITIGNORE}, + IndexerRuler, MetadataForIndexerRules, RuleKind, +}; +use sd_core_prisma_helpers::{file_path_pub_and_cas_ids, file_path_walker}; + +use sd_prisma::prisma::file_path; +use sd_task_system::{ + check_interruption, ExecStatus, Interrupter, IntoAnyTaskOutput, Task, TaskId, +}; +use sd_utils::{ + db::{inode_from_db, inode_to_db}, + error::FileIOError, +}; + +use std::{ + collections::{HashMap, HashSet}, + fmt, + future::Future, + mem, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; + +use chrono::{DateTime, Duration as ChronoDuration, FixedOffset}; +use futures_concurrency::future::Join; +use tokio::{fs, time::Instant}; +use tokio_stream::{wrappers::ReadDirStream, StreamExt}; +use tracing::{instrument, trace, Level}; + +mod entry; +mod metadata; +mod rules; +mod save_state; + +pub use entry::{ToWalkEntry, WalkedEntry}; + +use entry::WalkingEntry; +use metadata::InnerMetadata; + +pub trait IsoFilePathFactory: Clone + Send + Sync + fmt::Debug + 'static { + fn build( + &self, + path: impl AsRef, + is_dir: bool, + ) -> Result, FilePathError>; +} + +pub trait WalkerDBProxy: Clone + Send + Sync + fmt::Debug + 'static { + fn fetch_file_paths( + &self, + found_paths: Vec, + ) -> impl Future, 
indexer::Error>> + Send; + + fn fetch_file_paths_to_remove( + &self, + parent_iso_file_path: &IsolatedFilePathData<'_>, + existing_inodes: HashSet>, + unique_location_id_materialized_path_name_extension_params: Vec, + ) -> impl Future< + Output = Result, indexer::NonCriticalIndexerError>, + > + Send; +} + +#[derive(Debug)] +pub struct Walker +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + // Task control + id: TaskId, + is_shallow: bool, + + // Received input args + entry: ToWalkEntry, + root: Arc, + entry_iso_file_path: IsolatedFilePathData<'static>, + indexer_ruler: IndexerRuler, + + // Inner state + stage: WalkerStage, + + // Dependencies + iso_file_path_factory: IsoPathFactory, + db_proxy: DBProxy, + + // Non critical errors that happened during the task execution + errors: Vec, + + // Time spent walking through the received directory + scan_time: Duration, +} + +/// [`Walker`] Task output +#[derive(Debug)] +pub struct Output +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + /// Entries found in the file system that need to be created in database + pub to_create: Vec, + /// Entries found in the file system that need to be updated in database + pub to_update: Vec, + /// Entries found in the file system that need to be removed from database + pub to_remove: Vec, + /// Entries found in the file system that will not be indexed + pub non_indexed_paths: Vec, + /// Ancestors of entries that were indexed + pub accepted_ancestors: HashSet, + /// Errors that happened during the task execution + pub errors: Vec, + /// Directory that was indexed + pub directory_iso_file_path: IsolatedFilePathData<'static>, + /// Total size of the directory that was indexed + pub total_size: u64, + /// Task handles that were dispatched to run `WalkDir` tasks for inner directories + pub keep_walking_tasks: Vec>, + /// Time spent walking through the received directory + pub scan_time: Duration, +} + +#[async_trait::async_trait] +impl Task for Walker +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + fn id(&self) -> TaskId { + self.id + } + + fn with_priority(&self) -> bool { + // If we're running in shallow mode, then we want priority + self.is_shallow + } + + #[instrument( + skip_all, + fields( + task_id = %self.id, + walked_entry = %self.entry.path.display(), + is_shallow = self.is_shallow, + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above + async fn run(&mut self, interrupter: &Interrupter) -> Result { + let is_shallow = self.is_shallow; + let Self { + root, + entry: ToWalkEntry { + path, + parent_dir_accepted_by_its_children, + }, + entry_iso_file_path, + iso_file_path_factory, + indexer_ruler, + db_proxy, + stage, + errors, + scan_time, + .. 
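// Illustrative sketch of the proxy-trait shape used above, with simplified names: declaring the
// method as returning `impl Future<Output = _> + Send` keeps the returned future thread-safe for
// the task system, while implementors (like the dummy proxy in the tests below) can still write a
// plain `async fn`.
use std::future::Future;

trait DbProxy: Send + Sync {
    fn fetch_ids(&self, names: Vec<String>) -> impl Future<Output = Result<Vec<u64>, String>> + Send;
}

#[derive(Debug, Clone)]
struct DummyProxy;

impl DbProxy for DummyProxy {
    async fn fetch_ids(&self, _names: Vec<String>) -> Result<Vec<u64>, String> {
        Ok(vec![])
    }
}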
+ } = self; + + let start_time = Instant::now(); + + let ( + to_create, + to_update, + to_remove, + non_indexed_paths, + accepted_ancestors, + total_size, + keep_walking_tasks, + ) = loop { + match stage { + WalkerStage::Start => { + trace!("Preparing git indexer rules for walking root"); + if indexer_ruler.has_system(&GITIGNORE) { + if let Some(rules) = + GitIgnoreRules::get_rules_if_in_git_repo(root.as_ref(), path).await + { + trace!("Found gitignore rules to follow"); + indexer_ruler.extend(rules.map(Into::into)); + } + } + + *stage = WalkerStage::Walking { + read_dir_stream: ReadDirStream::new(fs::read_dir(&path).await.map_err( + |e| { + indexer::Error::FileIO( + (&path, e, "Failed to open directory to read its entries") + .into(), + ) + }, + )?), + found_paths: Vec::new(), + }; + trace!("Starting to walk!"); + } + + WalkerStage::Walking { + read_dir_stream, + found_paths, + } => { + trace!("Walking..."); + while let Some(res) = read_dir_stream.next().await { + match res { + Ok(dir_entry) => { + found_paths.push(dir_entry.path()); + trace!( + new_path = %dir_entry.path().display(), + total_paths = found_paths.len(), + "Found path;" + ); + } + Err(e) => { + errors.push(NonCriticalError::Indexer( + indexer::NonCriticalIndexerError::FailedDirectoryEntry( + FileIOError::from((&path, e)).to_string(), + ), + )); + } + } + + check_interruption!(interrupter, start_time, scan_time); + } + + trace!(total_paths = found_paths.len(), "Finished walking!;"); + + *stage = WalkerStage::CollectingMetadata { + found_paths: mem::take(found_paths), + }; + + check_interruption!(interrupter, start_time, scan_time); + } + + WalkerStage::CollectingMetadata { found_paths } => { + trace!("Collecting metadata for found paths"); + *stage = WalkerStage::CheckingIndexerRules { + paths_and_metadatas: collect_metadata(found_paths, errors).await, + }; + trace!("Finished collecting metadata!"); + + check_interruption!(interrupter, start_time, scan_time); + } + + WalkerStage::CheckingIndexerRules { + paths_and_metadatas, + } => { + trace!("Checking indexer rules for found paths"); + *stage = WalkerStage::ProcessingRulesResults { + paths_metadatas_and_acceptance: apply_indexer_rules( + paths_and_metadatas, + indexer_ruler, + errors, + ) + .await, + }; + trace!("Finished checking indexer rules!"); + + check_interruption!(interrupter, start_time, scan_time); + } + + WalkerStage::ProcessingRulesResults { + paths_metadatas_and_acceptance, + } => { + trace!("Processing rules results"); + let mut maybe_to_keep_walking = (!is_shallow).then(Vec::new); + let (accepted_paths, accepted_ancestors, rejected_paths) = + process_rules_results( + root, + iso_file_path_factory, + *parent_dir_accepted_by_its_children, + paths_metadatas_and_acceptance, + &mut maybe_to_keep_walking, + is_shallow, + errors, + ) + .await; + + trace!( + total_accepted_paths = accepted_paths.len(), + total_accepted_ancestors = accepted_ancestors.len(), + collect_rejected_paths = self.is_shallow, + total_rejected_paths = rejected_paths.len(), + "Finished processing rules results!;" + ); + + *stage = WalkerStage::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths: rejected_paths, + }; + + check_interruption!(interrupter, start_time, scan_time); + } + + WalkerStage::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + } => { + trace!("Gathering file paths to remove"); + let (walking_entries, to_remove_entries) = gather_file_paths_to_remove( 
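// Illustrative sketch of the stage loop above, with simplified stages: progress is kept in
// `self.stage`, each transition is followed by an interruption check (a stand-in for the real
// `check_interruption!` macro here), and a paused task resumes from the last completed stage.
enum Stage {
    Start,
    Process { items: Vec<u64> },
    Finalize { total: u64 },
}

fn interrupted(flag: bool) -> bool {
    flag
}

fn run(stage: &mut Stage, interrupt_flag: bool) -> Result<u64, &'static str> {
    let total = loop {
        match stage {
            Stage::Start => {
                *stage = Stage::Process { items: vec![1, 2, 3] };
                if interrupted(interrupt_flag) {
                    return Err("paused; will resume at Process");
                }
            }
            Stage::Process { items } => {
                let total = items.drain(..).sum();
                *stage = Stage::Finalize { total };
                if interrupted(interrupt_flag) {
                    return Err("paused; will resume at Finalize");
                }
            }
            // From this point onwards the task no longer allows interruption
            Stage::Finalize { total } => break *total,
        }
    };
    Ok(total)
}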
+ accepted_paths, + entry_iso_file_path, + iso_file_path_factory, + db_proxy, + errors, + ) + .await; + trace!("Finished gathering file paths to remove!"); + + *stage = WalkerStage::Finalize { + walking_entries, + to_remove_entries, + maybe_to_keep_walking: mem::take(maybe_to_keep_walking), + accepted_ancestors: mem::take(accepted_ancestors), + non_indexed_paths: mem::take(non_indexed_paths), + }; + + check_interruption!(interrupter, start_time, scan_time); + } + + // From this points onwards, we will not allow to be interrupted anymore + WalkerStage::Finalize { + walking_entries, + to_remove_entries, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + } => { + trace!("Segregating creates and updates"); + let (to_create, to_update, total_size) = + segregate_creates_and_updates(walking_entries, db_proxy).await?; + trace!( + total_to_create = to_create.len(), + total_to_update = to_update.len(), + total_to_remove = to_remove_entries.len(), + total_non_indexed_paths = non_indexed_paths.len(), + total_size, + "Finished segregating creates and updates!;" + ); + + let keep_walking_tasks = keep_walking( + root, + indexer_ruler, + iso_file_path_factory, + db_proxy, + maybe_to_keep_walking.as_mut(), + errors, + ); + + break ( + to_create, + to_update, + mem::take(to_remove_entries), + mem::take(non_indexed_paths), + mem::take(accepted_ancestors), + total_size, + keep_walking_tasks, + ); + } + } + }; + + *scan_time += start_time.elapsed(); + + // Taking out some data as the task is finally complete + Ok(ExecStatus::Done( + Output { + to_create, + to_update, + to_remove, + non_indexed_paths, + accepted_ancestors, + errors: mem::take(errors), + directory_iso_file_path: mem::take(entry_iso_file_path), + total_size, + keep_walking_tasks, + scan_time: *scan_time, + } + .into_output(), + )) + } +} + +#[derive(Debug)] +enum WalkerStage { + Start, + Walking { + read_dir_stream: ReadDirStream, + found_paths: Vec, + }, + CollectingMetadata { + found_paths: Vec, + }, + CheckingIndexerRules { + paths_and_metadatas: HashMap, + }, + ProcessingRulesResults { + paths_metadatas_and_acceptance: + HashMap>)>, + }, + GatheringFilePathsToRemove { + accepted_paths: HashMap, + maybe_to_keep_walking: Option>, + accepted_ancestors: HashSet, + non_indexed_paths: Vec, + }, + Finalize { + walking_entries: Vec, + accepted_ancestors: HashSet, + to_remove_entries: Vec, + maybe_to_keep_walking: Option>, + non_indexed_paths: Vec, + }, +} + +impl Walker +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + pub fn new_deep( + entry: impl Into + Send, + root: Arc, + indexer_ruler: IndexerRuler, + iso_file_path_factory: IsoPathFactory, + db_proxy: DBProxy, + ) -> Result { + let entry = entry.into(); + Ok(Self { + id: TaskId::new_v4(), + root, + indexer_ruler, + entry_iso_file_path: iso_file_path_factory.build(&entry.path, true)?, + iso_file_path_factory, + db_proxy, + stage: WalkerStage::Start, + entry, + is_shallow: false, + errors: Vec::new(), + scan_time: Duration::ZERO, + }) + } +} + +impl Walker +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + pub fn new_shallow( + entry: impl Into + Send, + root: Arc, + indexer_ruler: IndexerRuler, + iso_file_path_factory: IsoPathFactory, + db_proxy: DBProxy, + ) -> Result { + let entry = entry.into(); + Ok(Self { + id: TaskId::new_v4(), + root, + indexer_ruler, + entry_iso_file_path: iso_file_path_factory.build(&entry.path, true)?, + iso_file_path_factory, + db_proxy, + stage: WalkerStage::Start, + entry, + is_shallow: true, + 
+			errors: Vec::new(),
+			scan_time: Duration::ZERO,
+		})
+	}
+}
+
+#[instrument(
+	skip_all,
+	fields(entries_count = walking_entries.len()),
+	err,
+)]
+async fn segregate_creates_and_updates(
+	walking_entries: &mut Vec<WalkingEntry>,
+	db_proxy: &impl WalkerDBProxy,
+) -> Result<(Vec<WalkedEntry>, Vec<WalkedEntry>, u64), Error> {
+	if walking_entries.is_empty() {
+		Ok((vec![], vec![], 0))
+	} else {
+		let iso_paths_already_in_db = db_proxy
+			.fetch_file_paths(
+				walking_entries
+					.iter()
+					.map(|entry| file_path::WhereParam::from(&entry.iso_file_path))
+					.collect(),
+			)
+			.await?
+			.into_iter()
+			.flat_map(|file_path| {
+				IsolatedFilePathData::try_from(file_path.clone())
+					.map(|iso_file_path| (iso_file_path, file_path))
+			})
+			.collect::<HashMap<_, _>>();
+
+		Ok(walking_entries.drain(..).fold(
+			(Vec::new(), Vec::new(), 0),
+			|(mut to_create, mut to_update, mut total_size), entry| {
+				let WalkingEntry {
+					iso_file_path,
+					metadata,
+				} = &entry;
+
+				total_size += metadata.size_in_bytes;
+
+				if let Some(file_path) = iso_paths_already_in_db.get(iso_file_path) {
+					if let (Some(inode), Some(date_modified)) =
+						(&file_path.inode, &file_path.date_modified)
+					{
+						if (
+							inode_from_db(&inode[0..8]) != metadata.inode
+							// Datetimes stored in the DB lose a bit of precision,
+							// so we need to check against a delta
+							// instead of using the != operator
+							|| (
+								DateTime::<FixedOffset>::from(metadata.modified_at) - *date_modified
+									> ChronoDuration::milliseconds(1)
+							)
+							|| file_path.hidden.is_none()
+							|| metadata.hidden != file_path.hidden.unwrap_or_default()
+						)
+						// We ignore the size of directories because it is not reliable; we need to
+						// calculate it ourselves later
+						&& !(
+							iso_file_path.to_parts().is_dir
+							&& metadata.size_in_bytes
+								!= file_path
+									.size_in_bytes_bytes
+									.as_ref()
+									.map(|size_in_bytes_bytes| {
+										u64::from_be_bytes([
+											size_in_bytes_bytes[0],
+											size_in_bytes_bytes[1],
+											size_in_bytes_bytes[2],
+											size_in_bytes_bytes[3],
+											size_in_bytes_bytes[4],
+											size_in_bytes_bytes[5],
+											size_in_bytes_bytes[6],
+											size_in_bytes_bytes[7],
+										])
+									})
+									.unwrap_or_default()
+						) {
+							to_update.push(WalkedEntry::from((
+								&file_path.pub_id,
+								file_path.object_id,
+								entry,
+							)));
+						}
+					}
+				} else {
+					to_create.push(WalkedEntry::from(entry));
+				}
+
+				(to_create, to_update, total_size)
+			},
+		))
+	}
+}
+
+fn keep_walking<DBProxy, IsoPathFactory>(
+	root: &Arc<PathBuf>,
+	indexer_ruler: &IndexerRuler,
+	iso_file_path_factory: &IsoPathFactory,
+	db_proxy: &DBProxy,
+	maybe_to_keep_walking: Option<&mut Vec<ToWalkEntry>>,
+	errors: &mut Vec<NonCriticalError>,
+) -> Vec<Walker<DBProxy, IsoPathFactory>>
+where
+	DBProxy: WalkerDBProxy,
+	IsoPathFactory: IsoFilePathFactory,
+{
+	maybe_to_keep_walking
+		.map(|to_keep_walking| {
+			to_keep_walking
+				.drain(..)
+				.map(|entry| {
+					Walker::new_deep(
+						entry,
+						Arc::clone(root),
+						indexer_ruler.clone(),
+						iso_file_path_factory.clone(),
+						db_proxy.clone(),
+					)
+					.map_err(|e| {
+						indexer::NonCriticalIndexerError::DispatchKeepWalking(e.to_string())
+					})
+				})
+				.filter_map(|res| res.map_err(|e| errors.push(e.into())).ok())
+				.collect()
+		})
+		.unwrap_or_default()
+}
+
+async fn collect_metadata(
+	found_paths: &mut Vec<PathBuf>,
+	errors: &mut Vec<NonCriticalError>,
+) -> HashMap<PathBuf, InnerMetadata> {
+	found_paths
+		.drain(..)
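+		// NOTE: one `fs::metadata` future is built per found path; they are collected into
+		// a Vec and awaited concurrently with `.join()`, and any failure is recorded as a
+		// non-critical error instead of aborting the whole walk.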
+ .map(|current_path| async move { + fs::metadata(¤t_path) + .await + .map_err(|e| { + indexer::NonCriticalIndexerError::Metadata( + FileIOError::from((¤t_path, e)).to_string(), + ) + }) + .and_then(|metadata| { + InnerMetadata::new(¤t_path, &metadata) + .map(|metadata| (current_path, metadata)) + }) + }) + .collect::>() + .join() + .await + .into_iter() + .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) + .collect() +} + +async fn gather_file_paths_to_remove( + accepted_paths: &mut HashMap, + entry_iso_file_path: &IsolatedFilePathData<'_>, + iso_file_path_factory: &impl IsoFilePathFactory, + db_proxy: &impl WalkerDBProxy, + errors: &mut Vec, +) -> (Vec, Vec) { + let mut existing_inodes = HashSet::new(); + + let (walking, to_delete_params) = accepted_paths + .drain() + .filter_map(|(path, metadata)| { + iso_file_path_factory + .build(&path, metadata.is_dir()) + .map(|iso_file_path| { + let params = file_path::WhereParam::from(&iso_file_path); + existing_inodes.insert(inode_to_db(metadata.inode)); + + ( + WalkingEntry { + iso_file_path, + metadata: FilePathMetadata::from(metadata), + }, + params, + ) + }) + .map_err(|e| { + errors + .push(indexer::NonCriticalIndexerError::IsoFilePath(e.to_string()).into()); + }) + .ok() + }) + .unzip::<_, _, Vec<_>, Vec<_>>(); + + // We continue the function even if we fail to fetch `file_path`s to remove, + // the DB will have old `file_path`s but at least this is better than + // don't adding the newly indexed paths + let to_remove_entries = db_proxy + .fetch_file_paths_to_remove(entry_iso_file_path, existing_inodes, to_delete_params) + .await + .map_err(|e| errors.push(e.into())) + .unwrap_or_default(); + + (walking, to_remove_entries) +} + +#[cfg(test)] +mod tests { + use super::*; + + use sd_core_indexer_rules::{IndexerRule, RulePerKind}; + use sd_core_prisma_helpers::FilePathPubId; + use sd_task_system::{TaskOutput, TaskStatus, TaskSystem}; + + use chrono::Utc; + use futures::stream::FuturesUnordered; + use globset::{Glob, GlobSetBuilder}; + use lending_stream::{LendingStream, StreamExt}; + use tempfile::{tempdir, TempDir}; + use tokio::{fs, io::AsyncWriteExt}; + use tracing::debug; + use tracing_test::traced_test; + + #[derive(Debug, Clone)] + struct DummyIsoPathFactory { + root_path: Arc, + } + + impl IsoFilePathFactory for DummyIsoPathFactory { + fn build( + &self, + path: impl AsRef, + is_dir: bool, + ) -> Result, FilePathError> { + IsolatedFilePathData::new(0, self.root_path.as_ref(), path, is_dir).map_err(Into::into) + } + } + + #[derive(Debug, Clone)] + struct DummyDBProxy; + + impl WalkerDBProxy for DummyDBProxy { + async fn fetch_file_paths( + &self, + _: Vec, + ) -> Result, indexer::Error> { + Ok(vec![]) + } + + async fn fetch_file_paths_to_remove( + &self, + _: &IsolatedFilePathData<'_>, + _: HashSet>, + _: Vec, + ) -> Result, indexer::NonCriticalIndexerError> { + Ok(vec![]) + } + } + + fn new_indexer_rule( + name: impl Into, + default: bool, + rules: Vec, + ) -> IndexerRule { + IndexerRule { + id: None, + name: name.into(), + default, + rules, + date_created: Utc::now(), + date_modified: Utc::now(), + } + } + + #[allow(clippy::cognitive_complexity)] + async fn prepare_location() -> TempDir { + // root + // |__ rust_project + // | |__ .git + // | | |__ + // | |__ .gitignore + // | |__ ignorable.file + // | |__ Cargo.toml + // | |__ src + // | | |__ main.rs + // | |__ target + // | | |__ debug + // | | |__ main + // | |__ partial + // | | |__ ignoreme + // | | |__ readme + // | |__ inner + // | |__ node_project + // | |__ .git 
+ // | | |__ + // | |__ .gitignore + // | |__ ignorable.file + // | |__ package.json + // | |__ src + // | | |__ App.tsx + // | |__ node_modules + // | |__ react + // | |__ package.json + // |__ photos + // |__ photo1.png + // |__ photo2.jpg + // |__ photo3.jpeg + // |__ text.txt + + let root = tempdir().unwrap(); + let root_path = root.path(); + let rust_project = root_path.join("rust_project"); + let inner_project = root_path.join("inner"); + let node_project = inner_project.join("node_project"); + let photos = root_path.join("photos"); + + fs::create_dir(&rust_project).await.unwrap(); + fs::create_dir(&inner_project).await.unwrap(); + fs::create_dir(&node_project).await.unwrap(); + fs::create_dir(&photos).await.unwrap(); + + // Inner directory partially ignored by git + let partial_dir = rust_project.join("partial"); + fs::create_dir(&partial_dir).await.unwrap(); + fs::File::create(partial_dir.join("ignoreme")) + .await + .unwrap(); + fs::File::create(partial_dir.join("readme")).await.unwrap(); + + // Making rust and node projects a git repository + fs::create_dir(rust_project.join(".git")).await.unwrap(); + let gitignore = rust_project.join(".gitignore"); + let mut file = fs::File::create(gitignore).await.unwrap(); + file.write_all(b"*.file\n/target\npartial/ignoreme") + .await + .unwrap(); + fs::create_dir(node_project.join(".git")).await.unwrap(); + let gitignore = node_project.join(".gitignore"); + let mut file = fs::File::create(gitignore).await.unwrap(); + file.write_all(b"ignorable.file").await.unwrap(); + + // Populating rust project + fs::File::create(rust_project.join("Cargo.toml")) + .await + .unwrap(); + fs::File::create(rust_project.join("ignorable.file")) + .await + .unwrap(); + let rust_src_dir = rust_project.join("src"); + fs::create_dir(&rust_src_dir).await.unwrap(); + fs::File::create(rust_src_dir.join("main.rs")) + .await + .unwrap(); + let rust_target_dir = rust_project.join("target"); + fs::create_dir(&rust_target_dir).await.unwrap(); + let rust_build_dir = rust_target_dir.join("debug"); + fs::create_dir(&rust_build_dir).await.unwrap(); + fs::File::create(rust_build_dir.join("main")).await.unwrap(); + + // Populating node project + fs::File::create(node_project.join("package.json")) + .await + .unwrap(); + fs::File::create(node_project.join("ignorable.file")) + .await + .unwrap(); + let node_src_dir = node_project.join("src"); + fs::create_dir(&node_src_dir).await.unwrap(); + fs::File::create(node_src_dir.join("App.tsx")) + .await + .unwrap(); + let node_modules = node_project.join("node_modules"); + fs::create_dir(&node_modules).await.unwrap(); + let node_modules_dep = node_modules.join("react"); + fs::create_dir(&node_modules_dep).await.unwrap(); + fs::File::create(node_modules_dep.join("package.json")) + .await + .unwrap(); + + // Photos directory + for photo in ["photo1.png", "photo2.jpg", "photo3.jpeg", "text.txt"] { + fs::File::create(photos.join(photo)).await.unwrap(); + } + + root + } + + async fn run_test( + root_path: &Path, + indexer_ruler: IndexerRuler, + expected: HashSet, + ) { + let system = TaskSystem::new(); + + let handle = system + .dispatch( + Walker::new_deep( + root_path.to_path_buf(), + Arc::new(root_path.to_path_buf()), + indexer_ruler, + DummyIsoPathFactory { + root_path: Arc::new(root_path.to_path_buf()), + }, + DummyDBProxy, + ) + .unwrap(), + ) + .await + .unwrap(); + + let group = FuturesUnordered::new(); + + group.push(handle); + + let mut group = group.lend_mut(); + + let mut actual_set = HashSet::new(); + + let mut ancestors = 
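+		// NOTE: each walker task may return `keep_walking_tasks` for accepted
+		// sub-directories; the loop below keeps dispatching those until the whole tree has
+		// been visited, folding every output into `actual_set` and `ancestors`.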
HashSet::new(); + + while let Some((group, task_result)) = group.next().await { + let TaskStatus::Done((_task_id, TaskOutput::Out(output))) = task_result.unwrap() else { + panic!("unexpected task output") + }; + + let Output { + to_create, + accepted_ancestors, + errors, + keep_walking_tasks, + .. + } = *output + .downcast::>() + .unwrap(); + + assert!(errors.is_empty(), "errors: {errors:#?}"); + + actual_set.extend(to_create); + ancestors.extend(accepted_ancestors); + + group.extend(system.dispatch_many(keep_walking_tasks).await.unwrap()); + } + + for actual in &actual_set { + ancestors.remove(actual); + } + + if !ancestors.is_empty() { + debug!(?ancestors, "Adding ancestors to actual"); + actual_set.extend(ancestors); + } + + assert_eq!( + actual_set, + expected, + "Expected \\ Actual: {:#?};\n Actual \\ Expected: {:#?}", + expected.difference(&actual_set), + actual_set.difference(&expected) + ); + } + + #[tokio::test] + #[traced_test] + async fn test_walk_without_rules() { + let root = prepare_location().await; + let root_path = root.path(); + + let metadata = FilePathMetadata { + inode: 0, + size_in_bytes: 0, + created_at: Utc::now(), + modified_at: Utc::now(), + hidden: false, + }; + + let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); + let pub_id = FilePathPubId::new(); + let maybe_object_id = None; + + #[rustfmt::skip] + let expected = [ + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), 
true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos/photo2.jpg"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, + WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/text.txt"), false), metadata }, + ] + .into_iter() + .collect::>(); + + run_test(root_path, IndexerRuler::default(), expected).await; + } + + #[tokio::test] + #[traced_test] + async fn test_only_photos() { + let root = prepare_location().await; + let root_path = root.path(); + + let metadata = FilePathMetadata { + inode: 0, + size_in_bytes: 0, + created_at: Utc::now(), + modified_at: Utc::now(), + hidden: false, + }; + + let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); + let pub_id = FilePathPubId::new(); + let maybe_object_id = None; + + #[rustfmt::skip] + let expected = [ + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("photos/photo2.jpg"), false), metadata }, + WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, + ] + .into_iter() + .collect::>(); + + run_test( + root_path, + IndexerRuler::new(vec![new_indexer_rule( + "only photos", + false, + vec![RulePerKind::AcceptFilesByGlob( + vec![], + GlobSetBuilder::new() + .add(Glob::new("{*.png,*.jpg,*.jpeg}").unwrap()) + .build() + .unwrap(), + )], + )]), + expected, + ) + .await; + } + + #[tokio::test] + #[traced_test] + async fn test_git_repos() { + let root = prepare_location().await; + let root_path = root.path(); + + let metadata = FilePathMetadata { + inode: 0, + size_in_bytes: 0, + created_at: Utc::now(), + modified_at: Utc::now(), + hidden: false, + }; + + let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); + let pub_id = FilePathPubId::new(); + let maybe_object_id = None; + + #[rustfmt::skip] + let expected = [ + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), 
maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, + WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, + ] + .into_iter() + .collect::>(); + + run_test( + root_path, + IndexerRuler::new(vec![new_indexer_rule( + "git repos", + false, + vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( + HashSet::from([".git".to_string()]), + )], + )]), + expected, + ) + .await; + } + + #[tokio::test] + #[traced_test] + async fn git_repos_without_deps_or_build_dirs() { + let root = prepare_location().await; + let root_path = root.path(); + + let metadata = FilePathMetadata { + inode: 0, + size_in_bytes: 0, + created_at: Utc::now(), + modified_at: Utc::now(), + hidden: false, + }; + + let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); + let pub_id = FilePathPubId::new(); + let maybe_object_id = None; + + #[rustfmt::skip] + let expected = [ + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/partial/readme"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: 
f(root_path.join("rust_project/src/main.rs"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.gitignore"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, + WalkedEntry { pub_id: pub_id.clone(), maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, + WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, + ] + .into_iter() + .collect::>(); + + run_test( + root_path, + IndexerRuler::new(vec![ + new_indexer_rule( + "git repos", + false, + vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( + HashSet::from([".git".into()]), + )], + ), + new_indexer_rule( + "reject node_modules", + false, + vec![RulePerKind::RejectFilesByGlob( + vec![], + GlobSetBuilder::new() + .add(Glob::new("{**/node_modules/*,**/node_modules}").unwrap()) + .build() + .unwrap(), + )], + ), + new_indexer_rule( + "reject rust build dir", + false, + vec![RulePerKind::RejectFilesByGlob( + vec![], + GlobSetBuilder::new() + .add(Glob::new("{**/target/*,**/target}").unwrap()) + .build() + .unwrap(), + )], + ), + ]), + expected, + ) + .await; + } +} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker/rules.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker/rules.rs new file mode 100644 index 000000000..f8f3fd0e1 --- /dev/null +++ b/core/crates/heavy-lifting/src/indexer/tasks/walker/rules.rs @@ -0,0 +1,261 @@ +use crate::{indexer, NonCriticalError}; + +use sd_core_file_path_helper::{FilePathMetadata, IsolatedFilePathData}; +use sd_core_indexer_rules::{IndexerRuler, MetadataForIndexerRules, RuleKind}; + +use sd_utils::error::FileIOError; + +use std::{ + collections::{hash_map::Entry, HashMap, HashSet}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use futures_concurrency::future::Join; +use tokio::fs; +use tracing::{instrument, trace}; + +use super::{ + entry::{ToWalkEntry, WalkingEntry}, + InnerMetadata, IsoFilePathFactory, WalkedEntry, +}; + +pub(super) async fn apply_indexer_rules( + paths_and_metadatas: &mut HashMap, + indexer_ruler: &IndexerRuler, + errors: &mut Vec, +) -> HashMap>)> { + paths_and_metadatas + .drain() + // TODO: Hard ignoring symlinks for now, but this should be configurable + .filter(|(_, metadata)| !metadata.is_symlink) + .map(|(current_path, metadata)| async { + indexer_ruler + .apply_all(¤t_path, &metadata) + .await + .map(|acceptance_per_rule_kind| { + (current_path, (metadata, acceptance_per_rule_kind)) + }) + .map_err(|e| indexer::NonCriticalIndexerError::IndexerRule(e.to_string())) + }) + .collect::>() + .join() + .await + .into_iter() + .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) + .collect() +} + +pub(super) async fn process_rules_results( + root: &Arc, + iso_file_path_factory: &impl IsoFilePathFactory, + parent_dir_accepted_by_its_children: Option, + paths_metadatas_and_acceptance: &mut HashMap< + PathBuf, + (InnerMetadata, HashMap>), + >, + maybe_to_keep_walking: &mut Option>, + 
collect_rejected_paths: bool, + errors: &mut Vec, +) -> ( + HashMap, + HashSet, + Vec, +) { + let (accepted, accepted_ancestors, rejected) = segregate_paths( + root, + iso_file_path_factory, + paths_metadatas_and_acceptance.drain(), + parent_dir_accepted_by_its_children, + maybe_to_keep_walking, + collect_rejected_paths, + errors, + ); + + ( + accepted, + accepted_ancestors + .into_iter() + .map(|(ancestor_iso_file_path, ancestor_path)| async move { + fs::metadata(&ancestor_path) + .await + .map_err(|e| { + indexer::NonCriticalIndexerError::Metadata( + FileIOError::from((&ancestor_path, e)).to_string(), + ) + }) + .and_then(|metadata| { + FilePathMetadata::from_path(&ancestor_path, &metadata) + .map(|metadata| { + WalkingEntry { + iso_file_path: ancestor_iso_file_path, + metadata, + } + .into() + }) + .map_err(|e| { + indexer::NonCriticalIndexerError::FilePathMetadata(e.to_string()) + }) + }) + }) + .collect::>() + .join() + .await + .into_iter() + .filter_map(|res| res.map_err(|e| errors.push(e.into())).ok()) + .collect(), + rejected, + ) +} + +fn segregate_paths( + root: &Arc, + iso_file_path_factory: &impl IsoFilePathFactory, + paths_metadatas_and_acceptance: impl IntoIterator< + Item = (PathBuf, (InnerMetadata, HashMap>)), + >, + parent_dir_accepted_by_its_children: Option, + maybe_to_keep_walking: &mut Option>, + collect_rejected_paths: bool, + errors: &mut Vec, +) -> ( + HashMap, + HashMap, PathBuf>, + Vec, +) { + let root = root.as_ref(); + + let mut accepted = HashMap::new(); + let mut accepted_ancestors = HashMap::new(); + let mut rejected = Vec::new(); + + for (current_path, (metadata, acceptance_per_rule_kind)) in paths_metadatas_and_acceptance { + // Accept by children has three states, + // None if we don't now yet or if this check doesn't apply + // Some(true) if this check applies and it passes + // Some(false) if this check applies and it was rejected + // and we pass the current parent state to its children + let mut accept_by_children_dir = parent_dir_accepted_by_its_children; + + if !reject_path( + ¤t_path, + &metadata, + &acceptance_per_rule_kind, + &mut accept_by_children_dir, + maybe_to_keep_walking, + ) && accept_by_children_dir.unwrap_or(true) + { + accept_path_and_ancestors( + current_path, + metadata, + root, + &mut accepted, + iso_file_path_factory, + &mut accepted_ancestors, + errors, + ); + + continue; + } + + if collect_rejected_paths { + rejected.push(current_path); + } + } + + (accepted, accepted_ancestors, rejected) +} + +#[instrument(skip_all, fields(current_path = %current_path.display()))] +fn reject_path( + current_path: &Path, + metadata: &InnerMetadata, + acceptance_per_rule_kind: &HashMap>, + accept_by_children_dir: &mut Option, + maybe_to_keep_walking: &mut Option>, +) -> bool { + IndexerRuler::rejected_by_reject_glob(acceptance_per_rule_kind) + || IndexerRuler::rejected_by_git_ignore(acceptance_per_rule_kind) + || (metadata.is_dir() + && process_and_maybe_reject_by_directory_rules( + current_path, + acceptance_per_rule_kind, + accept_by_children_dir, + maybe_to_keep_walking, + )) || IndexerRuler::rejected_by_accept_glob(acceptance_per_rule_kind) +} + +fn process_and_maybe_reject_by_directory_rules( + current_path: &Path, + acceptance_per_rule_kind: &HashMap>, + accept_by_children_dir: &mut Option, + maybe_to_keep_walking: &mut Option>, +) -> bool { + // If it is a directory, first we check if we must reject it and its children entirely + if IndexerRuler::rejected_by_children_directories(acceptance_per_rule_kind) { + return true; + } + + // Then we 
check if we must accept it and its children + if let Some(accepted_by_children_rules) = + acceptance_per_rule_kind.get(&RuleKind::AcceptIfChildrenDirectoriesArePresent) + { + if accepted_by_children_rules.iter().any(|accept| *accept) { + *accept_by_children_dir = Some(true); + } + + // If it wasn't accepted then we mark as rejected + if accept_by_children_dir.is_none() { + trace!( + "Rejected because it didn't passed in any \ + `RuleKind::AcceptIfChildrenDirectoriesArePresent` rule", + ); + *accept_by_children_dir = Some(false); + } + } + + // Then we mark this directory to maybe be walked in too + if let Some(ref mut to_keep_walking) = maybe_to_keep_walking { + to_keep_walking.push(ToWalkEntry { + path: current_path.to_path_buf(), + parent_dir_accepted_by_its_children: *accept_by_children_dir, + }); + } + + false +} + +fn accept_path_and_ancestors( + current_path: PathBuf, + metadata: InnerMetadata, + root: &Path, + accepted: &mut HashMap, + iso_file_path_factory: &impl IsoFilePathFactory, + accepted_ancestors: &mut HashMap, PathBuf>, + errors: &mut Vec, +) { + // If the ancestors directories wasn't indexed before, now we do + for ancestor in current_path + .ancestors() + .skip(1) // Skip the current directory as it was already indexed + .take_while(|&ancestor| ancestor != root) + { + if let Ok(iso_file_path) = iso_file_path_factory.build(ancestor, true).map_err(|e| { + errors.push(indexer::NonCriticalIndexerError::IsoFilePath(e.to_string()).into()); + }) { + match accepted_ancestors.entry(iso_file_path) { + Entry::Occupied(_) => { + // If we already accepted this ancestor, then it will contain + // also all if its ancestors too, so we can stop here + break; + } + Entry::Vacant(entry) => { + trace!(ancestor = %ancestor.display(), "Accepted ancestor"); + entry.insert(ancestor.to_path_buf()); + } + } + } + } + + accepted.insert(current_path, metadata); +} diff --git a/core/crates/heavy-lifting/src/indexer/tasks/walker/save_state.rs b/core/crates/heavy-lifting/src/indexer/tasks/walker/save_state.rs new file mode 100644 index 000000000..2dd66fc50 --- /dev/null +++ b/core/crates/heavy-lifting/src/indexer/tasks/walker/save_state.rs @@ -0,0 +1,219 @@ +use crate::{Error, NonCriticalError}; + +use sd_core_file_path_helper::IsolatedFilePathData; +use sd_core_indexer_rules::{IndexerRuler, RuleKind}; +use sd_core_prisma_helpers::file_path_pub_and_cas_ids; + +use std::{ + collections::{HashMap, HashSet}, + path::PathBuf, + sync::Arc, + time::Duration, +}; + +use sd_task_system::{SerializableTask, TaskId}; +use serde::{Deserialize, Serialize}; + +use super::{ + entry::{ToWalkEntry, WalkingEntry}, + metadata::InnerMetadata, + IsoFilePathFactory, WalkedEntry, Walker, WalkerDBProxy, WalkerStage, +}; + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct WalkDirSaveState { + id: TaskId, + is_shallow: bool, + + entry: ToWalkEntry, + root: Arc, + entry_iso_file_path: IsolatedFilePathData<'static>, + + stage: WalkerStageSaveState, + + errors: Vec, + scan_time: Duration, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) enum WalkerStageSaveState { + Start, + CollectingMetadata { + found_paths: Vec, + }, + CheckingIndexerRules { + paths_and_metadatas: HashMap, + }, + ProcessingRulesResults { + paths_metadatas_and_acceptance: + HashMap>)>, + }, + GatheringFilePathsToRemove { + accepted_paths: HashMap, + maybe_to_keep_walking: Option>, + accepted_ancestors: HashSet, + non_indexed_paths: Vec, + }, + Finalize { + walking_entries: Vec, + accepted_ancestors: HashSet, + to_remove_entries: Vec, + 
maybe_to_keep_walking: Option>, + non_indexed_paths: Vec, + }, +} + +impl From for WalkerStageSaveState { + fn from(stage: WalkerStage) -> Self { + match stage { + // We can't store the current state of `ReadDirStream` so we start again from the beginning + WalkerStage::Start | WalkerStage::Walking { .. } => Self::Start, + WalkerStage::CollectingMetadata { found_paths } => { + Self::CollectingMetadata { found_paths } + } + WalkerStage::CheckingIndexerRules { + paths_and_metadatas, + } => Self::CheckingIndexerRules { + paths_and_metadatas, + }, + WalkerStage::ProcessingRulesResults { + paths_metadatas_and_acceptance, + } => Self::ProcessingRulesResults { + paths_metadatas_and_acceptance, + }, + WalkerStage::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + } => Self::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + }, + WalkerStage::Finalize { + walking_entries, + accepted_ancestors, + to_remove_entries, + maybe_to_keep_walking, + non_indexed_paths, + } => Self::Finalize { + walking_entries, + accepted_ancestors, + to_remove_entries, + maybe_to_keep_walking, + non_indexed_paths, + }, + } + } +} + +impl From for WalkerStage { + fn from(value: WalkerStageSaveState) -> Self { + match value { + WalkerStageSaveState::Start => Self::Start, + WalkerStageSaveState::CollectingMetadata { found_paths } => { + Self::CollectingMetadata { found_paths } + } + WalkerStageSaveState::CheckingIndexerRules { + paths_and_metadatas, + } => Self::CheckingIndexerRules { + paths_and_metadatas, + }, + WalkerStageSaveState::ProcessingRulesResults { + paths_metadatas_and_acceptance, + } => Self::ProcessingRulesResults { + paths_metadatas_and_acceptance, + }, + WalkerStageSaveState::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + } => Self::GatheringFilePathsToRemove { + accepted_paths, + maybe_to_keep_walking, + accepted_ancestors, + non_indexed_paths, + }, + WalkerStageSaveState::Finalize { + walking_entries, + accepted_ancestors, + to_remove_entries, + maybe_to_keep_walking, + non_indexed_paths, + } => Self::Finalize { + walking_entries, + accepted_ancestors, + to_remove_entries, + maybe_to_keep_walking, + non_indexed_paths, + }, + } + } +} + +impl SerializableTask for Walker +where + DBProxy: WalkerDBProxy, + IsoPathFactory: IsoFilePathFactory, +{ + type SerializeError = rmp_serde::encode::Error; + type DeserializeError = rmp_serde::decode::Error; + type DeserializeCtx = (IndexerRuler, DBProxy, IsoPathFactory); + + async fn serialize(self) -> Result, Self::SerializeError> { + let Self { + id, + entry, + root, + entry_iso_file_path, + stage, + errors, + scan_time, + is_shallow, + .. 
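+			// NOTE: `..` intentionally skips `indexer_ruler`, `iso_file_path_factory` and
+			// `db_proxy`; they are not serialized and are re-injected on `deserialize` via
+			// `DeserializeCtx = (IndexerRuler, DBProxy, IsoPathFactory)`.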
+ } = self; + rmp_serde::to_vec_named(&WalkDirSaveState { + id, + is_shallow, + entry, + root, + entry_iso_file_path, + stage: stage.into(), + errors, + scan_time, + }) + } + + async fn deserialize( + data: &[u8], + (indexer_ruler, db_proxy, iso_file_path_factory): Self::DeserializeCtx, + ) -> Result { + rmp_serde::from_slice(data).map( + |WalkDirSaveState { + id, + entry, + root, + entry_iso_file_path, + stage, + errors, + scan_time, + is_shallow, + }| Self { + id, + entry, + root, + entry_iso_file_path, + indexer_ruler, + iso_file_path_factory, + db_proxy, + stage: stage.into(), + errors, + scan_time, + is_shallow, + }, + ) + } +} diff --git a/core/crates/heavy-lifting/src/job_system/error.rs b/core/crates/heavy-lifting/src/job_system/error.rs index af212ef4e..98c5b8f8e 100644 --- a/core/crates/heavy-lifting/src/job_system/error.rs +++ b/core/crates/heavy-lifting/src/job_system/error.rs @@ -1,5 +1,4 @@ -use crate::Error; - +use sd_task_system::{DispatcherShutdownError, Task}; use sd_utils::error::FileIOError; use prisma_client_rust::QueryError; @@ -17,9 +16,6 @@ pub enum JobSystemError { already_running_id: JobId, }, - #[error("job canceled: ")] - Canceled(JobId), - #[error("failed to load job reports from database to resume jobs: {0}")] LoadReportsForResume(#[from] QueryError), @@ -34,9 +30,6 @@ pub enum JobSystemError { #[error(transparent)] Report(#[from] ReportError), - - #[error(transparent)] - Processing(#[from] Error), } impl From for rspc::Error { @@ -45,17 +38,36 @@ impl From for rspc::Error { JobSystemError::NotFound(_) => { Self::with_cause(rspc::ErrorCode::NotFound, e.to_string(), e) } + JobSystemError::AlreadyRunning { .. } => { Self::with_cause(rspc::ErrorCode::Conflict, e.to_string(), e) } - JobSystemError::Canceled(_) => { - Self::with_cause(rspc::ErrorCode::ClientClosedRequest, e.to_string(), e) - } - JobSystemError::Processing(e) => e.into(), JobSystemError::Report(e) => e.into(), _ => Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e), } } } + +#[derive(thiserror::Error, Debug)] +pub enum DispatcherError { + #[error("job canceled: ")] + JobCanceled(JobId), + #[error("system entered on shutdown mode ", .0.len())] + Shutdown(Vec>>), +} + +#[derive(Debug, thiserror::Error)] +pub enum JobErrorOrDispatcherError> { + #[error(transparent)] + JobError(#[from] JobError), + #[error(transparent)] + Dispatcher(#[from] DispatcherError), +} + +impl From> for DispatcherError { + fn from(DispatcherShutdownError(tasks): DispatcherShutdownError) -> Self { + Self::Shutdown(tasks) + } +} diff --git a/core/crates/heavy-lifting/src/job_system/job.rs b/core/crates/heavy-lifting/src/job_system/job.rs index 4006481fd..91b3173b1 100644 --- a/core/crates/heavy-lifting/src/job_system/job.rs +++ b/core/crates/heavy-lifting/src/job_system/job.rs @@ -9,11 +9,14 @@ use sd_task_system::{ use std::{ collections::{hash_map::DefaultHasher, VecDeque}, + fmt, hash::{Hash, Hasher}, marker::PhantomData, + ops::{Deref, DerefMut}, path::Path, pin::pin, sync::Arc, + time::Duration, }; use async_channel as chan; @@ -28,12 +31,14 @@ use specta::Type; use strum::{Display, EnumString}; use tokio::{ spawn, - sync::{watch, Mutex}, + sync::{oneshot, watch, Mutex}, + time::Instant, }; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, instrument, trace, warn, Instrument, Level}; use uuid::Uuid; use super::{ + error::DispatcherError, report::{ Report, ReportBuilder, ReportInputMetadata, ReportMetadata, ReportOutputMetadata, Status, }, @@ -49,12 +54,27 @@ pub enum JobName { 
FileIdentifier, MediaProcessor, // TODO: Add more job names as needed + Copy, + Move, + Delete, + Erase, + FileValidator, } pub enum ReturnStatus { Completed(JobReturn), Shutdown(Result>, rmp_serde::encode::Error>), - Canceled, + Canceled(JobReturn), +} + +impl fmt::Debug for ReturnStatus { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Completed(job_return) => f.debug_tuple("Completed").field(job_return).finish(), + Self::Shutdown(_) => f.write_str("Shutdown()"), + Self::Canceled(job_return) => f.debug_tuple("Canceled").field(job_return).finish(), + } + } } pub enum ProgressUpdate { @@ -80,53 +100,69 @@ pub trait OuterContext: Send + Sync + Clone + 'static { fn sync(&self) -> &Arc; fn invalidate_query(&self, query: &'static str); fn query_invalidator(&self) -> impl Fn(&'static str) + Send + Sync; - fn progress(&self, updates: Vec); - fn progress_msg(&self, msg: impl Into) { - self.progress(vec![ProgressUpdate::Message(msg.into())]); - } fn report_update(&self, update: UpdateEvent); fn get_data_directory(&self) -> &Path; } +pub trait JobContext: OuterContext { + fn new(report: Report, ctx: OuterCtx) -> Self; + fn progress( + &self, + updates: impl IntoIterator + Send, + ) -> impl Future + Send; + fn progress_msg(&self, msg: impl Into) -> impl Future + Send { + let msg = msg.into(); + async move { + self.progress([ProgressUpdate::Message(msg)]).await; + } + } + fn report(&self) -> impl Future + Send> + Send; + fn report_mut(&self) -> impl Future + Send> + Send; + fn get_outer_ctx(&self) -> OuterCtx; +} + pub trait Job: Send + Sync + Hash + 'static { const NAME: JobName; #[allow(unused_variables)] - fn resume_tasks( + fn resume_tasks( &mut self, dispatcher: &JobTaskDispatcher, - ctx: &impl OuterContext, + ctx: &impl JobContext, serialized_tasks: SerializedTasks, ) -> impl Future> + Send { async move { Ok(()) } } - fn run( + fn run( self, dispatcher: JobTaskDispatcher, - ctx: Ctx, + ctx: impl JobContext, ) -> impl Future> + Send; } -pub trait IntoJob +pub trait IntoJob where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { - fn into_job(self) -> Box>; + fn into_job(self) -> Box>; } -impl IntoJob for J +impl IntoJob for J where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { - fn into_job(self) -> Box> { + fn into_job(self) -> Box> { let id = JobId::new_v4(); Box::new(JobHolder { id, job: self, + run_time: Duration::ZERO, report: ReportBuilder::new(id, J::NAME).build(), next_jobs: VecDeque::new(), _ctx: PhantomData, @@ -134,12 +170,13 @@ where } } -impl IntoJob for JobBuilder +impl IntoJob for JobEnqueuer where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { - fn into_job(self) -> Box> { + fn into_job(self) -> Box> { self.build() } } @@ -147,7 +184,7 @@ where #[derive(Debug)] pub struct JobReturn { data: JobOutputData, - metadata: Option, + metadata: Vec, non_critical_errors: Vec, } @@ -164,7 +201,7 @@ impl Default for JobReturn { fn default() -> Self { Self { data: JobOutputData::Empty, - metadata: None, + metadata: vec![], non_critical_errors: vec![], } } @@ -183,8 +220,8 @@ impl JobReturnBuilder { } #[must_use] - pub fn with_metadata(mut self, metadata: impl Into) -> Self { - self.job_return.metadata = Some(metadata.into()); + pub fn with_metadata(mut self, metadata: impl Into>) -> Self { + self.job_return.metadata = 
metadata.into(); self } @@ -215,6 +252,13 @@ pub struct JobOutput { } impl JobOutput { + #[instrument( + skip_all, + fields( + name = %report.name, + non_critical_errors_count = non_critical_errors.len(), + ) + )] pub fn prepare_output_and_report( JobReturn { data, @@ -225,23 +269,18 @@ impl JobOutput { ) -> Self { if non_critical_errors.is_empty() { report.status = Status::Completed; - debug!("Job completed", report.id, report.name); + debug!("Job completed"); } else { report.status = Status::CompletedWithErrors; - report.non_critical_errors = non_critical_errors - .iter() - .map(ToString::to_string) - .collect(); + report.non_critical_errors.extend(non_critical_errors); warn!( - "Job completed with errors: {non_critical_errors:#?}", - report.id, report.name + non_critical_errors = ?report.non_critical_errors, + "Job completed with errors;", ); } - if let Some(metadata) = metadata { - report.metadata.push(ReportMetadata::Output(metadata)); - } + report.metadata.extend(metadata.into_iter().map(Into::into)); report.completed_at = Some(Utc::now()); @@ -251,7 +290,7 @@ impl JobOutput { job_name: report.name, data, metadata: report.metadata.clone(), - non_critical_errors, + non_critical_errors: report.non_critical_errors.clone(), } } } @@ -259,33 +298,36 @@ impl JobOutput { #[derive(Debug, Serialize, Type)] pub enum JobOutputData { Empty, - // TODO: Add more types + // TODO: Add more types as needed } -pub struct JobBuilder +pub struct JobEnqueuer where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { id: JobId, job: J, report_builder: ReportBuilder, - next_jobs: VecDeque>>, - _ctx: PhantomData, + next_jobs: VecDeque>>, + _ctx: PhantomData, } -impl JobBuilder +impl JobEnqueuer where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { - pub fn build(self) -> Box> { + fn build(self) -> Box> { Box::new(JobHolder { id: self.id, job: self.job, + run_time: Duration::ZERO, report: self.report_builder.build(), - next_jobs: VecDeque::new(), - _ctx: PhantomData, + next_jobs: self.next_jobs, + _ctx: self._ctx, }) } @@ -319,10 +361,10 @@ where } #[must_use] - pub fn enqueue_next(mut self, next: impl Job + SerializableJob) -> Self { + pub fn enqueue_next(mut self, next: impl Job + SerializableJob) -> Self { let next_job_order = self.next_jobs.len() + 1; - let mut child_job_builder = JobBuilder::new(next).with_parent_id(self.id); + let mut child_job_builder = JobEnqueuer::new(next).with_parent_id(self.id); if let Some(parent_action) = &self.report_builder.action { child_job_builder = @@ -335,40 +377,97 @@ where } } -pub struct JobHolder +pub struct JobHolder where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { pub(super) id: JobId, pub(super) job: J, pub(super) report: Report, - pub(super) next_jobs: VecDeque>>, - pub(super) _ctx: PhantomData, + pub(super) run_time: Duration, + pub(super) next_jobs: VecDeque>>, + pub(super) _ctx: PhantomData, } -pub struct JobHandle { - pub(crate) next_jobs: VecDeque>>, - pub(crate) ctx: Ctx, - pub(crate) report: Report, - pub(crate) commands_tx: chan::Sender, +pub struct JobHandle> { + pub(crate) id: JobId, + pub(crate) start_time: Instant, + pub(crate) run_time: Duration, + pub(crate) is_running: bool, + pub(crate) next_jobs: VecDeque>>, + pub(crate) ctx: JobCtx, + pub(crate) commands_tx: chan::Sender<(Command, oneshot::Sender<()>)>, } -impl 
JobHandle { - pub async fn send_command(&mut self, command: Command) -> Result<(), JobSystemError> { - if self.commands_tx.send(command).await.is_err() { - warn!("Tried to send a {command:?} to a job that was already completed"); +impl> JobHandle { + #[instrument(skip(self, outer_ack_tx), fields(job_id = %self.id))] + pub async fn send_command( + &mut self, + command: Command, + outer_ack_tx: oneshot::Sender>, + ) { + trace!("JobHandle sending command"); + + let (ack_tx, ack_rx) = oneshot::channel(); + + let res = if self.commands_tx.send((command, ack_tx)).await.is_err() { + warn!("Tried to send command to a job that was already completed"); Ok(()) } else { - self.command_children(command).await + ack_rx + .await + .expect("inner ack channel closed before sending response to handle a job command"); + + match self.execute_command(command).await { + Ok(()) => self.command_children(command).await, + Err(e) => Err(e), + } + }; + + if res.is_ok() { + match command { + Command::Pause | Command::Cancel | Command::Shutdown => self.is_running = false, + Command::Resume => self.is_running = true, + } } + + outer_ack_tx + .send(res) + .unwrap_or_else(|_| panic!("ack channel closed before sending {command:?} response")); } - pub async fn command_children(&mut self, command: Command) -> Result<(), JobSystemError> { + #[instrument(skip_all, err)] + async fn execute_command(&mut self, command: Command) -> Result<(), JobSystemError> { let (new_status, completed_at) = match command { Command::Pause => (Status::Paused, None), - Command::Resume => return Ok(()), + Command::Resume => (Status::Running, None), + Command::Cancel => (Status::Canceled, Some(Utc::now())), + Command::Shutdown => { + // We don't need to do anything here, we will handle when the job returns its output + return Ok(()); + } + }; + + { + let mut report = self.ctx.report_mut().await; + + report.status = new_status; + report.completed_at = completed_at; + + report.update(self.ctx.db()).await?; + } + + Ok(()) + } + + #[instrument(skip_all, err)] + async fn command_children(&mut self, command: Command) -> Result<(), JobSystemError> { + let (new_status, completed_at) = match command { + Command::Pause | Command::Shutdown => (Status::Paused, None), + Command::Resume => (Status::Queued, None), Command::Cancel => (Status::Canceled, Some(Utc::now())), }; @@ -379,6 +478,11 @@ impl JobHandle { next_job_report.status = new_status; next_job_report.completed_at = completed_at; + trace!( + %next_job_report.id, + "Parent job sent command to children job;", + ); + next_job_report.update(self.ctx.db()).await }) .collect::>() @@ -388,39 +492,54 @@ impl JobHandle { .map_err(Into::into) } + #[instrument( + skip(self), + fields(job_id = %self.id), + ret(level = Level::TRACE), + err, + )] pub async fn register_start( &mut self, start_time: DateTime, ) -> Result<(), JobSystemError> { - let Self { - next_jobs, - report, - ctx, - .. - } = self; - - report.status = Status::Running; - if report.started_at.is_none() { - report.started_at = Some(start_time); - } + trace!("JobHandle registering start of job"); + let Self { next_jobs, ctx, .. 
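+		// NOTE: `report` is no longer destructured here; it now lives behind the job
+		// context and is accessed through `ctx.report_mut().await`, so command handling
+		// and progress updates share the same report state.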
} = self; let db = ctx.db(); - // If the report doesn't have a created_at date, it's a new report - if report.created_at.is_none() { - report.create(db).await?; - } else { - // Otherwise it can be a job being resumed or a children job that was already been created - report.update(db).await?; + let now = Utc::now(); + + { + let mut report = ctx.report_mut().await; + + report.status = Status::Running; + if report.started_at.is_none() { + report.started_at = Some(start_time); + } + + // If the report doesn't have a created_at date, it's a new report + if report.created_at.is_none() { + report.create(db, now).await?; + } else { + // Otherwise it can be a job being resumed or a children job that was already been created + report.update(db).await?; + } } // Registering children jobs - next_jobs + let res = next_jobs .iter_mut() - .map(|dyn_job| dyn_job.report_mut()) - .map(|next_job_report| async { + .enumerate() + .map(|(idx, dyn_job)| (idx, dyn_job.report_mut())) + .map(|(idx, next_job_report)| async move { + trace!( + %next_job_report.id, + "Parent job registering children;", + ); if next_job_report.created_at.is_none() { - next_job_report.create(db).await + next_job_report + .create(db, now + Duration::from_secs((idx + 1) as u64)) + .await } else { Ok(()) } @@ -429,70 +548,127 @@ impl JobHandle { .try_join() .await .map(|_| ()) - .map_err(Into::into) + .map_err(Into::into); + + ctx.invalidate_query("jobs.isActive"); + ctx.invalidate_query("jobs.reports"); + + res } + #[instrument( + skip_all, + fields( + id = %self.id, + + ), + err + )] pub async fn complete_job( &mut self, job_return: JobReturn, ) -> Result { - let Self { report, ctx, .. } = self; + let Self { ctx, .. } = self; - let output = JobOutput::prepare_output_and_report(job_return, report); + let mut report = ctx.report_mut().await; + + trace!("JobHandle completing"); + + let output = JobOutput::prepare_output_and_report(job_return, &mut report); report.update(ctx.db()).await?; + trace!("JobHandle completed"); + Ok(output) } + #[instrument( + skip(self), + fields( + id = %self.id, + ), + err + )] pub async fn failed_job(&mut self, e: &Error) -> Result<(), JobSystemError> { - let Self { report, ctx, .. } = self; - error!( - "Job failed with a critical error: {e:#?};", - report.id, report.name - ); + trace!("JobHandle registering failed job"); - report.status = Status::Failed; - report.critical_error = Some(e.to_string()); - report.completed_at = Some(Utc::now()); + let db = self.ctx.db(); + { + let mut report = self.ctx.report_mut().await; - report.update(ctx.db()).await?; + error!( + job_name = %report.name, + "Job failed with a critical error;", + ); + + report.status = Status::Failed; + report.critical_error = Some(e.to_string()); + report.completed_at = Some(Utc::now()); + + report.update(db).await?; + } + + trace!("JobHandle sending cancel command to children due to failure"); self.command_children(Command::Cancel).await } - pub async fn shutdown_pause_job(&mut self) -> Result<(), JobSystemError> { - let Self { report, ctx, .. 
} = self; - info!( - "Job paused due to system shutdown, we will pause all children jobs", - report.id, report.name - ); + #[instrument( + skip(self), + fields( + id = %self.id, + ), + err + )] + pub async fn cancel_job( + &mut self, + JobReturn { + data, + metadata, + non_critical_errors, + }: JobReturn, + ) -> Result { + trace!("JobHandle canceling job"); + let db = self.ctx.db(); - report.status = Status::Paused; + let output = { + let mut report = self.ctx.report_mut().await; - report.update(ctx.db()).await?; + debug!( + job_name = %report.name, + "Job canceled, we will cancel all children jobs;", + ); - self.command_children(Command::Pause).await - } + report.status = Status::Canceled; + report.non_critical_errors.extend(non_critical_errors); + report.metadata.extend(metadata.into_iter().map(Into::into)); + report.completed_at = Some(Utc::now()); - pub async fn cancel_job(&mut self) -> Result<(), JobSystemError> { - let Self { report, ctx, .. } = self; - info!( - "Job canceled, we will cancel all children jobs", - report.id, report.name - ); + report.update(db).await?; - report.status = Status::Canceled; - report.completed_at = Some(Utc::now()); + JobOutput { + id: report.id, + status: report.status, + job_name: report.name, + data, + metadata: report.metadata.clone(), + non_critical_errors: report.non_critical_errors.clone(), + } + }; - report.update(ctx.db()).await?; + trace!("JobHandle sending cancel command to children"); - self.command_children(Command::Cancel).await + self.command_children(Command::Cancel).await?; + + Ok(output) } } #[async_trait::async_trait] -pub trait DynJob: Send + Sync + 'static { +pub trait DynJob>: + Send + Sync + 'static +{ fn id(&self) -> JobId; fn job_name(&self) -> JobName; @@ -501,33 +677,34 @@ pub trait DynJob: Send + Sync + 'static { fn report_mut(&mut self) -> &mut Report; - fn set_next_jobs(&mut self, next_jobs: VecDeque>>); + fn set_next_jobs(&mut self, next_jobs: VecDeque>>); - fn next_jobs(&self) -> &VecDeque>>; + fn next_jobs(&self) -> &VecDeque>>; async fn serialize(self: Box) -> Result>, rmp_serde::encode::Error>; fn dispatch( self: Box, base_dispatcher: BaseTaskDispatcher, - ctx: Ctx, + ctx: OuterCtx, done_tx: chan::Sender<(JobId, Result)>, - ) -> JobHandle; + ) -> JobHandle; fn resume( self: Box, base_dispatcher: BaseTaskDispatcher, - ctx: Ctx, + ctx: OuterCtx, serialized_tasks: Option, done_tx: chan::Sender<(JobId, Result)>, - ) -> JobHandle; + ) -> JobHandle; } #[async_trait::async_trait] -impl DynJob for JobHolder +impl DynJob for JobHolder where - J: Job + SerializableJob, - Ctx: OuterContext, + J: Job + SerializableJob, + OuterCtx: OuterContext, + JobCtx: JobContext, { fn id(&self) -> JobId { self.id @@ -548,11 +725,11 @@ where &mut self.report } - fn set_next_jobs(&mut self, next_jobs: VecDeque>>) { + fn set_next_jobs(&mut self, next_jobs: VecDeque>>) { self.next_jobs = next_jobs; } - fn next_jobs(&self) -> &VecDeque>> { + fn next_jobs(&self) -> &VecDeque>> { &self.next_jobs } @@ -560,15 +737,20 @@ where self.job.serialize().await } + #[instrument(skip_all, fields(id = %self.id))] fn dispatch( self: Box, base_dispatcher: BaseTaskDispatcher, - ctx: Ctx, + ctx: OuterCtx, done_tx: chan::Sender<(JobId, Result)>, - ) -> JobHandle { + ) -> JobHandle { let (commands_tx, commands_rx) = chan::bounded(8); - spawn(to_spawn_job( + let ctx = JobCtx::new(self.report, ctx); + + trace!("Dispatching job"); + + spawn(to_spawn_job::( self.id, self.job, ctx.clone(), @@ -579,23 +761,37 @@ where )); JobHandle { + id: self.id, + start_time: 
Instant::now(), + is_running: true, + run_time: Duration::ZERO, next_jobs: self.next_jobs, ctx, - report: self.report, commands_tx, } } + #[instrument( + skip_all, + fields( + id = %self.id, + has_serialized_tasks = %serialized_tasks.is_some(), + ) + )] fn resume( self: Box, base_dispatcher: BaseTaskDispatcher, - ctx: Ctx, + ctx: OuterCtx, serialized_tasks: Option, done_tx: chan::Sender<(JobId, Result)>, - ) -> JobHandle { + ) -> JobHandle { let (commands_tx, commands_rx) = chan::bounded(8); - spawn(to_spawn_job( + let ctx = JobCtx::new(self.report, ctx); + + trace!("Resuming job"); + + spawn(to_spawn_job::( self.id, self.job, ctx.clone(), @@ -606,25 +802,33 @@ where )); JobHandle { + id: self.id, + start_time: Instant::now(), + is_running: true, + run_time: self.run_time, next_jobs: self.next_jobs, ctx, - report: self.report, commands_tx, } } } -async fn to_spawn_job( - id: JobId, - mut job: impl Job, - ctx: Ctx, +#[instrument(name = "job_executor", skip_all, fields(%job_id, name = %J::NAME))] +async fn to_spawn_job( + job_id: JobId, + mut job: J, + ctx: JobCtx, existing_tasks: Option, base_dispatcher: BaseTaskDispatcher, - commands_rx: chan::Receiver, + commands_rx: chan::Receiver<(Command, oneshot::Sender<()>)>, done_tx: chan::Sender<(JobId, Result)>, -) { +) where + OuterCtx: OuterContext, + JobCtx: JobContext, + J: Job, +{ enum StreamMessage { - Commands(Command), + Commands((Command, oneshot::Sender<()>)), NewRemoteController(TaskRemoteController), Done(Result), } @@ -634,12 +838,12 @@ async fn to_spawn_job( let (running_state_tx, running_state_rx) = watch::channel(JobRunningState::Running); let (dispatcher, remote_controllers_rx) = - JobTaskDispatcher::new(base_dispatcher, running_state_rx); + JobTaskDispatcher::new(job_id, base_dispatcher, running_state_rx); if let Some(existing_tasks) = existing_tasks { if let Err(e) = job.resume_tasks(&dispatcher, &ctx, existing_tasks).await { done_tx - .send((id, Err(e))) + .send((job_id, Err(e))) .await .expect("jobs done tx closed on error at resume_tasks"); @@ -647,24 +851,53 @@ async fn to_spawn_job( } } + let (tx, rx) = chan::bounded(1); + + spawn( + async move { + tx.send(job.run::(dispatcher, ctx).await) + .await + .expect("job run channel closed"); + } + .in_current_span(), + ); + + let commands_rx_to_close = commands_rx.clone(); + let mut msgs_stream = pin!(( commands_rx.map(StreamMessage::Commands), - remote_controllers_rx.map(StreamMessage::NewRemoteController), - stream::once(job.run(dispatcher, ctx)).map(StreamMessage::Done), + remote_controllers_rx + .clone() + .map(StreamMessage::NewRemoteController), + stream::once({ + let rx = rx.clone(); + async move { rx.recv().await.expect("job run rx closed") } + }) + .map(StreamMessage::Done), ) .merge()); while let Some(msg) = msgs_stream.next().await { match msg { StreamMessage::NewRemoteController(remote_controller) => { + trace!("new remote controller received"); remote_controllers.push(remote_controller); + trace!("added new remote controller"); } - StreamMessage::Commands(command) => { + StreamMessage::Commands((command, ack_tx)) => { + // Add any possible pending remote controllers to the list + while let Ok(remote_controller) = remote_controllers_rx.try_recv() { + remote_controllers.push(remote_controller); + } + remote_controllers.retain(|controller| !controller.is_done()); match command { Command::Pause => { + trace!("Pausing job"); running_state_tx.send_modify(|state| *state = JobRunningState::Paused); + trace!(tasks_count = remote_controllers.len(), "pausing tasks;"); + 
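+						// NOTE: pausing fans out to every live task controller concurrently;
+						// controllers whose task already finished return
+						// `TaskSystemError::TaskNotFound`, which is tolerated below rather than
+						// treated as a failure.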
remote_controllers .iter() .map(TaskRemoteController::pause) @@ -676,12 +909,18 @@ async fn to_spawn_job( if let Err(e) = res { assert!(matches!(e, TaskSystemError::TaskNotFound(_))); - warn!("Tried to pause a task that was already completed"); + trace!("Tried to pause a task that was already completed"); } }); + + ack_tx.send(()).expect("ack channel closed"); + trace!("paused job"); } + Command::Resume => { + trace!("Resuming job"); running_state_tx.send_modify(|state| *state = JobRunningState::Running); + trace!(tasks_count = remote_controllers.len(), "resuming tasks"); remote_controllers .iter() @@ -694,45 +933,102 @@ async fn to_spawn_job( if let Err(e) = res { assert!(matches!(e, TaskSystemError::TaskNotFound(_))); - warn!("Tried to pause a task that was already completed"); + trace!("Tried to resume a task that was already completed"); } }); + + ack_tx.send(()).expect("ack channel closed"); + trace!("resumed job"); } + Command::Cancel => { + trace!("Canceling job"); + running_state_tx.send_modify(|state| *state = JobRunningState::Canceled); + trace!(tasks_count = remote_controllers.len(), "canceling tasks;"); + remote_controllers .iter() .map(TaskRemoteController::cancel) .collect::>() .join() - .await; - - return done_tx - .send((id, Ok(ReturnStatus::Canceled))) .await - .expect("jobs done tx closed"); + .into_iter() + .for_each(|res| { + if let Err(e) = res { + assert!(matches!(e, TaskSystemError::TaskNotFound(_))); + + trace!("Tried to cancel a task that was already completed"); + } + }); + + trace!("canceled job"); + + commands_rx_to_close.close(); + let res = rx.recv().await.expect("job run rx closed"); + ack_tx.send(()).expect("ack channel closed"); + trace!("Job cancellation done"); + + return finish_job(job_id, res, remote_controllers, done_tx).await; + } + + Command::Shutdown => { + trace!("Shutting down job"); + running_state_tx.send_modify(|state| *state = JobRunningState::Shutdown); + debug!( + tasks_count = remote_controllers.len(), + "shutting down tasks;" + ); + + commands_rx_to_close.close(); + // Just need to wait for the job to finish with the shutdown status + let res = rx.recv().await.expect("job run rx closed"); + ack_tx.send(()).expect("ack channel closed"); + trace!("Job shutdown done"); + + return finish_job(job_id, res, remote_controllers, done_tx).await; } } } StreamMessage::Done(res) => { - #[cfg(debug_assertions)] - { - // Just a sanity check to make sure we don't have any pending tasks left - remote_controllers.retain(|controller| !controller.is_done()); - assert!(remote_controllers.is_empty()); - // Using #[cfg(debug_assertions)] to don't pay this retain cost in release builds - } - - return done_tx.send((id, res)).await.expect("jobs done tx closed"); + trace!("Job done"); + commands_rx_to_close.close(); + return finish_job(job_id, res, remote_controllers, done_tx).await; } } } } +#[instrument(skip(remote_controllers, done_tx))] +async fn finish_job( + job_id: JobId, + job_result: Result, + mut remote_controllers: Vec, + done_tx: chan::Sender<(JobId, Result)>, +) { + trace!("Checking remote controllers"); + #[cfg(debug_assertions)] + { + // Just a sanity check to make sure we don't have any pending tasks left + remote_controllers.retain(|controller| !controller.is_done()); + assert!(remote_controllers.is_empty()); + // Using #[cfg(debug_assertions)] so we don't pay this retain cost in release builds + } + + trace!("Sending job done message"); + + done_tx + .send((job_id, job_result)) + .await + .expect("jobs done tx closed"); +} + #[derive(Debug, Clone,
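Each command sent to the job executor above is now paired with a `oneshot` acknowledgement sender, so the caller only resumes once the pause/resume/cancel/shutdown has actually taken effect. A rough standalone sketch of that command-plus-ack pattern, using tokio's `mpsc` in place of the `async-channel` channel used here and a made-up `Command`/`worker` pair:

use tokio::sync::{mpsc, oneshot};

#[derive(Debug)]
enum Command {
    Pause,
    Resume,
}

// Worker loop: applies each command, then acknowledges it so the caller
// knows the state change has actually been applied.
async fn worker(mut commands_rx: mpsc::Receiver<(Command, oneshot::Sender<()>)>) {
    while let Some((command, ack_tx)) = commands_rx.recv().await {
        match command {
            Command::Pause => { /* pause tasks here */ }
            Command::Resume => { /* resume tasks here */ }
        }
        // Ignore the error case: the caller may have stopped waiting.
        let _ = ack_tx.send(());
    }
}

#[tokio::main]
async fn main() {
    let (commands_tx, commands_rx) = mpsc::channel(8);
    let handle = tokio::spawn(worker(commands_rx));

    let (ack_tx, ack_rx) = oneshot::channel();
    commands_tx
        .send((Command::Pause, ack_tx))
        .await
        .expect("worker stopped");
    ack_rx.await.expect("worker dropped the ack sender");

    drop(commands_tx); // closing the channel lets the worker loop exit
    handle.await.unwrap();
}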
Copy, PartialEq, Eq)] enum JobRunningState { Running, Paused, + Canceled, + Shutdown, } impl Default for JobRunningState { @@ -743,47 +1039,70 @@ impl Default for JobRunningState { #[derive(Debug, Clone)] pub struct JobTaskDispatcher { + job_id: JobId, dispatcher: BaseTaskDispatcher, remote_controllers_tx: chan::Sender, running_state: Arc>>, } impl TaskDispatcher for JobTaskDispatcher { - async fn dispatch_boxed(&self, boxed_task: Box>) -> TaskHandle { - self.wait_for_dispatch_approval().await; + type DispatchError = DispatcherError; - let handle = self.dispatcher.dispatch_boxed(boxed_task).await; + async fn dispatch_boxed( + &self, + boxed_task: Box>, + ) -> Result, Self::DispatchError> { + match self.wait_for_dispatch_approval().await { + DispatchApproval::Canceled => Err(DispatcherError::JobCanceled(self.job_id)), + DispatchApproval::Shutdown => Err(DispatcherError::Shutdown(vec![boxed_task])), + DispatchApproval::Approved => { + let handle = self.dispatcher.dispatch_boxed(boxed_task).await?; - self.remote_controllers_tx - .send(handle.remote_controller()) - .await - .expect("remote controllers tx closed"); + self.remote_controllers_tx + .send(handle.remote_controller()) + .await + .expect("remote controllers tx closed"); - handle + Ok(handle) + } + } } async fn dispatch_many_boxed( &self, boxed_tasks: impl IntoIterator>> + Send, - ) -> Vec> { - self.wait_for_dispatch_approval().await; + ) -> Result>, Self::DispatchError> { + match self.wait_for_dispatch_approval().await { + DispatchApproval::Canceled => Err(DispatcherError::JobCanceled(self.job_id)), + DispatchApproval::Shutdown => { + Err(DispatcherError::Shutdown(boxed_tasks.into_iter().collect())) + } + DispatchApproval::Approved => { + let handles = self.dispatcher.dispatch_many_boxed(boxed_tasks).await?; - let handles = self.dispatcher.dispatch_many_boxed(boxed_tasks).await; + handles + .iter() + .map(|handle| self.remote_controllers_tx.send(handle.remote_controller())) + .collect::>() + .try_join() + .await + .expect("remote controllers tx closed"); - handles - .iter() - .map(|handle| self.remote_controllers_tx.send(handle.remote_controller())) - .collect::>() - .try_join() - .await - .expect("remote controllers tx closed"); - - handles + Ok(handles) + } + } } } +enum DispatchApproval { + Approved, + Canceled, + Shutdown, +} + impl JobTaskDispatcher { fn new( + job_id: JobId, dispatcher: BaseTaskDispatcher, running_state_rx: watch::Receiver, ) -> (Self, chan::Receiver) { @@ -791,6 +1110,7 @@ impl JobTaskDispatcher { ( Self { + job_id, dispatcher, remote_controllers_tx, running_state: Arc::new(Mutex::new(running_state_rx)), @@ -799,12 +1119,36 @@ impl JobTaskDispatcher { ) } - async fn wait_for_dispatch_approval(&self) { - self.running_state - .lock() - .await - .wait_for(|state| *state == JobRunningState::Running) - .await - .expect("job running state watch channel unexpectedly closed"); + async fn wait_for_dispatch_approval(&self) -> DispatchApproval { + { + let mut running_state_rx = self.running_state.lock().await; + + if running_state_rx + .has_changed() + .expect("job running state watch channel unexpectedly closed") + { + trace!("waiting for job running state to change"); + running_state_rx + .wait_for(|state| { + matches!( + *state, + JobRunningState::Running + | JobRunningState::Canceled | JobRunningState::Shutdown + ) + }) + .await + .expect("job running state watch channel unexpectedly closed"); + + let state = { *running_state_rx.borrow() }; + + match state { + JobRunningState::Shutdown => return 
DispatchApproval::Shutdown, + JobRunningState::Canceled => return DispatchApproval::Canceled, + _ => {} + } + } + } + + DispatchApproval::Approved } } diff --git a/core/crates/heavy-lifting/src/job_system/mod.rs b/core/crates/heavy-lifting/src/job_system/mod.rs index a8b552a70..64a315a02 100644 --- a/core/crates/heavy-lifting/src/job_system/mod.rs +++ b/core/crates/heavy-lifting/src/job_system/mod.rs @@ -1,16 +1,22 @@ -use crate::Error; +use crate::{Error, JobContext}; use sd_prisma::prisma::location; use sd_task_system::BaseTaskDispatcher; use sd_utils::error::FileIOError; -use std::{cell::RefCell, collections::hash_map::HashMap, path::Path, sync::Arc}; +use std::{ + cell::RefCell, + collections::hash_map::HashMap, + panic, + path::{Path, PathBuf}, + sync::Arc, +}; use async_channel as chan; use futures::Stream; use futures_concurrency::future::{Join, TryJoin}; use tokio::{fs, spawn, sync::oneshot, task::JoinHandle}; -use tracing::{error, info, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use uuid::Uuid; mod error; @@ -20,8 +26,9 @@ mod runner; mod store; pub mod utils; -use error::JobSystemError; +pub use error::{DispatcherError, JobErrorOrDispatcherError, JobSystemError}; use job::{IntoJob, Job, JobName, JobOutput, OuterContext}; +use report::Report; use runner::{run, JobSystemRunner, RunnerMessage}; use store::{load_jobs, StoredJobEntry}; @@ -36,22 +43,23 @@ pub enum Command { Pause, Resume, Cancel, + Shutdown, } -pub struct JobSystem { - msgs_tx: chan::Sender>, - job_outputs_rx: chan::Receiver<(JobId, Result)>, +pub struct JobSystem> { + msgs_tx: chan::Sender>, + job_outputs_rx: chan::Receiver<(JobId, Result)>, + store_jobs_file: Arc, runner_handle: RefCell>>, } -impl JobSystem { - pub async fn new( +impl> JobSystem { + pub fn new( base_dispatcher: BaseTaskDispatcher, - data_directory: impl AsRef + Send, - previously_existing_contexts: &HashMap, - ) -> Result { + data_directory: impl AsRef, + ) -> Self { let (job_outputs_tx, job_outputs_rx) = chan::unbounded(); - let (job_return_status_tx, job_return_status_rx) = chan::bounded(16); + let (job_done_tx, job_done_rx) = chan::bounded(16); let (msgs_tx, msgs_rx) = chan::bounded(8); let store_jobs_file = Arc::new(data_directory.as_ref().join(PENDING_JOBS_FILE)); @@ -63,8 +71,8 @@ impl JobSystem { while let Err(e) = spawn({ let store_jobs_file = Arc::clone(&store_jobs_file); let base_dispatcher = base_dispatcher.clone(); - let job_return_status_tx = job_return_status_tx.clone(); - let job_return_status_rx = job_return_status_rx.clone(); + let job_return_status_tx = job_done_tx.clone(); + let job_done_rx = job_done_rx.clone(); let job_outputs_tx = job_outputs_tx.clone(); let msgs_rx = msgs_rx.clone(); @@ -77,7 +85,7 @@ impl JobSystem { ), store_jobs_file.as_ref(), msgs_rx, - job_return_status_rx, + job_done_rx, ) .await; } @@ -85,7 +93,7 @@ impl JobSystem { .await { if e.is_panic() { - error!("Job system panicked: {e:#?}"); + error!(?e, "Job system panicked;"); } else { trace!("JobSystemRunner received shutdown signal and will exit..."); break; @@ -97,22 +105,47 @@ impl JobSystem { } }))); - load_stored_job_entries( - store_jobs_file.as_ref(), - previously_existing_contexts, - &msgs_tx, - ) - .await?; - - Ok(Self { + Self { msgs_tx, job_outputs_rx, + store_jobs_file, runner_handle, - }) + } + } + + pub async fn init( + &self, + previously_existing_contexts: &HashMap, + ) -> Result<(), JobSystemError> { + load_stored_job_entries( + &*self.store_jobs_file, + previously_existing_contexts, + &self.msgs_tx, + ) + 
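`wait_for_dispatch_approval` above parks dispatch calls on a `tokio::sync::watch` channel until the job leaves the paused state, then turns the final state into an approval decision. A simplified, self-contained sketch of that watch-channel gate, with toy `State`/`Approval` enums rather than the real `JobRunningState`/`DispatchApproval` handling:

use tokio::sync::watch;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    Running,
    Paused,
    Canceled,
}

enum Approval {
    Approved,
    Canceled,
}

// Wait until the job is no longer paused, then report whether dispatching
// new tasks is still allowed.
async fn wait_for_approval(rx: &mut watch::Receiver<State>) -> Approval {
    let state = *rx
        .wait_for(|state| matches!(state, State::Running | State::Canceled))
        .await
        .expect("state channel closed");

    match state {
        State::Canceled => Approval::Canceled,
        _ => Approval::Approved,
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(State::Paused);

    let waiter = tokio::spawn(async move {
        match wait_for_approval(&mut rx).await {
            Approval::Approved => println!("dispatch approved"),
            Approval::Canceled => println!("job canceled, don't dispatch"),
        }
    });

    // Flip the state after a moment; the waiter wakes up on the change.
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    tx.send_modify(|state| *state = State::Running);
    waiter.await.unwrap();
}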
.await + } + + /// Get a map of all active reports with their respective job ids + /// + /// # Panics + /// + /// Panics only happen if internal channels are unexpectedly closed + pub async fn get_active_reports(&self) -> HashMap { + let (ack_tx, ack_rx) = oneshot::channel(); + self.msgs_tx + .send(RunnerMessage::GetActiveReports { ack_tx }) + .await + .expect("runner msgs channel unexpectedly closed on get active reports request"); + + ack_rx + .await + .expect("ack channel closed before receiving get active reports response") } /// Checks if *any* of the desired jobs is running for the desired location + /// /// # Panics + /// /// Panics only happen if internal channels are unexpectedly closed pub async fn check_running_jobs( &self, @@ -122,7 +155,7 @@ impl JobSystem { let (ack_tx, ack_rx) = oneshot::channel(); self.msgs_tx - .send(RunnerMessage::CheckIfJobAreRunning { + .send(RunnerMessage::CheckIfJobsAreRunning { job_names, location_id, ack_tx, @@ -136,7 +169,9 @@ impl JobSystem { } /// Shutdown the job system + /// /// # Panics + /// /// Panics only happen if internal channels are unexpectedly closed pub async fn shutdown(&self) { if let Some(handle) = self @@ -152,7 +187,7 @@ impl JobSystem { if let Err(e) = handle.await { if e.is_panic() { - error!("JobSystem panicked: {e:#?}"); + error!(?e, "JobSystem panicked;"); } } info!("JobSystem gracefully shutdown"); @@ -162,13 +197,15 @@ impl JobSystem { } /// Dispatch a new job to the system + /// /// # Panics + /// /// Panics only happen if internal channels are unexpectedly closed - pub async fn dispatch>( - &mut self, - job: impl IntoJob + Send, + pub async fn dispatch>( + &self, + job: impl IntoJob + Send, location_id: location::id::Type, - ctx: Ctx, + ctx: OuterCtx, ) -> Result { let dyn_job = job.into_job(); let id = dyn_job.id(); @@ -176,7 +213,7 @@ impl JobSystem { let (ack_tx, ack_rx) = oneshot::channel(); self.msgs_tx .send(RunnerMessage::NewJob { - id, + job_id: id, location_id, dyn_job, ctx, @@ -191,17 +228,35 @@ impl JobSystem { .map(|()| id) } - pub fn receive_job_outputs( - &self, - ) -> impl Stream)> { + /// Check if there are any active jobs for the desired [`OuterContext`] + /// + /// # Panics + /// + /// Panics only happen if internal channels are unexpectedly closed + pub async fn has_active_jobs(&self, ctx: OuterCtx) -> bool { + let ctx_id = ctx.id(); + + let (ack_tx, ack_rx) = oneshot::channel(); + self.msgs_tx + .send(RunnerMessage::HasActiveJobs { ctx_id, ack_tx }) + .await + .expect("runner msgs channel unexpectedly closed on has active jobs request"); + + ack_rx + .await + .expect("ack channel closed before receiving has active jobs response") + } + + pub fn receive_job_outputs(&self) -> impl Stream)> { self.job_outputs_rx.clone() } - async fn send_command(&self, id: JobId, command: Command) -> Result<(), JobSystemError> { + #[instrument(skip(self), err)] + async fn send_command(&self, job_id: JobId, command: Command) -> Result<(), JobSystemError> { let (ack_tx, ack_rx) = oneshot::channel(); self.msgs_tx .send(RunnerMessage::Command { - id, + job_id, command, ack_tx, }) @@ -215,38 +270,48 @@ impl JobSystem { .unwrap_or_else(|_| panic!("ack channel closed before receiving {command:?} response")) } - pub async fn pause(&self, id: JobId) -> Result<(), JobSystemError> { - self.send_command(id, Command::Pause).await + pub async fn pause(&self, job_id: JobId) -> Result<(), JobSystemError> { + self.send_command(job_id, Command::Pause).await } - pub async fn resume(&self, id: JobId) -> Result<(), JobSystemError> { - 
self.send_command(id, Command::Resume).await + pub async fn resume(&self, job_id: JobId) -> Result<(), JobSystemError> { + self.send_command(job_id, Command::Resume).await } - pub async fn cancel(&self, id: JobId) -> Result<(), JobSystemError> { - self.send_command(id, Command::Cancel).await + pub async fn cancel(&self, job_id: JobId) -> Result<(), JobSystemError> { + self.send_command(job_id, Command::Cancel).await } } /// SAFETY: Due to usage of refcell we lost `Sync` impl, but we only use it to have a shutdown method /// receiving `&self` which is called once, and we also use `try_borrow_mut` so we never panic -unsafe impl Sync for JobSystem {} +unsafe impl> Sync + for JobSystem +{ +} -async fn load_stored_job_entries( +async fn load_stored_job_entries>( store_jobs_file: impl AsRef + Send, - previously_existing_job_contexts: &HashMap, - msgs_tx: &chan::Sender>, + previously_existing_job_contexts: &HashMap, + msgs_tx: &chan::Sender>, ) -> Result<(), JobSystemError> { let store_jobs_file = store_jobs_file.as_ref(); let stores_jobs_by_db = rmp_serde::from_slice::>>( - &fs::read(store_jobs_file).await.map_err(|e| { - JobSystemError::StoredJobs(FileIOError::from(( - store_jobs_file, - e, - "Failed to load jobs from disk", - ))) - })?, + &match fs::read(store_jobs_file).await { + Ok(bytes) => bytes, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + debug!("No pending jobs found on disk"); + return Ok(()); + } + Err(e) => { + return Err(JobSystemError::StoredJobs(FileIOError::from(( + store_jobs_file, + e, + "Failed to load jobs from disk", + )))) + } + }, )?; stores_jobs_by_db @@ -254,7 +319,7 @@ async fn load_stored_job_entries( .filter_map(|(ctx_id, entries)| { previously_existing_job_contexts.get(&ctx_id).map_or_else( || { - warn!("Found stored jobs for a database that doesn't exist anymore: "); + warn!(%ctx_id, "Found stored jobs for a database that doesn't exist anymore;"); None }, |ctx| Some((entries, ctx.clone())), @@ -270,7 +335,7 @@ async fn load_stored_job_entries( .await .into_iter() .filter_map(|res| { - res.map_err(|e| error!("Failed to load stored jobs: {e:#?}")) + res.map_err(|e| error!(?e, "Failed to load stored jobs;")) .ok() }) .flat_map(|(stored_jobs, ctx)| { @@ -283,7 +348,7 @@ async fn load_stored_job_entries( msgs_tx .send(RunnerMessage::ResumeStoredJob { - id: dyn_job.id(), + job_id: dyn_job.id(), location_id, dyn_job, ctx, diff --git a/core/crates/heavy-lifting/src/job_system/report.rs b/core/crates/heavy-lifting/src/job_system/report.rs index dbb9af221..ff2e11496 100644 --- a/core/crates/heavy-lifting/src/job_system/report.rs +++ b/core/crates/heavy-lifting/src/job_system/report.rs @@ -1,14 +1,15 @@ -use sd_prisma::prisma::{job, PrismaClient}; +use crate::NonCriticalError; + +use sd_prisma::prisma::{file_path, job, location, PrismaClient}; use sd_utils::db::{maybe_missing, MissingFieldError}; -use std::{collections::HashMap, fmt, str::FromStr}; +use std::{collections::HashMap, fmt, path::PathBuf, str::FromStr}; use chrono::{DateTime, Utc}; use prisma_client_rust::QueryError; use serde::{Deserialize, Serialize}; use specta::Type; use strum::ParseError; -use tracing::error; use super::{job::JobName, JobId}; @@ -22,10 +23,8 @@ pub enum ReportError { InvalidJobStatusInt(i32), #[error("job not found in database: ")] MissingReport(JobId), - #[error("serialization error: {0}")] - Serialization(#[from] rmp_serde::encode::Error), - #[error("deserialization error: {0}")] - Deserialization(#[from] rmp_serde::decode::Error), + #[error("json error: {0}")] + Json(#[from] 
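The pending-jobs file read above now treats a missing file as "nothing to restore" instead of an error. A small standalone sketch of that pattern (the `pending_jobs.bin` file name is only an example):

use std::{io::ErrorKind, path::Path};

use tokio::fs;

// Read a state file that may legitimately not exist yet: absence is not an
// error, it just means there is nothing to restore.
async fn read_state_file(path: impl AsRef<Path>) -> Result<Option<Vec<u8>>, std::io::Error> {
    match fs::read(path).await {
        Ok(bytes) => Ok(Some(bytes)),
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
        Err(e) => Err(e),
    }
}

#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    match read_state_file("pending_jobs.bin").await? {
        Some(bytes) => println!("loaded {} bytes of pending jobs", bytes.len()),
        None => println!("no pending jobs file, starting fresh"),
    }
    Ok(())
}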
serde_json::Error), #[error(transparent)] MissingField(#[from] MissingFieldError), #[error("failed to parse job name from database: {0}")] @@ -44,10 +43,7 @@ impl From for rspc::Error { ReportError::MissingReport(_) => { Self::with_cause(rspc::ErrorCode::NotFound, e.to_string(), e) } - ReportError::Serialization(_) - | ReportError::Deserialization(_) - | ReportError::MissingField(_) - | ReportError::JobNameParse(_) => { + ReportError::Json(_) | ReportError::MissingField(_) | ReportError::JobNameParse(_) => { Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e) } } @@ -55,21 +51,78 @@ impl From for rspc::Error { } #[derive(Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +#[serde(tag = "type", content = "metadata")] pub enum ReportMetadata { Input(ReportInputMetadata), Output(ReportOutputMetadata), } #[derive(Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +#[serde(tag = "type", content = "data")] pub enum ReportInputMetadata { - Placeholder, - // TODO: Add more types + // TODO: Add more variants as needed + Location(location::Data), + SubPath(PathBuf), } #[derive(Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +#[serde(tag = "type", content = "data")] pub enum ReportOutputMetadata { Metrics(HashMap), - // TODO: Add more types + Indexer { + total_paths: (u32, u32), + }, + FileIdentifier { + total_orphan_paths: (u32, u32), + total_objects_created: (u32, u32), + total_objects_linked: (u32, u32), + }, + MediaProcessor { + media_data_extracted: (u32, u32), + media_data_skipped: (u32, u32), + thumbnails_generated: (u32, u32), + thumbnails_skipped: (u32, u32), + }, + Copier { + source_location_id: location::id::Type, + target_location_id: location::id::Type, + sources_file_path_ids: Vec, + target_location_relative_directory_path: PathBuf, + }, + Mover { + source_location_id: location::id::Type, + target_location_id: location::id::Type, + sources_file_path_ids: Vec, + target_location_relative_directory_path: PathBuf, + }, + Deleter { + location_id: location::id::Type, + file_path_ids: Vec, + }, + Eraser { + location_id: location::id::Type, + file_path_ids: Vec, + passes: u32, + }, + FileValidator { + location_id: location::id::Type, + sub_path: Option, + }, +} + +impl From for ReportMetadata { + fn from(value: ReportInputMetadata) -> Self { + Self::Input(value) + } +} + +impl From for ReportMetadata { + fn from(value: ReportOutputMetadata) -> Self { + Self::Output(value) + } } #[derive(Debug, Serialize, Type, Clone)] @@ -80,7 +133,7 @@ pub struct Report { pub metadata: Vec, pub critical_error: Option, - pub non_critical_errors: Vec, + pub non_critical_errors: Vec, pub created_at: Option>, pub started_at: Option>, @@ -111,46 +164,53 @@ impl fmt::Display for Report { impl TryFrom for Report { type Error = ReportError; - fn try_from(data: job::Data) -> Result { + fn try_from( + job::Data { + id, + name, + action, + status, + errors_text: _, // Deprecated + critical_error, + non_critical_errors, + data: _, // Deprecated + metadata, + parent_id, + task_count, + completed_task_count, + date_estimated_completion, + date_created, + date_started, + date_completed, + .. 
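The report metadata enums above switch to serde's adjacently tagged representation (`tag` plus `content`), persisted as JSON rather than MessagePack. A tiny sketch with a made-up `Metadata` enum (not the real `ReportOutputMetadata`) showing the JSON shape this produces:

use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "type", content = "data")]
enum Metadata {
    Metrics { total_paths: (u32, u32) },
    SubPath(std::path::PathBuf),
}

fn main() -> Result<(), serde_json::Error> {
    let metrics = Metadata::Metrics { total_paths: (10, 42) };
    let json = serde_json::to_string(&metrics)?;
    // The variant name goes under "type" (snake_cased), the payload under "data".
    assert_eq!(json, r#"{"type":"metrics","data":{"total_paths":[10,42]}}"#);

    let sub_path = Metadata::SubPath("videos/2024".into());
    println!("{}", serde_json::to_string(&sub_path)?);
    // => {"type":"sub_path","data":"videos/2024"}

    // And it round-trips back into the enum.
    assert_eq!(serde_json::from_str::<Metadata>(&json)?, metrics);
    Ok(())
}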
+ }: job::Data, + ) -> Result { Ok(Self { - id: JobId::from_slice(&data.id).expect("corrupted database"), - name: JobName::from_str(&maybe_missing(data.name, "job.name")?)?, - action: data.action, - - metadata: data - .metadata - .map(|m| { - rmp_serde::from_slice(&m).unwrap_or_else(|e| { - error!("Failed to deserialize job metadata: {e:#?}"); - vec![] - }) - }) - .unwrap_or_default(), - critical_error: data.critical_error, - non_critical_errors: data.non_critical_errors.map_or_else( - Default::default, - |non_critical_errors| { - serde_json::from_slice(&non_critical_errors).unwrap_or_else(|e| { - error!("Failed to deserialize job non-critical errors: {e:#?}"); - vec![] - }) - }, - ), - created_at: data.date_created.map(DateTime::into), - started_at: data.date_started.map(DateTime::into), - completed_at: data.date_completed.map(DateTime::into), - parent_id: data - .parent_id - .map(|id| JobId::from_slice(&id).expect("corrupted database")), - status: Status::try_from(maybe_missing(data.status, "job.status")?) + id: JobId::from_slice(&id).expect("corrupted database"), + name: JobName::from_str(&maybe_missing(name, "job.name")?)?, + action, + metadata: if let Some(metadata) = metadata { + serde_json::from_slice(&metadata)? + } else { + vec![] + }, + critical_error, + non_critical_errors: if let Some(non_critical_errors) = non_critical_errors { + serde_json::from_slice(&non_critical_errors)? + } else { + vec![] + }, + created_at: date_created.map(DateTime::into), + started_at: date_started.map(DateTime::into), + completed_at: date_completed.map(DateTime::into), + parent_id: parent_id.map(|id| JobId::from_slice(&id).expect("corrupted database")), + status: Status::try_from(maybe_missing(status, "job.status")?) .expect("corrupted database"), - task_count: data.task_count.unwrap_or(0), - completed_task_count: data.completed_task_count.unwrap_or(0), + task_count: task_count.unwrap_or(0), + completed_task_count: completed_task_count.unwrap_or(0), phase: String::new(), message: String::new(), - estimated_completion: data - .date_estimated_completion - .map_or_else(Utc::now, DateTime::into), + estimated_completion: date_estimated_completion.map_or_else(Utc::now, DateTime::into), }) } } @@ -178,6 +238,10 @@ impl Report { } } + pub fn push_metadata(&mut self, metadata: ReportOutputMetadata) { + self.metadata.push(metadata.into()); + } + #[must_use] pub fn get_action_name_and_group_key(&self) -> (String, Option) { // actions are formatted like "added_location" or "added_location-1" @@ -197,9 +261,11 @@ impl Report { (action_name, Some(group_key)) } - pub async fn create(&mut self, db: &PrismaClient) -> Result<(), ReportError> { - let now = Utc::now(); - + pub async fn create( + &mut self, + db: &PrismaClient, + created_at: DateTime, + ) -> Result<(), ReportError> { db.job() .create( self.id.as_bytes().to_vec(), @@ -207,11 +273,11 @@ impl Report { [ job::name::set(Some(self.name.to_string())), job::action::set(self.action.clone()), - job::date_created::set(Some(now.into())), - job::metadata::set(Some(rmp_serde::to_vec(&self.metadata)?)), + job::date_created::set(Some(created_at.into())), + job::metadata::set(Some(serde_json::to_vec(&self.metadata)?)), job::status::set(Some(self.status as i32)), job::date_started::set(self.started_at.map(Into::into)), - job::task_count::set(Some(1)), + job::task_count::set(Some(0)), job::completed_task_count::set(Some(0)), ], [self @@ -224,7 +290,7 @@ impl Report { .map_err(ReportError::Create)?; // Only setting created_at after we successfully created the job in DB - 
self.created_at = Some(now); + self.created_at = Some(created_at); Ok(()) } @@ -236,10 +302,10 @@ impl Report { vec![ job::status::set(Some(self.status as i32)), job::critical_error::set(self.critical_error.clone()), - job::non_critical_errors::set(Some(rmp_serde::to_vec( + job::non_critical_errors::set(Some(serde_json::to_vec( &self.non_critical_errors, )?)), - job::metadata::set(Some(rmp_serde::to_vec(&self.metadata)?)), + job::metadata::set(Some(serde_json::to_vec(&self.metadata)?)), job::task_count::set(Some(self.task_count)), job::completed_task_count::set(Some(self.completed_task_count)), job::date_started::set(self.started_at.map(Into::into)), @@ -347,7 +413,7 @@ impl ReportBuilder { #[must_use] pub fn with_metadata(mut self, metadata: ReportInputMetadata) -> Self { - self.metadata.push(ReportMetadata::Input(metadata)); + self.metadata.push(metadata.into()); self } diff --git a/core/crates/heavy-lifting/src/job_system/runner.rs b/core/crates/heavy-lifting/src/job_system/runner.rs index f1ea8f137..ae067fb0b 100644 --- a/core/crates/heavy-lifting/src/job_system/runner.rs +++ b/core/crates/heavy-lifting/src/job_system/runner.rs @@ -1,4 +1,4 @@ -use crate::Error; +use crate::{Error, JobContext}; use sd_prisma::prisma::location; use sd_task_system::BaseTaskDispatcher; @@ -15,19 +15,23 @@ use std::{ use async_channel as chan; use chrono::Utc; use futures::StreamExt; -use futures_concurrency::{future::TryJoin, stream::Merge}; +use futures_concurrency::{ + future::{Join, TryJoin}, + stream::Merge, +}; +use serde_json::json; use tokio::{ fs, sync::oneshot, time::{interval_at, Instant}, }; use tokio_stream::wrappers::IntervalStream; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use uuid::Uuid; use super::{ job::{DynJob, JobHandle, JobName, JobOutput, OuterContext, ReturnStatus}, - report, + report::{self, ReportOutputMetadata}, store::{StoredJob, StoredJobEntry}, Command, JobId, JobSystemError, SerializedTasks, }; @@ -35,61 +39,76 @@ use super::{ const JOBS_INITIAL_CAPACITY: usize = 32; const FIVE_MINUTES: Duration = Duration::from_secs(5 * 60); -pub(super) enum RunnerMessage { +pub(super) enum RunnerMessage> { NewJob { - id: JobId, + job_id: JobId, location_id: location::id::Type, - dyn_job: Box>, - ctx: Ctx, + dyn_job: Box>, + ctx: OuterCtx, ack_tx: oneshot::Sender>, }, ResumeStoredJob { - id: JobId, + job_id: JobId, location_id: location::id::Type, - dyn_job: Box>, - ctx: Ctx, + dyn_job: Box>, + ctx: OuterCtx, serialized_tasks: Option, ack_tx: oneshot::Sender>, }, Command { - id: JobId, + job_id: JobId, command: Command, ack_tx: oneshot::Sender>, }, - CheckIfJobAreRunning { + GetActiveReports { + ack_tx: oneshot::Sender>, + }, + CheckIfJobsAreRunning { job_names: Vec, location_id: location::id::Type, ack_tx: oneshot::Sender, }, Shutdown, + HasActiveJobs { + ctx_id: Uuid, + ack_tx: oneshot::Sender, + }, } -pub(super) struct JobSystemRunner { - base_dispatcher: BaseTaskDispatcher, - handles: HashMap>, +struct JobsWorktables { job_hashes: HashMap, job_hashes_by_id: HashMap, running_jobs_by_job_id: HashMap, running_jobs_set: HashSet<(JobName, location::id::Type)>, jobs_to_store_by_ctx_id: HashMap>, - job_return_status_tx: chan::Sender<(JobId, Result)>, - job_outputs_tx: chan::Sender<(JobId, Result)>, } -impl JobSystemRunner { +pub(super) struct JobSystemRunner> { + on_shutdown_mode: bool, + base_dispatcher: BaseTaskDispatcher, + handles: HashMap>, + worktables: JobsWorktables, + job_return_status_tx: chan::Sender<(JobId, Result)>, + 
job_outputs_tx: chan::Sender<(JobId, Result)>, +} + +impl> JobSystemRunner { pub(super) fn new( base_dispatcher: BaseTaskDispatcher, job_return_status_tx: chan::Sender<(JobId, Result)>, - job_outputs_tx: chan::Sender<(JobId, Result)>, + job_outputs_tx: chan::Sender<(JobId, Result)>, ) -> Self { Self { + on_shutdown_mode: false, base_dispatcher, handles: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), - job_hashes: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), - job_hashes_by_id: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), - running_jobs_by_job_id: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), - running_jobs_set: HashSet::with_capacity(JOBS_INITIAL_CAPACITY), - jobs_to_store_by_ctx_id: HashMap::new(), + worktables: JobsWorktables { + job_hashes: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), + job_hashes_by_id: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), + running_jobs_by_job_id: HashMap::with_capacity(JOBS_INITIAL_CAPACITY), + running_jobs_set: HashSet::with_capacity(JOBS_INITIAL_CAPACITY), + jobs_to_store_by_ctx_id: HashMap::new(), + }, job_return_status_tx, job_outputs_tx, } @@ -97,42 +116,43 @@ impl JobSystemRunner { async fn new_job( &mut self, - id: JobId, + job_id: JobId, location_id: location::id::Type, - dyn_job: Box>, - ctx: Ctx, + dyn_job: Box>, + ctx: OuterCtx, maybe_existing_tasks: Option, ) -> Result<(), JobSystemError> { let Self { base_dispatcher, handles, - job_hashes, - job_hashes_by_id, + worktables: + JobsWorktables { + job_hashes, + job_hashes_by_id, + running_jobs_by_job_id, + running_jobs_set, + .. + }, job_return_status_tx, - running_jobs_by_job_id, - running_jobs_set, .. } = self; - let db = ctx.db(); let job_name = dyn_job.job_name(); let job_hash = dyn_job.hash(); if let Some(&already_running_id) = job_hashes.get(&job_hash) { return Err(JobSystemError::AlreadyRunning { - new_id: id, + new_id: job_id, already_running_id, job_name, }); } - running_jobs_by_job_id.insert(id, (job_name, location_id)); + running_jobs_by_job_id.insert(job_id, (job_name, location_id)); running_jobs_set.insert((job_name, location_id)); - job_hashes.insert(job_hash, id); - job_hashes_by_id.insert(id, job_hash); - - let start_time = Utc::now(); + job_hashes.insert(job_hash, job_id); + job_hashes_by_id.insert(job_id, job_hash); let mut handle = if maybe_existing_tasks.is_some() { dyn_job.resume( @@ -149,174 +169,220 @@ impl JobSystemRunner { ) }; - handle.report.status = report::Status::Running; - if handle.report.started_at.is_none() { - handle.report.started_at = Some(start_time); - } + handle.register_start(Utc::now()).await?; - // If the report doesn't have a created_at date, it's a new report - if handle.report.created_at.is_none() { - handle.report.create(db).await?; - } else { - // Otherwise it can be a job being resumed or a children job that was already been created - handle.report.update(db).await?; - } - - // Registering children jobs - handle - .next_jobs - .iter_mut() - .map(|dyn_job| dyn_job.report_mut()) - .map(|next_job_report| async { - if next_job_report.created_at.is_none() { - next_job_report.create(ctx.db()).await - } else { - Ok(()) - } - }) - .collect::>() - .try_join() - .await?; - - handles.insert(id, handle); + handles.insert(job_id, handle); Ok(()) } - async fn process_command(&mut self, id: JobId, command: Command) -> Result<(), JobSystemError> { - if let Some(handle) = self.handles.get_mut(&id) { - handle.send_command(command).await?; - Ok(()) + async fn get_active_reports(&self) -> HashMap { + self.handles + .iter() + .map(|(job_id, handle)| async { 
(*job_id, handle.ctx.report().await.clone()) }) + .collect::>() + .join() + .await + .into_iter() + .collect() + } + + async fn process_command( + &mut self, + job_id: JobId, + command: Command, + ack_tx: oneshot::Sender>, + ) { + if let Some(handle) = self.handles.get_mut(&job_id) { + match (command, handle.is_running) { + (Command::Pause, false) => { + warn!("Tried to pause a job already paused"); + return ack_tx.send(Ok(())).expect( + "ack channel closed before sending response to already paused job", + ); + } + (Command::Resume, true) => { + warn!("Tried to resume a job already running"); + return ack_tx.send(Ok(())).expect( + "ack channel closed before sending response to already running job", + ); + } + _ => {} + } + match command { + Command::Pause | Command::Cancel | Command::Shutdown => { + handle.is_running = false; + } + Command::Resume => { + handle.is_running = true; + } + } + handle.send_command(command, ack_tx).await; + handle.ctx.invalidate_query("jobs.isActive"); + handle.ctx.invalidate_query("jobs.reports"); } else { - Err(JobSystemError::NotFound(id)) + error!("Job not found"); + ack_tx + .send(Err(JobSystemError::NotFound(job_id))) + .unwrap_or_else(|_| { + panic!("ack channel closed before sending {command:?} response") + }); } } fn is_empty(&self) -> bool { - self.handles.is_empty() && self.job_hashes.is_empty() && self.job_hashes_by_id.is_empty() + self.handles.is_empty() + && self.worktables.job_hashes.is_empty() + && self.worktables.job_hashes_by_id.is_empty() } - fn check_if_job_are_running( + fn total_jobs(&self) -> usize { + self.handles.len() + } + + fn check_if_jobs_are_running( &self, job_names: Vec, location_id: location::id::Type, ) -> bool { - job_names - .into_iter() - .any(|job_name| self.running_jobs_set.contains(&(job_name, location_id))) + job_names.into_iter().any(|job_name| { + self.worktables + .running_jobs_set + .contains(&(job_name, location_id)) + }) } - async fn process_return_status(&mut self, job_id: JobId, status: Result) { + #[instrument(skip_all, fields(%job_id))] + async fn process_return_status( + &mut self, + job_id: JobId, + status: Result, + ) -> Result<(), JobSystemError> { let Self { + on_shutdown_mode, handles, - job_hashes, - job_hashes_by_id, + worktables, job_outputs_tx, job_return_status_tx, base_dispatcher, - jobs_to_store_by_ctx_id, - running_jobs_by_job_id, - running_jobs_set, .. 
} = self; - let job_hash = job_hashes_by_id.remove(&job_id).expect("it must be here"); - let (job_name, location_id) = running_jobs_by_job_id + let job_hash = worktables + .job_hashes_by_id + .remove(&job_id) + .expect("it must be here"); + + let (job_name, location_id) = worktables + .running_jobs_by_job_id .remove(&job_id) .expect("a JobName and location_id must've been inserted in the map with the job id"); - assert!(running_jobs_set.remove(&(job_name, location_id))); - assert!(job_hashes.remove(&job_hash).is_some()); + assert!(worktables.running_jobs_set.remove(&(job_name, location_id))); + assert!(worktables.job_hashes.remove(&job_hash).is_some()); + let mut handle = handles.remove(&job_id).expect("it must be here"); + handle.run_time += handle.start_time.elapsed(); + + handle + .ctx + .report_mut() + .await + .push_metadata(ReportOutputMetadata::Metrics(HashMap::from([( + "job_run_time".into(), + json!(handle.run_time), + )]))); let res = match status { Ok(ReturnStatus::Completed(job_return)) => { try_dispatch_next_job( &mut handle, + location_id, base_dispatcher.clone(), - (job_hashes, job_hashes_by_id), + worktables, handles, job_return_status_tx.clone(), - ); + ) + .await?; - handle.complete_job(job_return).await + handle.complete_job(job_return).await.map_err(Into::into) } - Ok(ReturnStatus::Shutdown(Ok(Some(serialized_job)))) => { - let name = handle.report.name; + Ok(ReturnStatus::Shutdown(res)) => { + match res { + Ok(Some(serialized_job)) => { + let name = { + let db = handle.ctx.db(); + let mut report = handle.ctx.report_mut().await; + if let Err(e) = report.update(db).await { + error!(?e, "Failed to update report on job shutdown;"); + } + report.name + }; - let Ok(next_jobs) = handle - .next_jobs - .into_iter() - .map(|next_job| async move { - let next_id = next_job.id(); - let next_name = next_job.job_name(); - next_job - .serialize() - .await - .map(|maybe_serialized_job| { - maybe_serialized_job.map(|serialized_job| StoredJob { - id: next_id, - name: next_name, + worktables + .jobs_to_store_by_ctx_id + .entry(handle.ctx.id()) + .or_default() + .push(StoredJobEntry { + location_id, + root_job: StoredJob { + id: job_id, + run_time: handle.start_time.elapsed(), + name, serialized_job, - }) - }) - .map_err(|e| { - error!( - "Failed to serialize next job: \ - : {e:#?}" - ); - }) - }) - .collect::>() - .try_join() - .await - else { - return; - }; + }, + next_jobs: serialize_next_jobs_to_shutdown( + job_id, + job_name, + handle.next_jobs, + ) + .await + .unwrap_or_default(), + }); - jobs_to_store_by_ctx_id - .entry(handle.ctx.id()) - .or_default() - .push(StoredJobEntry { - location_id, - root_job: StoredJob { - id: job_id, - name, - serialized_job, - }, - next_jobs: next_jobs.into_iter().flatten().collect(), - }); + debug!(%name, "Job was shutdown and serialized;"); + } - return; + Ok(None) => { + debug!( + "Job was shutdown but didn't returned any serialized data, \ + probably it isn't resumable job" + ); + } + + Err(e) => { + error!(?e, "Failed to serialize job;"); + } + } + + if *on_shutdown_mode && handles.is_empty() { + // Job system is empty and in shutdown mode so we close this channel to finish the shutdown process + job_return_status_tx.close(); + } + + return Ok(()); } - Ok(ReturnStatus::Shutdown(Ok(None))) => { - debug!( - "Job was shutdown but didn't returned any serialized data, \ - probably it isn't resumable job: " - ); - return; + Ok(ReturnStatus::Canceled(job_return)) => { + handle.cancel_job(job_return).await.map_err(Into::into) } - - 
Ok(ReturnStatus::Shutdown(Err(e))) => { - error!("Failed to serialize job: {e:#?}"); - return; - } - - Ok(ReturnStatus::Canceled) => handle - .cancel_job() + Err(e) => handle + .failed_job(&e) .await - .and_then(|()| Err(JobSystemError::Canceled(job_id))), - - Err(e) => handle.failed_job(&e).await.and_then(|()| Err(e.into())), + .map_err(Into::into) + .and_then(|()| Err(e)), }; job_outputs_tx .send((job_id, res)) .await .expect("job outputs channel unexpectedly closed on job completion"); + + handle.ctx.invalidate_query("jobs.isActive"); + handle.ctx.invalidate_query("jobs.reports"); + + Ok(()) } fn clean_memory(&mut self) { @@ -326,28 +392,34 @@ impl JobSystemRunner { self.handles.shrink_to(JOBS_INITIAL_CAPACITY); } - if self.job_hashes.capacity() > JOBS_INITIAL_CAPACITY - && self.job_hashes.len() < JOBS_INITIAL_CAPACITY + if self.worktables.job_hashes.capacity() > JOBS_INITIAL_CAPACITY + && self.worktables.job_hashes.len() < JOBS_INITIAL_CAPACITY { - self.job_hashes.shrink_to(JOBS_INITIAL_CAPACITY); + self.worktables.job_hashes.shrink_to(JOBS_INITIAL_CAPACITY); } - if self.job_hashes_by_id.capacity() > JOBS_INITIAL_CAPACITY - && self.job_hashes_by_id.len() < JOBS_INITIAL_CAPACITY + if self.worktables.job_hashes_by_id.capacity() > JOBS_INITIAL_CAPACITY + && self.worktables.job_hashes_by_id.len() < JOBS_INITIAL_CAPACITY { - self.job_hashes_by_id.shrink_to(JOBS_INITIAL_CAPACITY); + self.worktables + .job_hashes_by_id + .shrink_to(JOBS_INITIAL_CAPACITY); } - if self.running_jobs_by_job_id.capacity() > JOBS_INITIAL_CAPACITY - && self.running_jobs_by_job_id.len() < JOBS_INITIAL_CAPACITY + if self.worktables.running_jobs_by_job_id.capacity() > JOBS_INITIAL_CAPACITY + && self.worktables.running_jobs_by_job_id.len() < JOBS_INITIAL_CAPACITY { - self.running_jobs_by_job_id.shrink_to(JOBS_INITIAL_CAPACITY); + self.worktables + .running_jobs_by_job_id + .shrink_to(JOBS_INITIAL_CAPACITY); } - if self.running_jobs_set.capacity() > JOBS_INITIAL_CAPACITY - && self.running_jobs_set.len() < JOBS_INITIAL_CAPACITY + if self.worktables.running_jobs_set.capacity() > JOBS_INITIAL_CAPACITY + && self.worktables.running_jobs_set.len() < JOBS_INITIAL_CAPACITY { - self.running_jobs_set.shrink_to(JOBS_INITIAL_CAPACITY); + self.worktables + .running_jobs_set + .shrink_to(JOBS_INITIAL_CAPACITY); } } @@ -359,9 +431,13 @@ impl JobSystemRunner { let Self { handles, - job_hashes, - job_hashes_by_id, - jobs_to_store_by_ctx_id, + worktables: + JobsWorktables { + job_hashes, + job_hashes_by_id, + jobs_to_store_by_ctx_id, + .. + }, .. 
} = self; @@ -382,23 +458,113 @@ impl JobSystemRunner { .await .map_err(|e| JobSystemError::StoredJobs(FileIOError::from((store_jobs_file, e)))) } + + fn has_active_jobs(&self, ctx_id: Uuid) -> bool { + self.handles + .values() + .any(|handle| handle.ctx.id() == ctx_id && handle.is_running) + } + + async fn dispatch_shutdown_command_to_jobs(&mut self) { + self.handles + .values_mut() + .map(|handle| async move { + let (tx, rx) = oneshot::channel(); + + handle.send_command(Command::Shutdown, tx).await; + + rx.await.expect("Worker failed to ack shutdown request") + }) + .collect::>() + .join() + .await + .into_iter() + .for_each(|res| { + if let Err(e) = res { + error!(?e, "Failed to shutdown job;"); + } + }); + } } -fn try_dispatch_next_job( - handle: &mut JobHandle, +#[instrument(skip(next_jobs))] +async fn serialize_next_jobs_to_shutdown>( + parent_job_id: JobId, + parent_job_name: JobName, + next_jobs: impl IntoIterator>> + Send, +) -> Option> { + next_jobs + .into_iter() + .map(|next_job| async move { + let next_id = next_job.id(); + let next_name = next_job.job_name(); + next_job + .serialize() + .await + .map(|maybe_serialized_job| { + maybe_serialized_job.map(|serialized_job| StoredJob { + id: next_id, + run_time: Duration::ZERO, + name: next_name, + serialized_job, + }) + }) + .map_err(|e| { + error!(%next_id, %next_name, ?e, "Failed to serialize next job;"); + }) + }) + .collect::>() + .try_join() + .await + .map(|maybe_serialized_next_jobs| { + maybe_serialized_next_jobs.into_iter().flatten().collect() + }) + .ok() +} + +#[instrument( + skip_all, + fields( + job_id = %handle.id, + next_jobs_count = handle.next_jobs.len(), + location_id = %location_id, + total_running_jobs = handles.len(), + ) +)] +async fn try_dispatch_next_job>( + handle: &mut JobHandle, + location_id: location::id::Type, base_dispatcher: BaseTaskDispatcher, - (job_hashes, job_hashes_by_id): (&mut HashMap, &mut HashMap), - handles: &mut HashMap>, + JobsWorktables { + job_hashes, + job_hashes_by_id, + running_jobs_by_job_id, + running_jobs_set, + .. 
+ }: &mut JobsWorktables, + handles: &mut HashMap>, job_return_status_tx: chan::Sender<(JobId, Result)>, -) { +) -> Result<(), JobSystemError> { if let Some(next) = handle.next_jobs.pop_front() { let next_id = next.id(); let next_hash = next.hash(); + let next_name = next.job_name(); + if let Entry::Vacant(e) = job_hashes.entry(next_hash) { e.insert(next_id); + trace!(%next_id, %next_name, "Dispatching next job;"); + job_hashes_by_id.insert(next_id, next_hash); - let mut next_handle = - next.dispatch(base_dispatcher, handle.ctx.clone(), job_return_status_tx); + running_jobs_by_job_id.insert(next_id, (next_name, location_id)); + running_jobs_set.insert((next_name, location_id)); + + let mut next_handle = next.dispatch( + base_dispatcher, + handle.ctx.get_outer_ctx(), + job_return_status_tx, + ); + + next_handle.register_start(Utc::now()).await?; assert!( next_handle.next_jobs.is_empty(), @@ -410,30 +576,34 @@ fn try_dispatch_next_job( handles.insert(next_id, next_handle); } else { - warn!("Unexpectedly found a job with the same hash as the next job: ", next.job_name()); + warn!(%next_id, %next_name, "Unexpectedly found a job with the same hash as the next job;"); } + } else { + trace!("No next jobs to dispatch"); } + + Ok(()) } -pub(super) async fn run( - mut runner: JobSystemRunner, +pub(super) async fn run>( + mut runner: JobSystemRunner, store_jobs_file: impl AsRef + Send, - msgs_rx: chan::Receiver>, - job_return_status_rx: chan::Receiver<(JobId, Result)>, + msgs_rx: chan::Receiver>, + job_done_rx: chan::Receiver<(JobId, Result)>, ) { - enum StreamMessage { + enum StreamMessage> { ReturnStatus((JobId, Result)), - RunnerMessage(RunnerMessage), + RunnerMessage(RunnerMessage), CleanMemoryTick, } let memory_cleanup_interval = interval_at(Instant::now() + FIVE_MINUTES, FIVE_MINUTES); - let job_return_status_rx_to_shutdown = job_return_status_rx.clone(); + let job_return_status_rx_to_shutdown = job_done_rx.clone(); let mut msg_stream = pin!(( msgs_rx.map(StreamMessage::RunnerMessage), - job_return_status_rx.map(StreamMessage::ReturnStatus), + job_done_rx.map(StreamMessage::ReturnStatus), IntervalStream::new(memory_cleanup_interval).map(|_| StreamMessage::CleanMemoryTick), ) .merge()); @@ -442,24 +612,41 @@ pub(super) async fn run( match msg { // Job return status messages StreamMessage::ReturnStatus((job_id, status)) => { - runner.process_return_status(job_id, status).await; + if let Err(e) = runner.process_return_status(job_id, status).await { + error!(?e, "Failed to process return status;"); + } } // Runner messages StreamMessage::RunnerMessage(RunnerMessage::NewJob { - id, + job_id, location_id, dyn_job, ctx, ack_tx, }) => { ack_tx - .send(runner.new_job(id, location_id, dyn_job, ctx, None).await) + .send( + runner + .new_job(job_id, location_id, dyn_job, ctx, None) + .await, + ) .expect("ack channel closed before sending new job response"); } + StreamMessage::RunnerMessage(RunnerMessage::HasActiveJobs { ctx_id, ack_tx }) => { + ack_tx + .send(runner.has_active_jobs(ctx_id)) + .expect("ack channel closed before sending has active jobs response"); + } + + StreamMessage::RunnerMessage(RunnerMessage::GetActiveReports { ack_tx }) => { + ack_tx + .send(runner.get_active_reports().await) + .expect("ack channel closed before sending active reports response"); + } StreamMessage::RunnerMessage(RunnerMessage::ResumeStoredJob { - id, + job_id, location_id, dyn_job, ctx, @@ -469,60 +656,58 @@ pub(super) async fn run( ack_tx .send( runner - .new_job(id, location_id, dyn_job, ctx, serialized_tasks) + 
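`try_dispatch_next_job` above pops the next queued job and uses a vacant-entry check on the hash map so two jobs with the same hash are never dispatched at once. A minimal sketch of that dedupe-by-hash pattern using only standard-library types (no real job system types):

use std::collections::{hash_map::Entry, HashMap, VecDeque};

// Pop queued jobs and dispatch each hash at most once; duplicates are skipped.
fn dispatch_next(
    queue: &mut VecDeque<(u64 /* hash */, &'static str /* name */)>,
    running: &mut HashMap<u64, &'static str>,
) -> Option<&'static str> {
    while let Some((hash, name)) = queue.pop_front() {
        match running.entry(hash) {
            Entry::Vacant(slot) => {
                slot.insert(name);
                return Some(name); // dispatched
            }
            Entry::Occupied(_) => {
                eprintln!("skipping {name}: a job with the same hash is already running");
            }
        }
    }
    None
}

fn main() {
    let mut queue = VecDeque::from([(1, "indexer"), (1, "indexer"), (2, "file_identifier")]);
    let mut running = HashMap::new();

    while let Some(name) = dispatch_next(&mut queue, &mut running) {
        println!("dispatched {name}");
    }
}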
.new_job(job_id, location_id, dyn_job, ctx, serialized_tasks) .await, ) .expect("ack channel closed before sending resume job response"); } StreamMessage::RunnerMessage(RunnerMessage::Command { - id, + job_id: id, command, ack_tx, - }) => { - ack_tx - .send(runner.process_command(id, command).await) - .unwrap_or_else(|_| { - panic!("ack channel closed before sending {command:?} response") - }); - } + }) => runner.process_command(id, command, ack_tx).await, StreamMessage::RunnerMessage(RunnerMessage::Shutdown) => { + runner.on_shutdown_mode = true; // Consuming all pending return status messages - loop { - while let Ok((job_id, status)) = job_return_status_rx_to_shutdown.try_recv() { - runner.process_return_status(job_id, status).await; + if !runner.is_empty() { + let mut job_return_status_stream = pin!(job_return_status_rx_to_shutdown); + + runner.dispatch_shutdown_command_to_jobs().await; + + debug!( + total_jobs = runner.total_jobs(), + "Waiting for jobs to shutdown before shutting down the job system...;", + ); + + while let Some((job_id, status)) = job_return_status_stream.next().await { + if let Err(e) = runner.process_return_status(job_id, status).await { + error!(?e, "Failed to process return status before shutting down;"); + } } - if runner.is_empty() { - break; + // Now the runner can shutdown + if let Err(e) = runner.save_jobs(store_jobs_file).await { + error!(?e, "Failed to save jobs before shutting down;"); } - - debug!("Waiting for all jobs to complete before shutting down..."); - } - - // Now the runner can shutdown - if let Err(e) = runner.save_jobs(store_jobs_file).await { - error!("Failed to save jobs before shutting down: {e:#?}"); } return; } - StreamMessage::RunnerMessage(RunnerMessage::CheckIfJobAreRunning { + StreamMessage::RunnerMessage(RunnerMessage::CheckIfJobsAreRunning { job_names, location_id, ack_tx, }) => { ack_tx - .send(runner.check_if_job_are_running(job_names, location_id)) + .send(runner.check_if_jobs_are_running(job_names, location_id)) .expect("ack channel closed before sending resume job response"); } // Memory cleanup tick - StreamMessage::CleanMemoryTick => { - runner.clean_memory(); - } + StreamMessage::CleanMemoryTick => runner.clean_memory(), } } } diff --git a/core/crates/heavy-lifting/src/job_system/store.rs b/core/crates/heavy-lifting/src/job_system/store.rs index 8c40c7dc5..74058708a 100644 --- a/core/crates/heavy-lifting/src/job_system/store.rs +++ b/core/crates/heavy-lifting/src/job_system/store.rs @@ -1,4 +1,4 @@ -use crate::{file_identifier, indexer, media_processor}; +use crate::{file_identifier, indexer, media_processor, JobContext}; use sd_prisma::prisma::{job, location}; use sd_utils::uuid_to_bytes; @@ -8,6 +8,7 @@ use std::{ future::Future, iter, marker::PhantomData, + time::Duration, }; use futures_concurrency::future::TryJoin; @@ -20,9 +21,11 @@ use super::{ }; #[derive(Debug, Serialize, Deserialize)] +#[repr(transparent)] +#[serde(transparent)] pub struct SerializedTasks(pub Vec); -pub trait SerializableJob: 'static +pub trait SerializableJob: 'static where Self: Sized, { @@ -35,7 +38,7 @@ where #[allow(unused_variables)] fn deserialize( serialized_job: &[u8], - ctx: &Ctx, + ctx: &OuterCtx, ) -> impl Future< Output = Result)>, rmp_serde::decode::Error>, > + Send { @@ -47,6 +50,7 @@ where pub struct StoredJob { pub(super) id: JobId, pub(super) name: JobName, + pub(super) run_time: Duration, pub(super) serialized_job: Vec, } @@ -57,13 +61,13 @@ pub struct StoredJobEntry { pub(super) next_jobs: Vec, } -pub async fn load_jobs( +pub 
async fn load_jobs>( entries: Vec, - ctx: &Ctx, + ctx: &OuterCtx, ) -> Result< Vec<( location::id::Type, - Box>, + Box>, Option, )>, JobSystemError, @@ -81,7 +85,7 @@ pub async fn load_jobs( .. }| { iter::once(*id).chain(next_jobs.iter().map(|StoredJob { id, .. }| *id)) }, ) - .map(uuid_to_bytes) + .map(|job_id| uuid_to_bytes(&job_id)) .collect::>(), )]) .exec() @@ -166,50 +170,58 @@ pub async fn load_jobs( } macro_rules! match_deserialize_job { - ($stored_job:ident, $report:ident, $ctx:ident, $ctx_type:ty, [$($job_type:ty),+ $(,)?]) => {{ + ($stored_job:ident, $report:ident, $outer_ctx:ident, $outer_ctx_type:ty, $job_ctx_type:ty, [$($job_type:ty),+ $(,)?]) => {{ let StoredJob { id, name, + run_time, serialized_job, } = $stored_job; match name { - $(<$job_type as Job>::NAME => <$job_type as SerializableJob<$ctx_type>>::deserialize( + $(<$job_type as Job>::NAME => <$job_type as SerializableJob<$outer_ctx_type>>::deserialize( &serialized_job, - $ctx, + $outer_ctx, ).await - .map(|maybe_job| maybe_job.map(|(job, tasks)| -> ( - Box>, + .map(|maybe_job| maybe_job.map(|(job, maybe_tasks)| -> ( + Box>, Option ) { ( Box::new(JobHolder { id, job, + run_time, report: $report, next_jobs: VecDeque::new(), _ctx: PhantomData, }), - tasks, + maybe_tasks.and_then( + |tasks| (!tasks.0.is_empty()).then_some(tasks) + ), ) } )) .map_err(Into::into),)+ + + // TODO(fogodev): this is temporary until we can get rid of the old job system + _ => unimplemented!("Job not implemented"), } }}; } -async fn load_job( +async fn load_job>( stored_job: StoredJob, report: Report, - ctx: &Ctx, -) -> Result>, Option)>, JobSystemError> { + ctx: &OuterCtx, +) -> Result>, Option)>, JobSystemError> { match_deserialize_job!( stored_job, report, ctx, - Ctx, + OuterCtx, + JobCtx, [ indexer::job::Indexer, file_identifier::job::FileIdentifier, diff --git a/core/crates/heavy-lifting/src/job_system/utils.rs b/core/crates/heavy-lifting/src/job_system/utils.rs index afa8ce56f..8ac0070c4 100644 --- a/core/crates/heavy-lifting/src/job_system/utils.rs +++ b/core/crates/heavy-lifting/src/job_system/utils.rs @@ -1,16 +1,35 @@ use crate::Error; -use sd_task_system::TaskHandle; +use sd_task_system::{TaskHandle, TaskStatus}; +use futures::{stream::FuturesUnordered, StreamExt}; use futures_concurrency::future::Join; +use tracing::{error, trace}; -pub async fn cancel_pending_tasks( - pending_tasks: impl IntoIterator> + Send, -) { +pub async fn cancel_pending_tasks(pending_tasks: &mut FuturesUnordered>) { pending_tasks - .into_iter() + .iter() .map(TaskHandle::cancel) .collect::>() .join() .await; + + trace!(total_tasks = %pending_tasks.len(), "canceled all pending tasks, now waiting completion"); + + while let Some(task_result) = pending_tasks.next().await { + match task_result { + Ok(TaskStatus::Done((task_id, _))) => trace!( + %task_id, + "tasks cancellation received a completed task;", + ), + + Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion | TaskStatus::Shutdown(_)) => { + // Job canceled task + } + + Ok(TaskStatus::Error(e)) => error!(%e, "job canceled an errored task;"), + + Err(e) => error!(%e, "task system failed to cancel a task;"), + } + } } diff --git a/core/crates/heavy-lifting/src/lib.rs b/core/crates/heavy-lifting/src/lib.rs index b0d18ffd7..c137584e7 100644 --- a/core/crates/heavy-lifting/src/lib.rs +++ b/core/crates/heavy-lifting/src/lib.rs @@ -44,8 +44,12 @@ pub mod utils; use media_processor::ThumbKey; pub use job_system::{ - job::{IntoJob, JobBuilder, JobName, JobOutput, JobOutputData, OuterContext, ProgressUpdate}, - JobId, 
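`cancel_pending_tasks` above now cancels every handle and then drains the `FuturesUnordered` so completed, canceled, and errored results are all observed rather than dropped. A toy sketch of the drain step, with plain futures standing in for `TaskHandle`s:

use std::time::Duration;

use futures::stream::{FuturesUnordered, StreamExt};

#[tokio::main]
async fn main() {
    // Stand-ins for in-flight task handles; each future just resolves to its id.
    let mut pending: FuturesUnordered<_> = (0u64..3)
        .map(|id| async move {
            tokio::time::sleep(Duration::from_millis(10 * id)).await;
            id
        })
        .collect();

    // Drain the set so no task result is silently dropped before reporting done.
    while let Some(id) = pending.next().await {
        println!("task {id} finished");
    }
    assert!(pending.is_empty());
}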
JobSystem, + job::{ + IntoJob, JobContext, JobEnqueuer, JobName, JobOutput, JobOutputData, OuterContext, + ProgressUpdate, + }, + report::Report, + JobId, JobSystem, JobSystemError, }; #[derive(Error, Debug)] @@ -59,6 +63,9 @@ pub enum Error { #[error(transparent)] TaskSystem(#[from] TaskSystemError), + + #[error(transparent)] + JobSystem(#[from] JobSystemError), } impl From for rspc::Error { @@ -70,19 +77,21 @@ impl From for rspc::Error { Error::TaskSystem(e) => { Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e) } + Error::JobSystem(e) => e.into(), } } } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] pub enum NonCriticalError { // TODO: Add variants as needed #[error(transparent)] - Indexer(#[from] indexer::NonCriticalError), + Indexer(#[from] indexer::NonCriticalIndexerError), #[error(transparent)] - FileIdentifier(#[from] file_identifier::NonCriticalError), + FileIdentifier(#[from] file_identifier::NonCriticalFileIdentifierError), #[error(transparent)] - MediaProcessor(#[from] media_processor::NonCriticalError), + MediaProcessor(#[from] media_processor::NonCriticalMediaProcessorError), } #[repr(i32)] @@ -96,7 +105,7 @@ pub enum LocationScanState { #[derive(Debug, Serialize, Type)] pub enum UpdateEvent { - NewThumbnailEvent { + NewThumbnail { thumb_key: ThumbKey, }, NewIdentifiedObjects { diff --git a/core/crates/heavy-lifting/src/media_processor/helpers/exif_media_data.rs b/core/crates/heavy-lifting/src/media_processor/helpers/exif_media_data.rs index 3fa2c7618..82b722753 100644 --- a/core/crates/heavy-lifting/src/media_processor/helpers/exif_media_data.rs +++ b/core/crates/heavy-lifting/src/media_processor/helpers/exif_media_data.rs @@ -1,12 +1,24 @@ use crate::media_processor::{self, media_data_extractor}; +use sd_core_prisma_helpers::ObjectPubId; +use sd_core_sync::Manager as SyncManager; + use sd_file_ext::extensions::{Extension, ImageExtension, ALL_IMAGE_EXTENSIONS}; use sd_media_metadata::ExifMetadata; -use sd_prisma::prisma::{exif_data, object, PrismaClient}; +use sd_prisma::{ + prisma::{exif_data, object, PrismaClient}, + prisma_sync, +}; +use sd_sync::{option_sync_db_entry, OperationFactory}; +use sd_utils::chain_optional_iter; use std::path::Path; +use futures_concurrency::future::TryJoin; use once_cell::sync::Lazy; +use prisma_client_rust::QueryError; + +use super::from_slice_option_to_option; pub static AVAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { ALL_IMAGE_EXTENSIONS @@ -17,6 +29,7 @@ pub static AVAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { .collect() }); +#[must_use] pub const fn can_extract(image_extension: ImageExtension) -> bool { use ImageExtension::{ Avci, Avcs, Avif, Dng, Heic, Heif, Heifs, Hif, Jpeg, Jpg, Png, Tiff, Webp, @@ -27,33 +40,62 @@ pub const fn can_extract(image_extension: ImageExtension) -> bool { ) } -pub fn to_query( - mdi: ExifMetadata, +#[must_use] +fn to_query( + ExifMetadata { + resolution, + date_taken, + location, + camera_data, + artist, + description, + copyright, + exif_version, + }: ExifMetadata, object_id: exif_data::object_id::Type, -) -> exif_data::CreateUnchecked { - exif_data::CreateUnchecked { - object_id, - _params: vec![ - exif_data::camera_data::set(serde_json::to_vec(&mdi.camera_data).ok()), - exif_data::media_date::set(serde_json::to_vec(&mdi.date_taken).ok()), - exif_data::resolution::set(serde_json::to_vec(&mdi.resolution).ok()), - 
exif_data::media_location::set(serde_json::to_vec(&mdi.location).ok()), - exif_data::artist::set(mdi.artist), - exif_data::description::set(mdi.description), - exif_data::copyright::set(mdi.copyright), - exif_data::exif_version::set(mdi.exif_version), - exif_data::epoch_time::set(mdi.date_taken.map(|x| x.unix_timestamp())), +) -> (Vec<(&'static str, rmpv::Value)>, exif_data::Create) { + let (sync_params, db_params) = chain_optional_iter( + [], + [ + option_sync_db_entry!( + serde_json::to_vec(&camera_data).ok(), + exif_data::camera_data + ), + option_sync_db_entry!(serde_json::to_vec(&date_taken).ok(), exif_data::media_date), + option_sync_db_entry!(serde_json::to_vec(&resolution).ok(), exif_data::resolution), + option_sync_db_entry!( + serde_json::to_vec(&location).ok(), + exif_data::media_location + ), + option_sync_db_entry!(artist, exif_data::artist), + option_sync_db_entry!(description, exif_data::description), + option_sync_db_entry!(copyright, exif_data::copyright), + option_sync_db_entry!(exif_version, exif_data::exif_version), + option_sync_db_entry!( + date_taken.map(|x| x.unix_timestamp()), + exif_data::epoch_time + ), ], - } + ) + .into_iter() + .unzip(); + + ( + sync_params, + exif_data::Create { + object: object::id::equals(object_id), + _params: db_params, + }, + ) } pub async fn extract( path: impl AsRef + Send, -) -> Result, media_processor::NonCriticalError> { +) -> Result, media_processor::NonCriticalMediaProcessorError> { let path = path.as_ref(); ExifMetadata::from_path(&path).await.map_err(|e| { - media_data_extractor::NonCriticalError::FailedToExtractImageMediaData( + media_data_extractor::NonCriticalMediaDataExtractorError::FailedToExtractImageMediaData( path.to_path_buf(), e.to_string(), ) @@ -62,24 +104,62 @@ pub async fn extract( } pub async fn save( - media_datas: Vec<(ExifMetadata, object::id::Type)>, + exif_datas: impl IntoIterator + Send, db: &PrismaClient, -) -> Result { - db.exif_data() - .create_many( - media_datas - .into_iter() - .map(|(exif_data, object_id)| to_query(exif_data, object_id)) - .collect(), - ) - .skip_duplicates() - .exec() - .await - .map(|created| { - #[allow(clippy::cast_sign_loss)] - { - created as u64 - } + sync: &SyncManager, +) -> Result { + exif_datas + .into_iter() + .map(|(exif_data, object_id, object_pub_id)| async move { + let (sync_params, create) = to_query(exif_data, object_id); + let db_params = create._params.clone(); + + sync.write_ops( + db, + ( + sync.shared_create( + prisma_sync::exif_data::SyncId { + object: prisma_sync::object::SyncId { + pub_id: object_pub_id.into(), + }, + }, + sync_params, + ), + db.exif_data() + .upsert(exif_data::object_id::equals(object_id), create, db_params) + .select(exif_data::select!({ id })), + ), + ) + .await }) - .map_err(Into::into) + .collect::>() + .try_join() + .await + .map(|created_vec| created_vec.len() as u64) +} + +#[must_use] +pub fn from_prisma_data( + exif_data::Data { + resolution, + media_date, + media_location, + camera_data, + artist, + description, + copyright, + exif_version, + .. 
+ }: exif_data::Data, +) -> ExifMetadata { + ExifMetadata { + camera_data: from_slice_option_to_option(camera_data).unwrap_or_default(), + date_taken: from_slice_option_to_option(media_date).unwrap_or_default(), + resolution: from_slice_option_to_option(resolution).unwrap_or_default(), + location: from_slice_option_to_option(media_location), + artist, + description, + copyright, + exif_version, + } } diff --git a/core/crates/heavy-lifting/src/media_processor/helpers/ffmpeg_media_data.rs b/core/crates/heavy-lifting/src/media_processor/helpers/ffmpeg_media_data.rs index 0d1734c22..82c932490 100644 --- a/core/crates/heavy-lifting/src/media_processor/helpers/ffmpeg_media_data.rs +++ b/core/crates/heavy-lifting/src/media_processor/helpers/ffmpeg_media_data.rs @@ -1,5 +1,7 @@ use crate::media_processor::{self, media_data_extractor}; +use sd_core_prisma_helpers::object_with_media_data; + use sd_file_ext::extensions::{ AudioExtension, Extension, VideoExtension, ALL_AUDIO_EXTENSIONS, ALL_VIDEO_EXTENSIONS, }; @@ -19,7 +21,10 @@ use sd_prisma::prisma::{ ffmpeg_data, ffmpeg_media_audio_props, ffmpeg_media_chapter, ffmpeg_media_codec, ffmpeg_media_program, ffmpeg_media_stream, ffmpeg_media_video_props, object, PrismaClient, }; -use sd_utils::db::ffmpeg_data_field_to_db; +use sd_utils::{ + db::{ffmpeg_data_field_from_db, ffmpeg_data_field_to_db}, + i64_to_frontend, +}; use std::{collections::HashMap, path::Path}; @@ -28,6 +33,8 @@ use once_cell::sync::Lazy; use prisma_client_rust::QueryError; use tracing::error; +use super::from_slice_option_to_option; + pub static AVAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { ALL_AUDIO_EXTENSIONS .iter() @@ -44,6 +51,7 @@ pub static AVAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { .collect() }); +#[must_use] pub const fn can_extract_for_audio(audio_extension: AudioExtension) -> bool { use AudioExtension::{ Aac, Adts, Aif, Aiff, Amr, Aptx, Ast, Caf, Flac, Loas, M4a, Mid, Mp2, Mp3, Oga, Ogg, Opus, @@ -63,34 +71,35 @@ pub const fn can_extract_for_audio(audio_extension: AudioExtension) -> bool { ) } +#[must_use] pub const fn can_extract_for_video(video_extension: VideoExtension) -> bool { use VideoExtension::{ - Asf, Avi, Avifs, F4v, Flv, Hevc, M2ts, M2v, M4v, Mjpeg, Mkv, Mov, Mp4, Mpe, Mpeg, Mpg, Mts, - Mxf, Ogv, Qt, Swf, Ts, Vob, Webm, Wm, Wmv, Wtv, _3gp, + Asf, Avi, Avifs, F4v, Flv, Hevc, M2ts, M2v, M4v, Mjpeg, Mkv, Mov, Mp4, Mpe, Mpeg, Mpg, Mxf, + Ogv, Qt, Swf, Vob, Webm, Wm, Wmv, Wtv, _3gp, }; matches!( video_extension, Avi | Avifs | Qt | Mov | Swf - | Mjpeg | Ts | Mts - | Mpeg | Mxf | M2v - | Mpg | Mpe | M2ts - | Flv | Wm | _3gp - | M4v | Wmv | Asf - | Mp4 | Webm | Mkv - | Vob | Ogv | Wtv - | Hevc | F4v + | Mjpeg | Mpeg + | Mxf | M2v | Mpg + | Mpe | M2ts | Flv + | Wm | _3gp | M4v + | Wmv | Asf | Mp4 + | Webm | Mkv | Vob + | Ogv | Wtv | Hevc + | F4v // | Ts | Mts TODO: Uncomment when we start using magic instead of extension ) } pub async fn extract( path: impl AsRef + Send, -) -> Result { +) -> Result { let path = path.as_ref(); FFmpegMetadata::from_path(&path).await.map_err(|e| { - media_data_extractor::NonCriticalError::FailedToExtractImageMediaData( + media_data_extractor::NonCriticalMediaDataExtractorError::FailedToExtractImageMediaData( path.to_path_buf(), e.to_string(), ) @@ -101,7 +110,7 @@ pub async fn extract( pub async fn save( ffmpeg_datas: impl IntoIterator + Send, db: &PrismaClient, -) -> Result { +) -> Result { ffmpeg_datas .into_iter() .map( @@ -180,9 +189,9 @@ async fn create_ffmpeg_data( )), ffmpeg_data::metadata::set( serde_json::to_vec(&metadata) - 
.map_err(|err| { - error!("Error reading FFmpegData metadata: {err:#?}"); - err + .map_err(|e| { + error!(?e, "Error reading FFmpegData metadata;"); + e }) .ok(), ), @@ -224,9 +233,9 @@ async fn create_ffmpeg_chapters( ffmpeg_data_id, _params: vec![ffmpeg_media_chapter::metadata::set( serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegMediaChapter metadata: {err:#?}"); - err + .map_err(|e| { + error!(?e, "Error reading FFmpegMediaChapter metadata;"); + e }) .ok(), )], @@ -244,37 +253,36 @@ async fn create_ffmpeg_programs( programs: Vec, db: &PrismaClient, ) -> Result)>, QueryError> { - let (creates, streams_by_program_id) = - programs - .into_iter() - .map( - |Program { - id: program_id, - name, - metadata, - streams, - }| { - ( - ffmpeg_media_program::CreateUnchecked { - program_id, - ffmpeg_data_id: data_id, - _params: vec![ - ffmpeg_media_program::name::set(name), - ffmpeg_media_program::metadata::set( - serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegMediaProgram metadata: {err:#?}"); - err - }) - .ok(), - ), - ], - }, - (program_id, streams), - ) - }, - ) - .unzip::<_, _, Vec<_>, Vec<_>>(); + let (creates, streams_by_program_id) = programs + .into_iter() + .map( + |Program { + id: program_id, + name, + metadata, + streams, + }| { + ( + ffmpeg_media_program::CreateUnchecked { + program_id, + ffmpeg_data_id: data_id, + _params: vec![ + ffmpeg_media_program::name::set(name), + ffmpeg_media_program::metadata::set( + serde_json::to_vec(&metadata) + .map_err(|e| { + error!(?e, "Error reading FFmpegMediaProgram metadata;"); + e + }) + .ok(), + ), + ], + }, + (program_id, streams), + ) + }, + ) + .unzip::<_, _, Vec<_>, Vec<_>>(); db.ffmpeg_media_program() .create_many(creates) @@ -333,9 +341,9 @@ async fn create_ffmpeg_streams( ffmpeg_media_stream::language::set(metadata.language.clone()), ffmpeg_media_stream::metadata::set( serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegMediaStream metadata: {err:#?}"); - err + .map_err(|e| { + error!(?e, "Error reading FFmpegMediaStream metadata;"); + e }) .ok(), ), @@ -570,3 +578,207 @@ async fn create_ffmpeg_video_props( .await .map(|_| ()) } + +pub fn from_prisma_data( + object_with_media_data::ffmpeg_data::Data { + formats, + duration, + start_time, + bit_rate, + metadata, + chapters, + programs, + .. + }: object_with_media_data::ffmpeg_data::Data, +) -> FFmpegMetadata { + FFmpegMetadata { + formats: formats.split(',').map(String::from).collect::>(), + duration: duration.map(|duration| i64_to_frontend(ffmpeg_data_field_from_db(&duration))), + start_time: start_time + .map(|start_time| i64_to_frontend(ffmpeg_data_field_from_db(&start_time))), + bit_rate: i64_to_frontend(ffmpeg_data_field_from_db(&bit_rate)), + chapters: chapters_from_prisma_data(chapters), + programs: programs_from_prisma_data(programs), + metadata: from_slice_option_to_option(metadata).unwrap_or_default(), + } +} + +#[inline] +fn chapters_from_prisma_data(chapters: Vec) -> Vec { + chapters + .into_iter() + .map( + |ffmpeg_media_chapter::Data { + chapter_id, + start, + end, + time_base_den, + time_base_num, + metadata, + .. 
+ }| Chapter { + id: chapter_id, + start: i64_to_frontend(ffmpeg_data_field_from_db(&start)), + end: i64_to_frontend(ffmpeg_data_field_from_db(&end)), + time_base_den, + time_base_num, + metadata: from_slice_option_to_option(metadata).unwrap_or_default(), + }, + ) + .collect() +} + +#[inline] +fn programs_from_prisma_data( + programs: Vec, +) -> Vec { + programs + .into_iter() + .map( + |object_with_media_data::ffmpeg_data::programs::Data { + program_id, + name, + metadata, + streams, + .. + }| Program { + id: program_id, + name, + streams: streams_from_prisma_data(streams), + metadata: from_slice_option_to_option(metadata).unwrap_or_default(), + }, + ) + .collect() +} + +fn streams_from_prisma_data( + streams: Vec, +) -> Vec { + streams + .into_iter() + .map( + |object_with_media_data::ffmpeg_data::programs::streams::Data { + stream_id, + name, + aspect_ratio_num, + aspect_ratio_den, + frames_per_second_num, + frames_per_second_den, + time_base_real_den, + time_base_real_num, + dispositions, + metadata, + codec, + .. + }| { + Stream { + id: stream_id, + name, + codec: codec_from_prisma_data(codec), + aspect_ratio_num, + aspect_ratio_den, + frames_per_second_num, + frames_per_second_den, + time_base_real_den, + time_base_real_num, + dispositions: dispositions + .map(|dispositions| { + dispositions + .split(',') + .map(String::from) + .collect::>() + }) + .unwrap_or_default(), + metadata: from_slice_option_to_option(metadata).unwrap_or_default(), + } + }, + ) + .collect() +} + +fn codec_from_prisma_data( + codec: Option, +) -> Option { + codec.map( + |object_with_media_data::ffmpeg_data::programs::streams::codec::Data { + kind, + sub_kind, + tag, + name, + profile, + bit_rate, + audio_props, + video_props, + .. + }| Codec { + kind, + sub_kind, + tag, + name, + profile, + bit_rate, + props: match (audio_props, video_props) { + ( + Some(ffmpeg_media_audio_props::Data { + delay, + padding, + sample_rate, + sample_format, + bit_per_sample, + channel_layout, + .. + }), + None, + ) => Some(Props::Audio(AudioProps { + delay, + padding, + sample_rate, + sample_format, + bit_per_sample, + channel_layout, + })), + ( + None, + Some(ffmpeg_media_video_props::Data { + pixel_format, + color_range, + bits_per_channel, + color_space, + color_primaries, + color_transfer, + field_order, + chroma_location, + width, + height, + aspect_ratio_num, + aspect_ratio_den, + properties, + .. 
+ }), + ) => Some(Props::Video(VideoProps { + pixel_format, + color_range, + bits_per_channel, + color_space, + color_primaries, + color_transfer, + field_order, + chroma_location, + width, + height, + aspect_ratio_num, + aspect_ratio_den, + properties: properties + .map(|dispositions| { + dispositions + .split(',') + .map(String::from) + .collect::>() + }) + .unwrap_or_default(), + })), + _ => None, + }, + }, + ) +} diff --git a/core/crates/heavy-lifting/src/media_processor/helpers/mod.rs b/core/crates/heavy-lifting/src/media_processor/helpers/mod.rs index 4432d19a7..702981355 100644 --- a/core/crates/heavy-lifting/src/media_processor/helpers/mod.rs +++ b/core/crates/heavy-lifting/src/media_processor/helpers/mod.rs @@ -1,3 +1,12 @@ pub mod exif_media_data; pub mod ffmpeg_media_data; pub mod thumbnailer; + +#[must_use] +fn from_slice_option_to_option( + value: Option>, +) -> Option { + value + .map(|x| serde_json::from_slice(&x).ok()) + .unwrap_or_default() +} diff --git a/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs b/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs index 5f2de34e7..5f636f606 100644 --- a/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs +++ b/core/crates/heavy-lifting/src/media_processor/helpers/thumbnailer.rs @@ -1,16 +1,37 @@ -use once_cell::sync::Lazy; +use crate::media_processor::thumbnailer; + +use sd_core_prisma_helpers::CasId; + use sd_file_ext::extensions::{ DocumentExtension, Extension, ImageExtension, ALL_DOCUMENT_EXTENSIONS, ALL_IMAGE_EXTENSIONS, }; +use sd_images::{format_image, scale_dimensions, ConvertibleExtension}; +use sd_media_metadata::exif::Orientation; +use sd_utils::error::FileIOError; #[cfg(feature = "ffmpeg")] use sd_file_ext::extensions::{VideoExtension, ALL_VIDEO_EXTENSIONS}; -use std::time::Duration; +use std::{ + ops::Deref, + path::{Path, PathBuf}, + str::FromStr, + time::Duration, +}; +use image::{imageops, DynamicImage, GenericImageView}; +use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use specta::Type; +use tokio::{ + fs, io, + sync::{oneshot, Mutex}, + task::spawn_blocking, + time::{sleep, Instant}, +}; +use tracing::{error, instrument, trace}; use uuid::Uuid; +use webp::Encoder; // Files names constants pub const THUMBNAIL_CACHE_DIR_NAME: &str = "thumbnails"; @@ -25,8 +46,12 @@ pub const TARGET_PX: f32 = 1_048_576.0; // 1024x1024 /// and is treated as a percentage (so 60% in this case, or it's the same as multiplying by `0.6`). pub const TARGET_QUALITY: f32 = 60.0; -/// How much time we allow for the thumbnail generation process to complete before we give up. -pub const THUMBNAIL_GENERATION_TIMEOUT: Duration = Duration::from_secs(60); +/// How much time we allow for the thumbnailer task to complete before we give up. +pub const THUMBNAILER_TASK_TIMEOUT: Duration = Duration::from_secs(60 * 5); + +pub fn get_thumbnails_directory(data_directory: impl AsRef) -> PathBuf { + data_directory.as_ref().join(THUMBNAIL_CACHE_DIR_NAME) +} #[cfg(feature = "ffmpeg")] pub static THUMBNAILABLE_VIDEO_EXTENSIONS: Lazy> = Lazy::new(|| { @@ -68,25 +93,43 @@ pub static ALL_THUMBNAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { /// This type is used to pass the relevant data to the frontend so it can request the thumbnail. 
/// Tt supports extending the shard hex to support deeper directory structures in the future -#[derive(Debug, Serialize, Deserialize, Type)] +#[derive(Debug, Serialize, Deserialize, Type, Clone)] pub struct ThumbKey { pub shard_hex: String, - pub cas_id: String, + pub cas_id: CasId<'static>, pub base_directory_str: String, } impl ThumbKey { #[must_use] - pub fn new(cas_id: &str, kind: &ThumbnailKind) -> Self { + pub fn new(cas_id: CasId<'static>, kind: &ThumbnailKind) -> Self { Self { - shard_hex: get_shard_hex(cas_id).to_string(), - cas_id: cas_id.to_string(), + shard_hex: get_shard_hex(&cas_id).to_string(), + cas_id, base_directory_str: match kind { ThumbnailKind::Ephemeral => String::from(EPHEMERAL_DIR), ThumbnailKind::Indexed(library_id) => library_id.to_string(), }, } } + + #[must_use] + pub fn new_indexed(cas_id: CasId<'static>, library_id: Uuid) -> Self { + Self { + shard_hex: get_shard_hex(&cas_id).to_string(), + cas_id, + base_directory_str: library_id.to_string(), + } + } + + #[must_use] + pub fn new_ephemeral(cas_id: CasId<'static>) -> Self { + Self { + shard_hex: get_shard_hex(&cas_id).to_string(), + cas_id, + base_directory_str: String::from(EPHEMERAL_DIR), + } + } } #[derive(Debug, Serialize, Deserialize, Type, Clone, Copy)] @@ -95,6 +138,41 @@ pub enum ThumbnailKind { Indexed(Uuid), } +impl ThumbnailKind { + pub fn compute_path(&self, data_directory: impl AsRef, cas_id: &CasId<'_>) -> PathBuf { + let mut thumb_path = get_thumbnails_directory(data_directory); + match self { + Self::Ephemeral => thumb_path.push(EPHEMERAL_DIR), + Self::Indexed(library_id) => { + thumb_path.push(library_id.to_string()); + } + } + thumb_path.push(get_shard_hex(cas_id)); + thumb_path.push(cas_id.as_str()); + thumb_path.set_extension(WEBP_EXTENSION); + + thumb_path + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct GenerateThumbnailArgs<'cas_id> { + pub extension: String, + pub cas_id: CasId<'cas_id>, + pub path: PathBuf, +} + +impl<'cas_id> GenerateThumbnailArgs<'cas_id> { + #[must_use] + pub const fn new(extension: String, cas_id: CasId<'cas_id>, path: PathBuf) -> Self { + Self { + extension, + cas_id, + path, + } + } +} + /// The practice of dividing files into hex coded folders, often called "sharding," /// is mainly used to optimize file system performance. File systems can start to slow down /// as the number of files in a directory increases. Thus, it's often beneficial to split @@ -105,18 +183,21 @@ pub enum ThumbnailKind { /// three characters of a the hash, this will give us 4096 (16^3) possible directories, /// named 000 to fff. 
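// A minimal sketch of how the shard prefix and the final thumbnail path relate for an
// indexed thumbnail, assuming a `cas_id: CasId<'static>` whose string form starts with
// "abc" and a `library_id: Uuid` are already in hand (only items introduced in this file
// are used):
//
//     let key = ThumbKey::new_indexed(cas_id.clone(), library_id);
//     // key.shard_hex == "abc", key.base_directory_str == library_id.to_string()
//
//     let path = ThumbnailKind::Indexed(library_id).compute_path(data_directory, &cas_id);
//     // => <data_directory>/thumbnails/<library_id>/abc/<cas_id>.webp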
#[inline] -pub fn get_shard_hex(cas_id: &str) -> &str { +#[must_use] +pub fn get_shard_hex<'cas_id>(cas_id: &'cas_id CasId<'cas_id>) -> &'cas_id str { // Use the first three characters of the hash as the directory name - &cas_id[0..3] + &cas_id.as_str()[0..3] } #[cfg(feature = "ffmpeg")] +#[must_use] pub const fn can_generate_thumbnail_for_video(video_extension: VideoExtension) -> bool { use VideoExtension::{Hevc, M2ts, M2v, Mpg, Mts, Swf, Ts}; // File extensions that are specifically not supported by the thumbnailer !matches!(video_extension, Mpg | Swf | M2v | Hevc | M2ts | Mts | Ts) } +#[must_use] pub const fn can_generate_thumbnail_for_image(image_extension: ImageExtension) -> bool { use ImageExtension::{ Avif, Bmp, Gif, Heic, Heics, Heif, Heifs, Ico, Jpeg, Jpg, Png, Svg, Webp, @@ -128,8 +209,291 @@ pub const fn can_generate_thumbnail_for_image(image_extension: ImageExtension) - ) } +#[must_use] pub const fn can_generate_thumbnail_for_document(document_extension: DocumentExtension) -> bool { use DocumentExtension::Pdf; matches!(document_extension, Pdf) } + +#[derive(Debug)] +pub enum GenerationStatus { + Generated, + Skipped, +} + +#[instrument(skip(thumbnails_directory, cas_id, should_regenerate, kind))] +pub async fn generate_thumbnail( + thumbnails_directory: &Path, + GenerateThumbnailArgs { + extension, + cas_id, + path, + }: &GenerateThumbnailArgs<'_>, + kind: &ThumbnailKind, + should_regenerate: bool, +) -> ( + Duration, + Result<(ThumbKey, GenerationStatus), thumbnailer::NonCriticalThumbnailerError>, +) { + trace!("Generating thumbnail"); + let start = Instant::now(); + + let mut output_path = match kind { + ThumbnailKind::Ephemeral => thumbnails_directory.join(EPHEMERAL_DIR), + ThumbnailKind::Indexed(library_id) => thumbnails_directory.join(library_id.to_string()), + }; + + output_path.push(get_shard_hex(cas_id)); + output_path.push(cas_id.as_str()); + output_path.set_extension(WEBP_EXTENSION); + + if let Err(e) = fs::metadata(&*output_path).await { + if e.kind() != io::ErrorKind::NotFound { + error!( + ?e, + "Failed to check if thumbnail exists, but we will try to generate it anyway;" + ); + } + // Otherwise we good, thumbnail doesn't exist so we can generate it + } else if !should_regenerate { + trace!("Skipping thumbnail generation because it already exists"); + return ( + start.elapsed(), + Ok(( + ThumbKey::new(cas_id.to_owned(), kind), + GenerationStatus::Skipped, + )), + ); + } + + if let Ok(extension) = ImageExtension::from_str(extension) { + if can_generate_thumbnail_for_image(extension) { + trace!("Generating image thumbnail"); + if let Err(e) = generate_image_thumbnail(&path, &output_path).await { + return (start.elapsed(), Err(e)); + } + trace!("Generated image thumbnail"); + } + } else if let Ok(extension) = DocumentExtension::from_str(extension) { + if can_generate_thumbnail_for_document(extension) { + trace!("Generating document thumbnail"); + if let Err(e) = generate_image_thumbnail(&path, &output_path).await { + return (start.elapsed(), Err(e)); + } + trace!("Generating document thumbnail"); + } + } + + #[cfg(feature = "ffmpeg")] + { + use crate::media_processor::helpers::thumbnailer::can_generate_thumbnail_for_video; + use sd_file_ext::extensions::VideoExtension; + + if let Ok(extension) = VideoExtension::from_str(extension) { + if can_generate_thumbnail_for_video(extension) { + trace!("Generating video thumbnail"); + if let Err(e) = generate_video_thumbnail(&path, &output_path).await { + return (start.elapsed(), Err(e)); + } + trace!("Generated video 
thumbnail"); + } + } + } + + trace!("Generated thumbnail"); + + ( + start.elapsed(), + Ok(( + ThumbKey::new(cas_id.to_owned(), kind), + GenerationStatus::Generated, + )), + ) +} + +fn inner_generate_image_thumbnail( + file_path: PathBuf, +) -> Result, thumbnailer::NonCriticalThumbnailerError> { + let mut img = format_image(&file_path).map_err(|e| { + thumbnailer::NonCriticalThumbnailerError::FormatImage(file_path.clone(), e.to_string()) + })?; + + let (w, h) = img.dimensions(); + + #[allow(clippy::cast_precision_loss)] + let (w_scaled, h_scaled) = scale_dimensions(w as f32, h as f32, TARGET_PX); + + // Optionally, resize the existing photo and convert back into DynamicImage + if w != w_scaled && h != h_scaled { + img = DynamicImage::ImageRgba8(imageops::resize( + &img, + w_scaled, + h_scaled, + imageops::FilterType::Triangle, + )); + } + + // this corrects the rotation/flip of the image based on the *available* exif data + // not all images have exif data, so we don't error. we also don't rotate HEIF as that's against the spec + if let Some(orientation) = Orientation::from_path(&file_path) { + if ConvertibleExtension::try_from(file_path.as_ref()) + .expect("we already checked if the image was convertible") + .should_rotate() + { + img = orientation.correct_thumbnail(img); + } + } + + // Create the WebP encoder for the above image + let encoder = Encoder::from_image(&img).map_err(|reason| { + thumbnailer::NonCriticalThumbnailerError::WebPEncoding(file_path, reason.to_string()) + })?; + + // Type `WebPMemory` is !Send, which makes the `Future` in this function `!Send`, + // this make us `deref` to have a `&[u8]` and then `to_owned` to make a `Vec` + // which implies on a unwanted clone... + Ok(encoder.encode(TARGET_QUALITY).deref().to_owned()) +} + +#[instrument( + skip_all, + fields( + input_path = %file_path.as_ref().display(), + output_path = %output_path.as_ref().display() + ) +)] +async fn generate_image_thumbnail( + file_path: impl AsRef + Send, + output_path: impl AsRef + Send, +) -> Result<(), thumbnailer::NonCriticalThumbnailerError> { + let file_path = file_path.as_ref().to_path_buf(); + + let (tx, rx) = oneshot::channel(); + + // Using channel instead of waiting the JoinHandle as for some reason + // the JoinHandle can take some extra time to complete + let handle = spawn_blocking({ + let file_path = file_path.clone(); + + move || { + // Handling error on receiver side + let _ = tx.send(inner_generate_image_thumbnail(file_path)); + } + }); + + let webp = if let Ok(res) = rx.await { + res? 
+ } else { + error!("Failed to generate thumbnail"); + return Err( + thumbnailer::NonCriticalThumbnailerError::PanicWhileGeneratingThumbnail( + file_path, + handle + .await + .expect_err("as the channel was closed, then the spawned task panicked") + .to_string(), + ), + ); + }; + + trace!("Generated thumbnail bytes"); + + let output_path = output_path.as_ref(); + + if let Some(shard_dir) = output_path.parent() { + fs::create_dir_all(shard_dir).await.map_err(|e| { + thumbnailer::NonCriticalThumbnailerError::CreateShardDirectory( + FileIOError::from((shard_dir, e)).to_string(), + ) + })?; + } else { + error!("Failed to get parent directory for sharding parent directory"); + } + + trace!("Created shard directory and writing it to disk"); + + let res = fs::write(output_path, &webp).await.map_err(|e| { + thumbnailer::NonCriticalThumbnailerError::SaveThumbnail( + file_path, + FileIOError::from((output_path, e)).to_string(), + ) + }); + + trace!("Wrote thumbnail to disk"); + res +} + +#[instrument( + skip_all, + fields( + input_path = %file_path.as_ref().display(), + output_path = %output_path.as_ref().display() + ) +)] +#[cfg(feature = "ffmpeg")] +async fn generate_video_thumbnail( + file_path: impl AsRef + Send, + output_path: impl AsRef + Send, +) -> Result<(), thumbnailer::NonCriticalThumbnailerError> { + use sd_ffmpeg::{to_thumbnail, ThumbnailSize}; + + let file_path = file_path.as_ref(); + + to_thumbnail( + file_path, + output_path, + ThumbnailSize::Scale(1024), + TARGET_QUALITY, + ) + .await + .map_err(|e| { + thumbnailer::NonCriticalThumbnailerError::VideoThumbnailGenerationFailed( + file_path.to_path_buf(), + e.to_string(), + ) + }) +} + +const HALF_SEC: Duration = Duration::from_millis(500); +static LAST_SINGLE_THUMB_GENERATED_LOCK: Lazy> = + Lazy::new(|| Mutex::new(Instant::now())); + +/// WARNING!!!! DON'T USE THIS FUNCTION IN A LOOP!!!!!!!!!!!!! It will be pretty slow on purpose! 
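// A minimal usage sketch for a one-off, ephemeral thumbnail (e.g. from an ad-hoc request
// handler), assuming `data_dir`, `cas_id` and `source_path` are already in hand. The 500ms
// throttle above makes back-to-back calls sleep, so batches should go through the
// thumbnailer tasks instead:
//
//     generate_single_thumbnail(
//         get_thumbnails_directory(data_dir),
//         "png".to_string(),
//         cas_id,
//         source_path,
//         ThumbnailKind::Ephemeral,
//     )
//     .await?;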
+pub async fn generate_single_thumbnail( + thumbnails_directory: impl AsRef + Send, + extension: String, + cas_id: CasId<'static>, + path: impl AsRef + Send, + kind: ThumbnailKind, +) -> Result<(), thumbnailer::NonCriticalThumbnailerError> { + let mut last_single_thumb_generated_guard = LAST_SINGLE_THUMB_GENERATED_LOCK.lock().await; + + let elapsed = Instant::now() - *last_single_thumb_generated_guard; + if elapsed < HALF_SEC { + // This will choke up in case someone try to use this method in a loop, otherwise + // it will consume all the machine resources like a gluton monster from hell + sleep(HALF_SEC - elapsed).await; + } + + let (_duration, res) = generate_thumbnail( + thumbnails_directory.as_ref(), + &GenerateThumbnailArgs { + extension, + cas_id, + path: path.as_ref().to_path_buf(), + }, + &kind, + false, + ) + .await; + + let (_thumb_key, status) = res?; + + if matches!(status, GenerationStatus::Generated) { + *last_single_thumb_generated_guard = Instant::now(); + drop(last_single_thumb_generated_guard); // Clippy was weirdly complaining about not doing an "early" drop here + } + + Ok(()) +} diff --git a/core/crates/heavy-lifting/src/media_processor/job.rs b/core/crates/heavy-lifting/src/media_processor/job.rs index a8e22cdb2..bab8e506c 100644 --- a/core/crates/heavy-lifting/src/media_processor/job.rs +++ b/core/crates/heavy-lifting/src/media_processor/job.rs @@ -3,25 +3,26 @@ use crate::{ job::{Job, JobReturn, JobTaskDispatcher, ReturnStatus}, report::ReportOutputMetadata, utils::cancel_pending_tasks, - SerializableJob, SerializedTasks, + DispatcherError, JobErrorOrDispatcherError, SerializableJob, SerializedTasks, }, media_processor::{self, helpers::thumbnailer::THUMBNAIL_CACHE_DIR_NAME}, - utils::sub_path::{self, maybe_get_iso_file_path_from_sub_path}, - Error, JobName, LocationScanState, OuterContext, ProgressUpdate, + utils::sub_path::maybe_get_iso_file_path_from_sub_path, + Error, JobContext, JobName, LocationScanState, OuterContext, ProgressUpdate, }; + use sd_core_file_path_helper::IsolatedFilePathData; use sd_core_prisma_helpers::file_path_for_media_processor; use sd_file_ext::extensions::Extension; use sd_prisma::prisma::{location, PrismaClient}; use sd_task_system::{ - AnyTaskOutput, IntoTask, SerializableTask, Task, TaskDispatcher, TaskHandle, TaskOutput, - TaskStatus, + AnyTaskOutput, IntoTask, SerializableTask, Task, TaskDispatcher, TaskHandle, TaskId, + TaskOutput, TaskStatus, TaskSystemError, }; -use sd_utils::db::maybe_missing; +use sd_utils::{db::maybe_missing, u64_to_frontend}; use std::{ - collections::HashMap, + collections::{HashMap, HashSet}, fmt, hash::{Hash, Hasher}, mem, @@ -36,12 +37,15 @@ use itertools::Itertools; use prisma_client_rust::{raw, PrismaValue}; use serde::{Deserialize, Serialize}; use serde_json::json; -use tracing::{debug, warn}; +use tracing::{debug, error, instrument, trace, warn, Level}; use super::{ - helpers, - tasks::{self, media_data_extractor, thumbnailer}, - NewThumbnailsReporter, BATCH_SIZE, + get_direct_children_files_by_extensions, helpers, + tasks::{ + self, media_data_extractor, + thumbnailer::{self, NewThumbnailReporter}, + }, + NewThumbnailsReporter, RawFilePathForMediaProcessor, BATCH_SIZE, }; #[derive(Debug, Clone, Copy, Serialize, Deserialize)] @@ -75,21 +79,24 @@ impl fmt::Display for Phase { #[derive(Debug)] pub struct MediaProcessor { + // Received arguments location: Arc, location_path: Arc, sub_path: Option, regenerate_thumbnails: bool, + // Job control + total_media_data_extraction_files: u64, 
total_media_data_extraction_tasks: u64, total_thumbnailer_tasks: u64, total_thumbnailer_files: u64, - phase: Phase, + // Run data metadata: Metadata, - errors: Vec, + // On shutdown data pending_tasks_on_resume: Vec>, tasks_for_shutdown: Vec>>, } @@ -97,15 +104,16 @@ pub struct MediaProcessor { impl Job for MediaProcessor { const NAME: JobName = JobName::MediaProcessor; - async fn resume_tasks( + async fn resume_tasks( &mut self, dispatcher: &JobTaskDispatcher, - ctx: &impl OuterContext, + ctx: &impl JobContext, SerializedTasks(serialized_tasks): SerializedTasks, ) -> Result<(), Error> { - let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); + let reporter: Arc = + Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); - self.pending_tasks_on_resume = dispatcher + if let Ok(tasks) = dispatcher .dispatch_many_boxed( rmp_serde::from_slice::)>>(&serialized_tasks) .map_err(media_processor::Error::from)? @@ -117,18 +125,17 @@ impl Job for MediaProcessor { TaskKind::MediaDataExtractor => { tasks::MediaDataExtractor::deserialize( &task_bytes, - Arc::clone(ctx.db()), + (Arc::clone(ctx.db()), Arc::clone(ctx.sync())), ) .await .map(IntoTask::into_task) } - TaskKind::Thumbnailer => tasks::Thumbnailer::deserialize( - &task_bytes, - Arc::clone(&reporter), - ) - .await - .map(IntoTask::into_task), + TaskKind::Thumbnailer => { + tasks::Thumbnailer::deserialize(&task_bytes, reporter) + .await + .map(IntoTask::into_task) + } } } }) @@ -137,20 +144,57 @@ impl Job for MediaProcessor { .await .map_err(media_processor::Error::from)?, ) - .await; + .await + { + self.pending_tasks_on_resume = tasks; + } else { + warn!("Failed to dispatch tasks to resume as job was already canceled"); + } Ok(()) } - async fn run( + #[instrument( + skip_all, + fields( + location_id = self.location.id, + location_path = ?self.location.path, + sub_path = ?self.sub_path.as_ref().map(|path| path.display()), + regenerate_thumbnails = self.regenerate_thumbnails, + ), + ret(level = Level::TRACE), + err, + )] + async fn run( mut self, dispatcher: JobTaskDispatcher, - ctx: Ctx, + ctx: impl JobContext, ) -> Result { let mut pending_running_tasks = FuturesUnordered::new(); - self.init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) - .await?; + match self + .init_or_resume(&mut pending_running_tasks, &ctx, &dispatcher) + .await + { + Ok(()) => { /* Everything is awesome! 
*/ } + Err(JobErrorOrDispatcherError::JobError(e)) => { + return Err(e.into()); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(_))) => { + return Ok(self.cancel_job(&mut pending_running_tasks).await); + } + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown(tasks))) => { + self.tasks_for_shutdown.extend(tasks); + + if pending_running_tasks.is_empty() { + // If no task managed to be dispatched, we can just shutdown + // otherwise we have to process handles below and wait for them to be shutdown too + return Ok(ReturnStatus::Shutdown( + SerializableJob::::serialize(self).await, + )); + } + } + } if let Some(res) = self.process_handles(&mut pending_running_tasks, &ctx).await { return res; @@ -158,7 +202,7 @@ impl Job for MediaProcessor { if !self.tasks_for_shutdown.is_empty() { return Ok(ReturnStatus::Shutdown( - SerializableJob::::serialize(self).await, + SerializableJob::::serialize(self).await, )); } @@ -204,6 +248,7 @@ impl MediaProcessor { location: Arc::new(location), sub_path, regenerate_thumbnails, + total_media_data_extraction_files: 0, total_media_data_extraction_tasks: 0, total_thumbnailer_tasks: 0, total_thumbnailer_files: 0, @@ -215,90 +260,166 @@ impl MediaProcessor { }) } - async fn init_or_resume( + #[allow(clippy::too_many_lines)] + async fn init_or_resume( &mut self, pending_running_tasks: &mut FuturesUnordered>, - ctx: &impl OuterContext, + job_ctx: &impl JobContext, dispatcher: &JobTaskDispatcher, - ) -> Result<(), media_processor::Error> { + ) -> Result<(), JobErrorOrDispatcherError> { // if we don't have any pending task, then this is a fresh job if self.pending_tasks_on_resume.is_empty() { let location_id = self.location.id; let location_path = &*self.location_path; - let iso_file_path = maybe_get_iso_file_path_from_sub_path( + let iso_file_path = maybe_get_iso_file_path_from_sub_path::( location_id, - &self.sub_path, + self.sub_path.as_ref(), &*self.location_path, - ctx.db(), + job_ctx.db(), ) .await? 
.map_or_else( || { IsolatedFilePathData::new(location_id, location_path, location_path, true) - .map_err(sub_path::Error::from) + .map_err(media_processor::Error::from) }, Ok, )?; - debug!( - "Searching for media files in location {location_id} at directory \"{iso_file_path}\"" - ); - // First we will dispatch all tasks for media data extraction so we have a nice reporting - let (total_media_data_extraction_files, task_handles) = - dispatch_media_data_extractor_tasks( - ctx.db(), - &iso_file_path, - &self.location_path, - dispatcher, - ) - .await?; - self.total_media_data_extraction_tasks = task_handles.len() as u64; - - pending_running_tasks.extend(task_handles); - - ctx.progress(vec![ - ProgressUpdate::TaskCount(total_media_data_extraction_files), - ProgressUpdate::Phase(self.phase.to_string()), - ProgressUpdate::Message(format!( - "Preparing to process {total_media_data_extraction_files} files in {} chunks", - self.total_media_data_extraction_tasks - )), - ]); + let media_data_extraction_tasks_res = self + .dispatch_media_data_extractor_tasks(&iso_file_path, dispatcher, job_ctx) + .await; // Now we dispatch thumbnailer tasks - let (total_thumbnailer_tasks, task_handles) = dispatch_thumbnailer_tasks( - &iso_file_path, - self.regenerate_thumbnails, - &self.location_path, - dispatcher, - ctx, - ) - .await?; - pending_running_tasks.extend(task_handles); + let thumbnailer_tasks_res = self + .dispatch_thumbnailer_tasks( + &iso_file_path, + self.regenerate_thumbnails, + dispatcher, + job_ctx, + ) + .await; - self.total_thumbnailer_tasks = total_thumbnailer_tasks; + match (media_data_extraction_tasks_res, thumbnailer_tasks_res) { + (Ok(media_data_extraction_task_handles), Ok(thumbnailer_task_handles)) => { + pending_running_tasks.extend( + media_data_extraction_task_handles + .into_iter() + .chain(thumbnailer_task_handles), + ); + } + + ( + Ok(task_handles), + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(e))), + ) + | ( + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(e))), + Ok(task_handles), + ) => { + pending_running_tasks.extend(task_handles); + return Err(JobErrorOrDispatcherError::Dispatcher( + DispatcherError::JobCanceled(e), + )); + } + + ( + Ok(task_handles), + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown(tasks))), + ) + | ( + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown(tasks))), + Ok(task_handles), + ) => { + self.tasks_for_shutdown.extend(tasks); + pending_running_tasks.extend(task_handles); + } + + ( + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown( + media_data_extraction_tasks, + ))), + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::Shutdown( + thumbnailer_tasks, + ))), + ) => { + self.tasks_for_shutdown.extend( + media_data_extraction_tasks + .into_iter() + .chain(thumbnailer_tasks), + ); + } + + ( + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(e))), + _, + ) + | ( + _, + Err(JobErrorOrDispatcherError::Dispatcher(DispatcherError::JobCanceled(e))), + ) => { + return Err(JobErrorOrDispatcherError::Dispatcher( + DispatcherError::JobCanceled(e), + )); + } + + (Err(JobErrorOrDispatcherError::JobError(e)), _) + | (_, Err(JobErrorOrDispatcherError::JobError(e))) => { + return Err(e.into()); + } + } } else { + let updates = match self.phase { + Phase::MediaDataExtraction => vec![ + ProgressUpdate::TaskCount(self.total_media_data_extraction_files), + ProgressUpdate::CompletedTaskCount( + self.metadata.media_data_metrics.extracted + + 
self.metadata.media_data_metrics.skipped, + ), + ProgressUpdate::Phase(self.phase.to_string()), + ProgressUpdate::Message(format!( + "Preparing to process {} files in {} chunks", + self.total_media_data_extraction_files, + self.total_media_data_extraction_tasks + )), + ], + Phase::ThumbnailGeneration => vec![ + ProgressUpdate::TaskCount(self.total_thumbnailer_files), + ProgressUpdate::CompletedTaskCount( + self.metadata.thumbnailer_metrics_acc.generated + + self.metadata.thumbnailer_metrics_acc.skipped, + ), + ProgressUpdate::Phase(self.phase.to_string()), + ProgressUpdate::Message(format!( + "Preparing to process {} files in {} chunks", + self.total_thumbnailer_files, self.total_thumbnailer_tasks + )), + ], + }; + + job_ctx.progress(updates).await; + pending_running_tasks.extend(mem::take(&mut self.pending_tasks_on_resume)); } Ok(()) } - async fn process_handles( + async fn process_handles( &mut self, pending_running_tasks: &mut FuturesUnordered>, - ctx: &impl OuterContext, + job_ctx: &impl JobContext, ) -> Option> { while let Some(task) = pending_running_tasks.next().await { match task { Ok(TaskStatus::Done((task_id, TaskOutput::Out(out)))) => { - self.process_task_output(task_id, out, ctx); + self.process_task_output(task_id, out, job_ctx).await; } Ok(TaskStatus::Done((task_id, TaskOutput::Empty))) => { - warn!("Task returned an empty output"); + warn!(%task_id, "Task returned an empty output;"); } Ok(TaskStatus::Shutdown(task)) => { @@ -306,19 +427,31 @@ impl MediaProcessor { } Ok(TaskStatus::Error(e)) => { - cancel_pending_tasks(&*pending_running_tasks).await; + cancel_pending_tasks(pending_running_tasks).await; return Some(Err(e)); } Ok(TaskStatus::Canceled | TaskStatus::ForcedAbortion) => { - cancel_pending_tasks(&*pending_running_tasks).await; + return Some(Ok(self.cancel_job(pending_running_tasks).await)); + } - return Some(Ok(ReturnStatus::Canceled)); + Err(TaskSystemError::TaskTimeout(task_id)) => { + warn!( + %task_id, + "Thumbnailer task timed out, we will keep processing the rest of the tasks;", + ); + self.errors.push( + media_processor::NonCriticalMediaProcessorError::Thumbnailer( + media_processor::NonCriticalThumbnailerError::TaskTimeout(task_id), + ) + .into(), + ); } Err(e) => { - cancel_pending_tasks(&*pending_running_tasks).await; + error!(?e, "Task System error;"); + cancel_pending_tasks(pending_running_tasks).await; return Some(Err(e.into())); } @@ -328,11 +461,11 @@ impl MediaProcessor { None } - fn process_task_output( + async fn process_task_output( &mut self, - task_id: uuid::Uuid, + task_id: TaskId, any_task_output: Box, - ctx: &impl OuterContext, + job_ctx: &impl JobContext, ) { if any_task_output.is::() { let media_data_extractor::Output { @@ -347,23 +480,29 @@ impl MediaProcessor { self.metadata.media_data_metrics.extracted += extracted; self.metadata.media_data_metrics.skipped += skipped; - self.metadata.media_data_metrics.db_read_time += db_read_time; - self.metadata.media_data_metrics.filtering_time += filtering_time; - self.metadata.media_data_metrics.extraction_time += extraction_time; - self.metadata.media_data_metrics.db_write_time += db_write_time; + self.metadata.media_data_metrics.mean_db_read_time += db_read_time; + self.metadata.media_data_metrics.mean_filtering_time += filtering_time; + self.metadata.media_data_metrics.mean_extraction_time += extraction_time; + self.metadata.media_data_metrics.mean_db_write_time += db_write_time; self.metadata.media_data_metrics.total_successful_tasks += 1; - self.errors.extend(errors); + if !errors.is_empty() { 
+ warn!(?errors, "Non critical errors while extracting media data;"); + self.errors.extend(errors); + } debug!( - "Processed {}/{} media data extraction tasks", + "Processed ({}/{}) media data extraction tasks, took: {:?};", self.metadata.media_data_metrics.total_successful_tasks, - self.total_media_data_extraction_tasks + self.total_media_data_extraction_tasks, + db_read_time + filtering_time + extraction_time + db_write_time, ); - ctx.progress(vec![ProgressUpdate::CompletedTaskCount( - self.metadata.media_data_metrics.extracted - + self.metadata.media_data_metrics.skipped, - )]); + job_ctx + .progress(vec![ProgressUpdate::CompletedTaskCount( + self.metadata.media_data_metrics.extracted + + self.metadata.media_data_metrics.skipped, + )]) + .await; if self.total_media_data_extraction_tasks == self.metadata.media_data_metrics.total_successful_tasks @@ -372,14 +511,16 @@ impl MediaProcessor { self.phase = Phase::ThumbnailGeneration; - ctx.progress(vec![ - ProgressUpdate::TaskCount(self.total_thumbnailer_files), - ProgressUpdate::Phase(self.phase.to_string()), - ProgressUpdate::Message(format!( - "Waiting for processing of {} thumbnails in {} tasks", - self.total_thumbnailer_files, self.total_thumbnailer_tasks - )), - ]); + job_ctx + .progress(vec![ + ProgressUpdate::TaskCount(self.total_thumbnailer_files), + ProgressUpdate::Phase(self.phase.to_string()), + ProgressUpdate::Message(format!( + "Waiting for processing of {} thumbnails in {} tasks", + self.total_thumbnailer_files, self.total_thumbnailer_tasks + )), + ]) + .await; } } else if any_task_output.is::() { let thumbnailer::Output { @@ -393,17 +534,30 @@ impl MediaProcessor { self.metadata.thumbnailer_metrics_acc.generated += generated; self.metadata.thumbnailer_metrics_acc.skipped += skipped; - self.metadata.thumbnailer_metrics_acc.total_time += total_time; + self.metadata.thumbnailer_metrics_acc.mean_total_time += total_time; self.metadata.thumbnailer_metrics_acc.mean_time_acc += mean_time_acc; self.metadata.thumbnailer_metrics_acc.std_dev_acc += std_dev_acc; self.metadata.thumbnailer_metrics_acc.total_successful_tasks += 1; - self.errors.extend(errors); + if !errors.is_empty() { + warn!(?errors, "Non critical errors while generating thumbnails;"); + self.errors.extend(errors); + } - ctx.progress(vec![ProgressUpdate::CompletedTaskCount( - self.metadata.thumbnailer_metrics_acc.generated - + self.metadata.thumbnailer_metrics_acc.skipped, - )]); + debug!( + "Processed ({}/{}) thumbnailer tasks, took: {total_time:?}", + self.metadata.thumbnailer_metrics_acc.total_successful_tasks, + self.total_thumbnailer_tasks + ); + + if matches!(self.phase, Phase::ThumbnailGeneration) { + job_ctx + .progress(vec![ProgressUpdate::CompletedTaskCount( + self.metadata.thumbnailer_metrics_acc.generated + + self.metadata.thumbnailer_metrics_acc.skipped, + )]) + .await; + } // if self.total_thumbnailer_tasks // == self.metadata.thumbnailer_metrics_acc.total_successful_tasks @@ -419,12 +573,207 @@ impl MediaProcessor { // "Waiting for processing of {} labels in {} tasks", // self.total_labeller_files, self.total_labeller_tasks // )), - // ]); + // ]).await; // } } else { unreachable!("Unexpected task output type: "); } } + + async fn cancel_job( + &mut self, + pending_running_tasks: &mut FuturesUnordered>, + ) -> ReturnStatus { + cancel_pending_tasks(pending_running_tasks).await; + + ReturnStatus::Canceled( + JobReturn::builder() + .with_metadata(mem::take(&mut self.metadata)) + .with_non_critical_errors(mem::take(&mut self.errors)) + .build(), + ) + } + + 
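// A worked note on the thumbnailer accumulators summed in `process_task_output` above,
// assuming `mean_time_acc` carries the sum of per-thumbnail times and `std_dev_acc` the
// sum of their squares, which is what the finalization in `ThumbnailerMetrics::from`
// below expects: mean = Σx / n and std_dev = sqrt(Σx² / n − mean²). For example,
// per-thumbnail times of 1s, 2s and 3s give n = 3, Σx = 6, Σx² = 14, so mean = 2s and
// std_dev = sqrt(14/3 − 4) ≈ 0.82s.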
#[instrument(skip_all, fields(parent_iso_file_path = %parent_iso_file_path.as_ref().display()))] + async fn dispatch_media_data_extractor_tasks( + &mut self, + parent_iso_file_path: &IsolatedFilePathData<'_>, + dispatcher: &JobTaskDispatcher, + job_ctx: &impl JobContext, + ) -> Result>, JobErrorOrDispatcherError> { + let db = job_ctx.db(); + let sync = job_ctx.sync(); + + let (extract_exif_file_paths, extract_ffmpeg_file_paths) = ( + get_all_children_files_by_extensions( + parent_iso_file_path, + &helpers::exif_media_data::AVAILABLE_EXTENSIONS, + db, + ), + get_all_children_files_by_extensions( + parent_iso_file_path, + &helpers::ffmpeg_media_data::AVAILABLE_EXTENSIONS, + db, + ), + ) + .try_join() + .await?; + + let files_count = (extract_exif_file_paths.len() + extract_ffmpeg_file_paths.len()) as u64; + + let tasks = extract_exif_file_paths + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(Iterator::collect::>) + .map(|chunked_file_paths| { + tasks::MediaDataExtractor::new_exif( + &chunked_file_paths, + parent_iso_file_path.location_id(), + Arc::clone(&self.location_path), + Arc::clone(db), + Arc::clone(sync), + ) + }) + .map(IntoTask::into_task) + .chain( + extract_ffmpeg_file_paths + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(Iterator::collect::>) + .map(|chunked_file_paths| { + tasks::MediaDataExtractor::new_ffmpeg( + &chunked_file_paths, + parent_iso_file_path.location_id(), + Arc::clone(&self.location_path), + Arc::clone(db), + Arc::clone(sync), + ) + }) + .map(IntoTask::into_task), + ) + .collect::>(); + + trace!( + tasks_count = tasks.len(), + %files_count, + "Dispatching media data extraction tasks;", + ); + + self.total_media_data_extraction_files = files_count; + self.total_media_data_extraction_tasks = tasks.len() as u64; + + job_ctx + .progress(vec![ + ProgressUpdate::TaskCount(self.total_media_data_extraction_files), + ProgressUpdate::Phase(self.phase.to_string()), + ProgressUpdate::Message(format!( + "Preparing to process {} files in {} chunks", + self.total_media_data_extraction_files, self.total_media_data_extraction_tasks + )), + ]) + .await; + + dispatcher + .dispatch_many_boxed(tasks) + .await + .map_err(Into::into) + } + + async fn dispatch_thumbnailer_tasks( + &mut self, + parent_iso_file_path: &IsolatedFilePathData<'_>, + should_regenerate: bool, + dispatcher: &JobTaskDispatcher, + ctx: &impl OuterContext, + ) -> Result>, JobErrorOrDispatcherError> { + let thumbnails_directory_path = + Arc::new(ctx.get_data_directory().join(THUMBNAIL_CACHE_DIR_NAME)); + let location_id = parent_iso_file_path.location_id(); + let library_id = ctx.id(); + let db = ctx.db(); + let reporter: Arc = + Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); + + let priority_file_paths = get_direct_children_files_by_extensions( + parent_iso_file_path, + &helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS, + db, + ) + .await?; + + let priority_file_path_ids = priority_file_paths + .iter() + .map(|file_path| file_path.id) + .collect::>(); + + let mut file_paths = get_all_children_files_by_extensions( + parent_iso_file_path, + &helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS, + db, + ) + .await?; + + file_paths.retain(|file_path| !priority_file_path_ids.contains(&file_path.id)); + + if priority_file_path_ids.is_empty() && file_paths.is_empty() { + return Ok(Vec::new()); + } + + let thumbs_count = (priority_file_paths.len() + file_paths.len()) as u64; + + let priority_tasks = priority_file_paths + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(|chunk| { + 
tasks::Thumbnailer::new_indexed( + Arc::clone(&thumbnails_directory_path), + &chunk.collect::>(), + (location_id, &self.location_path), + library_id, + should_regenerate, + true, + Arc::clone(&reporter), + ) + }) + .map(IntoTask::into_task) + .collect::>(); + + let non_priority_tasks = file_paths + .into_iter() + .chunks(BATCH_SIZE) + .into_iter() + .map(|chunk| { + tasks::Thumbnailer::new_indexed( + Arc::clone(&thumbnails_directory_path), + &chunk.collect::>(), + (location_id, &self.location_path), + library_id, + should_regenerate, + false, + Arc::clone(&reporter), + ) + }) + .map(IntoTask::into_task) + .collect::>(); + + debug!( + %thumbs_count, + priority_tasks_count = priority_tasks.len(), + non_priority_tasks_count = non_priority_tasks.len(), + "Dispatching thumbnails to be processed;", + ); + + self.total_thumbnailer_tasks = (priority_tasks.len() + non_priority_tasks.len()) as u64; + self.total_thumbnailer_files = thumbs_count; + + dispatcher + .dispatch_many_boxed(priority_tasks.into_iter().chain(non_priority_tasks)) + .await + .map_err(Into::into) + } } #[derive(Debug, Serialize, Deserialize, Default)] @@ -433,7 +782,7 @@ struct Metadata { thumbnailer_metrics_acc: ThumbnailerMetricsAccumulator, } -impl From for ReportOutputMetadata { +impl From for Vec { fn from( Metadata { media_data_metrics, @@ -442,19 +791,27 @@ impl From for ReportOutputMetadata { ) -> Self { let thumbnailer_metrics = ThumbnailerMetrics::from(thumbnailer_metrics_accumulator); - Self::Metrics(HashMap::from([ - // - // Media data extractor - // - ( - "media_data_extraction_metrics".into(), - json!(media_data_metrics), - ), - // - // Thumbnailer - // - ("thumbnailer_metrics".into(), json!(thumbnailer_metrics)), - ])) + vec![ + ReportOutputMetadata::MediaProcessor { + media_data_extracted: u64_to_frontend(media_data_metrics.extracted), + media_data_skipped: u64_to_frontend(media_data_metrics.skipped), + thumbnails_generated: u64_to_frontend(thumbnailer_metrics.generated), + thumbnails_skipped: u64_to_frontend(thumbnailer_metrics.skipped), + }, + ReportOutputMetadata::Metrics(HashMap::from([ + // + // Media data extractor + // + ( + "media_data_extraction_metrics".into(), + json!(media_data_metrics), + ), + // + // Thumbnailer + // + ("thumbnailer_metrics".into(), json!(thumbnailer_metrics)), + ])), + ] } } @@ -462,10 +819,10 @@ impl From for ReportOutputMetadata { struct MediaExtractorMetrics { extracted: u64, skipped: u64, - db_read_time: Duration, - filtering_time: Duration, - extraction_time: Duration, - db_write_time: Duration, + mean_db_read_time: Duration, + mean_filtering_time: Duration, + mean_extraction_time: Duration, + mean_db_write_time: Duration, total_successful_tasks: u64, } @@ -473,7 +830,7 @@ struct MediaExtractorMetrics { struct ThumbnailerMetricsAccumulator { generated: u64, skipped: u64, - total_time: Duration, + mean_total_time: Duration, mean_time_acc: f64, std_dev_acc: f64, total_successful_tasks: u64, @@ -483,7 +840,7 @@ struct ThumbnailerMetricsAccumulator { struct ThumbnailerMetrics { generated: u64, skipped: u64, - total_generation_time: Duration, + mean_total_time: Duration, mean_generation_time: Duration, std_dev: Duration, total_successful_tasks: u64, @@ -494,33 +851,104 @@ impl From for ThumbnailerMetrics { ThumbnailerMetricsAccumulator { generated, skipped, - total_time: total_generation_time, + mean_total_time, mean_time_acc: mean_generation_time_acc, std_dev_acc, total_successful_tasks, }: ThumbnailerMetricsAccumulator, ) -> Self { + if generated + skipped == 0 { + return Self { + 
generated, + skipped, + mean_total_time, + mean_generation_time: Duration::ZERO, + std_dev: Duration::ZERO, + total_successful_tasks, + }; + } + #[allow(clippy::cast_precision_loss)] // SAFETY: we're probably won't have 2^52 thumbnails being generated on a single job for this cast to have // a precision loss issue let total = (generated + skipped) as f64; let mean_generation_time = mean_generation_time_acc / total; - let std_dev = Duration::from_secs_f64( - (mean_generation_time.mul_add(-mean_generation_time, std_dev_acc / total)).sqrt(), - ); + let std_dev = if generated > 1 { + Duration::from_secs_f64( + (mean_generation_time.mul_add(-mean_generation_time, std_dev_acc / total)).sqrt(), + ) + } else { + Duration::ZERO + }; Self { generated, skipped, - total_generation_time, - mean_generation_time: Duration::from_secs_f64(mean_generation_time), + mean_total_time, + mean_generation_time: Duration::from_secs_f64(if generated > 1 { + mean_generation_time + } else { + mean_generation_time_acc + }), std_dev, total_successful_tasks, } } } +async fn get_all_children_files_by_extensions( + parent_iso_file_path: &IsolatedFilePathData<'_>, + extensions: &[Extension], + db: &PrismaClient, +) -> Result, media_processor::Error> { + // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite + // We have no data coming from the user, so this is sql injection safe + let unique_by_object_id = db + ._query_raw::(raw!( + &format!( + "SELECT + file_path.id, + file_path.materialized_path, + file_path.is_dir, + file_path.name, + file_path.extension, + file_path.cas_id, + object.id as 'object_id', + object.pub_id as 'object_pub_id' + FROM file_path + INNER JOIN object ON object.id = file_path.object_id + WHERE + file_path.location_id={{}} + AND file_path.cas_id IS NOT NULL + AND LOWER(file_path.extension) IN ({}) + AND file_path.materialized_path LIKE {{}} + ORDER BY materialized_path ASC, name ASC", + // Ordering by materialized_path so we can prioritize processing the first files + // in the above part of the directories tree + extensions + .iter() + .map(|ext| format!("LOWER('{ext}')")) + .collect::>() + .join(",") + ), + PrismaValue::Int(parent_iso_file_path.location_id()), + PrismaValue::String(format!( + "{}%", + parent_iso_file_path + .materialized_path_for_children() + .expect("sub path iso_file_path must be a directory") + )) + )) + .exec() + .await? + .into_iter() + .map(|raw_file_path| (raw_file_path.object_id, raw_file_path)) + .collect::>(); + + Ok(unique_by_object_id.into_values().map(Into::into).collect()) +} + #[derive(Serialize, Deserialize)] struct SaveState { location: Arc, @@ -528,6 +956,7 @@ struct SaveState { sub_path: Option, regenerate_thumbnails: bool, + total_media_data_extraction_files: u64, total_media_data_extraction_tasks: u64, total_thumbnailer_tasks: u64, total_thumbnailer_files: u64, @@ -541,13 +970,14 @@ struct SaveState { tasks_for_shutdown_bytes: Option, } -impl SerializableJob for MediaProcessor { +impl SerializableJob for MediaProcessor { async fn serialize(self) -> Result>, rmp_serde::encode::Error> { let Self { location, location_path, sub_path, regenerate_thumbnails, + total_media_data_extraction_files, total_media_data_extraction_tasks, total_thumbnailer_tasks, total_thumbnailer_files, @@ -558,54 +988,62 @@ impl SerializableJob for MediaProcessor { .. 
} = self; + let serialized_tasks = tasks_for_shutdown + .into_iter() + .map(|task| async move { + if task.is::() { + task.downcast::() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::MediaDataExtractor, bytes)) + } else if task.is::() { + task.downcast::() + .expect("just checked") + .serialize() + .await + .map(|bytes| (TaskKind::Thumbnailer, bytes)) + } else { + unreachable!("Unexpected task type: ") + } + }) + .collect::>() + .try_join() + .await?; + + let tasks_for_shutdown_bytes = if serialized_tasks.is_empty() { + None + } else { + Some(SerializedTasks(rmp_serde::to_vec_named(&serialized_tasks)?)) + }; + rmp_serde::to_vec_named(&SaveState { location, location_path, sub_path, regenerate_thumbnails, + total_media_data_extraction_files, total_media_data_extraction_tasks, total_thumbnailer_tasks, total_thumbnailer_files, phase, metadata, - tasks_for_shutdown_bytes: Some(SerializedTasks(rmp_serde::to_vec_named( - &tasks_for_shutdown - .into_iter() - .map(|task| async move { - if task.is::() { - task.downcast::() - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::MediaDataExtractor, bytes)) - } else if task.is::>>() { - task.downcast::>>() - .expect("just checked") - .serialize() - .await - .map(|bytes| (TaskKind::Thumbnailer, bytes)) - } else { - unreachable!("Unexpected task type") - } - }) - .collect::>() - .try_join() - .await?, - )?)), errors, + tasks_for_shutdown_bytes, }) .map(Some) } async fn deserialize( serialized_job: &[u8], - _: &Ctx, + _: &OuterCtx, ) -> Result)>, rmp_serde::decode::Error> { let SaveState { location, location_path, sub_path, regenerate_thumbnails, + total_media_data_extraction_files, total_media_data_extraction_tasks, total_thumbnailer_tasks, total_thumbnailer_files, @@ -621,6 +1059,7 @@ impl SerializableJob for MediaProcessor { location_path, sub_path, regenerate_thumbnails, + total_media_data_extraction_files, total_media_data_extraction_tasks, total_thumbnailer_tasks, total_thumbnailer_files, @@ -643,183 +1082,3 @@ impl Hash for MediaProcessor { } } } - -async fn dispatch_media_data_extractor_tasks( - db: &Arc, - parent_iso_file_path: &IsolatedFilePathData<'_>, - location_path: &Arc, - dispatcher: &JobTaskDispatcher, -) -> Result<(u64, Vec>), media_processor::Error> { - let (extract_exif_file_paths, extract_ffmpeg_file_paths) = ( - get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &helpers::exif_media_data::AVAILABLE_EXTENSIONS, - ), - get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &helpers::ffmpeg_media_data::AVAILABLE_EXTENSIONS, - ), - ) - .try_join() - .await?; - - let files_count = (extract_exif_file_paths.len() + extract_ffmpeg_file_paths.len()) as u64; - - let tasks = extract_exif_file_paths - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(Iterator::collect::>) - .map(|chunked_file_paths| { - tasks::MediaDataExtractor::new_exif( - &chunked_file_paths, - parent_iso_file_path.location_id(), - Arc::clone(location_path), - Arc::clone(db), - ) - }) - .map(IntoTask::into_task) - .chain( - extract_ffmpeg_file_paths - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(Iterator::collect::>) - .map(|chunked_file_paths| { - tasks::MediaDataExtractor::new_ffmpeg( - &chunked_file_paths, - parent_iso_file_path.location_id(), - Arc::clone(location_path), - Arc::clone(db), - ) - }) - .map(IntoTask::into_task), - ) - .collect::>(); - - Ok((files_count, dispatcher.dispatch_many_boxed(tasks).await)) -} - -async fn get_all_children_files_by_extensions( - db: 
&PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - extensions: &[Extension], -) -> Result, media_processor::Error> { - // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path LIKE {{}} - ORDER BY materialized_path ASC", - // Ordering by materialized_path so we can prioritize processing the first files - // in the above part of the directories tree - extensions - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(",") - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String(format!( - "{}%", - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - )) - )) - .exec() - .await - .map_err(Into::into) -} - -async fn dispatch_thumbnailer_tasks( - parent_iso_file_path: &IsolatedFilePathData<'_>, - should_regenerate: bool, - location_path: &PathBuf, - dispatcher: &JobTaskDispatcher, - ctx: &impl OuterContext, -) -> Result<(u64, Vec>), media_processor::Error> { - let thumbnails_directory_path = - Arc::new(ctx.get_data_directory().join(THUMBNAIL_CACHE_DIR_NAME)); - let location_id = parent_iso_file_path.location_id(); - let library_id = ctx.id(); - let db = ctx.db(); - let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); - - let mut file_paths = get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS, - ) - .await?; - - let thumbs_count = file_paths.len() as u64; - - let first_materialized_path = file_paths[0].materialized_path.clone(); - - // Only the first materialized_path should be processed with priority as the user must see the thumbnails ASAP - let different_materialized_path_idx = file_paths - .iter() - .position(|file_path| file_path.materialized_path != first_materialized_path); - - let non_priority_tasks = different_materialized_path_idx - .map(|idx| { - file_paths - .drain(idx..) 
- .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| { - tasks::Thumbnailer::new_indexed( - Arc::clone(&thumbnails_directory_path), - &chunk.collect::>(), - (location_id, location_path), - library_id, - should_regenerate, - false, - Arc::clone(&reporter), - ) - }) - .map(IntoTask::into_task) - .collect::>() - }) - .unwrap_or_default(); - - let priority_tasks = file_paths - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| { - tasks::Thumbnailer::new_indexed( - Arc::clone(&thumbnails_directory_path), - &chunk.collect::>(), - (location_id, location_path), - library_id, - should_regenerate, - true, - Arc::clone(&reporter), - ) - }) - .map(IntoTask::into_task) - .collect::>(); - - debug!( - "Dispatching {thumbs_count} thumbnails to be processed, {} with priority and {} without priority tasks", - priority_tasks.len(), - non_priority_tasks.len() - ); - - Ok(( - thumbs_count, - dispatcher - .dispatch_many_boxed(priority_tasks.into_iter().chain(non_priority_tasks)) - .await, - )) -} diff --git a/core/crates/heavy-lifting/src/media_processor/mod.rs b/core/crates/heavy-lifting/src/media_processor/mod.rs index 7197e686f..c21c2e3dc 100644 --- a/core/crates/heavy-lifting/src/media_processor/mod.rs +++ b/core/crates/heavy-lifting/src/media_processor/mod.rs @@ -1,11 +1,15 @@ use crate::{utils::sub_path, OuterContext, UpdateEvent}; -use sd_core_file_path_helper::FilePathError; +use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData}; +use sd_core_prisma_helpers::file_path_for_media_processor; +use sd_file_ext::extensions::Extension; +use sd_prisma::prisma::{file_path, object, PrismaClient}; use sd_utils::db::MissingFieldError; -use std::fmt; +use std::{collections::HashMap, fmt}; +use prisma_client_rust::{raw, PrismaValue}; use serde::{Deserialize, Serialize}; use specta::Type; @@ -19,10 +23,22 @@ pub use tasks::{ thumbnailer::{self, Thumbnailer}, }; -pub use helpers::thumbnailer::{ThumbKey, ThumbnailKind}; +pub use helpers::{ + exif_media_data, ffmpeg_media_data, + thumbnailer::{ + can_generate_thumbnail_for_document, can_generate_thumbnail_for_image, + generate_single_thumbnail, get_shard_hex, get_thumbnails_directory, GenerateThumbnailArgs, + ThumbKey, ThumbnailKind, WEBP_EXTENSION, + }, +}; + +#[cfg(feature = "ffmpeg")] +pub use helpers::thumbnailer::can_generate_thumbnail_for_video; + pub use shallow::shallow; -use self::thumbnailer::NewThumbnailReporter; +use media_data_extractor::NonCriticalMediaDataExtractorError; +use thumbnailer::{NewThumbnailReporter, NonCriticalThumbnailerError}; const BATCH_SIZE: usize = 10; @@ -43,31 +59,126 @@ pub enum Error { impl From for rspc::Error { fn from(e: Error) -> Self { - Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e) + match e { + Error::SubPath(sub_path_err) => sub_path_err.into(), + + _ => Self::with_cause(rspc::ErrorCode::InternalServerError, e.to_string(), e), + } } } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] -pub enum NonCriticalError { +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NonCriticalMediaProcessorError { #[error(transparent)] - MediaDataExtractor(#[from] media_data_extractor::NonCriticalError), + MediaDataExtractor(#[from] NonCriticalMediaDataExtractorError), #[error(transparent)] - Thumbnailer(#[from] thumbnailer::NonCriticalError), + Thumbnailer(#[from] NonCriticalThumbnailerError), } -struct NewThumbnailsReporter { - ctx: Ctx, +#[derive(Clone)] +pub struct NewThumbnailsReporter { + pub ctx: 
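// A sketch of the priority split that the removed `dispatch_thumbnailer_tasks` above
// performs: the query returns file paths ordered by materialized_path, so everything
// sharing the first materialized_path (the directory the user is currently viewing)
// becomes priority work, and the remainder is drained off into non-priority batches.
// `FilePath`, `BATCH`, and the chunking helper are illustrative stand-ins for the real
// prisma types, BATCH_SIZE, and the itertools-based chunking.
const BATCH: usize = 10;

#[derive(Debug, Clone)]
struct FilePath {
    id: i32,
    materialized_path: String,
}

fn chunked(file_paths: Vec<FilePath>) -> Vec<Vec<FilePath>> {
    file_paths
        .chunks(BATCH)
        .map(<[FilePath]>::to_vec)
        .collect()
}

fn split_by_priority(mut file_paths: Vec<FilePath>) -> (Vec<Vec<FilePath>>, Vec<Vec<FilePath>>) {
    if file_paths.is_empty() {
        return (Vec::new(), Vec::new());
    }

    let first = file_paths[0].materialized_path.clone();

    // Index of the first entry living in a different directory, if any.
    let split_at = file_paths
        .iter()
        .position(|file_path| file_path.materialized_path != first);

    let non_priority = split_at
        .map(|idx| chunked(file_paths.drain(idx..).collect()))
        .unwrap_or_default();

    (chunked(file_paths), non_priority)
}

fn main() {
    let file_paths = (0..25)
        .map(|id| FilePath {
            id,
            materialized_path: String::from(if id < 12 { "/photos/" } else { "/photos/trips/" }),
        })
        .collect();

    let (priority, non_priority) = split_by_priority(file_paths);
    assert_eq!(priority.iter().map(Vec::len).sum::<usize>(), 12);
    assert_eq!(non_priority.iter().map(Vec::len).sum::<usize>(), 13);
}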
OuterCtx, } -impl fmt::Debug for NewThumbnailsReporter { +impl fmt::Debug for NewThumbnailsReporter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("NewThumbnailsReporter").finish() } } -impl NewThumbnailReporter for NewThumbnailsReporter { +impl NewThumbnailReporter for NewThumbnailsReporter { fn new_thumbnail(&self, thumb_key: ThumbKey) { self.ctx - .report_update(UpdateEvent::NewThumbnailEvent { thumb_key }); + .report_update(UpdateEvent::NewThumbnail { thumb_key }); } } + +#[derive(Deserialize)] +struct RawFilePathForMediaProcessor { + id: file_path::id::Type, + materialized_path: file_path::materialized_path::Type, + is_dir: file_path::is_dir::Type, + name: file_path::name::Type, + extension: file_path::extension::Type, + cas_id: file_path::cas_id::Type, + object_id: object::id::Type, + object_pub_id: object::pub_id::Type, +} + +impl From for file_path_for_media_processor::Data { + fn from( + RawFilePathForMediaProcessor { + id, + materialized_path, + is_dir, + name, + extension, + cas_id, + object_id, + object_pub_id, + }: RawFilePathForMediaProcessor, + ) -> Self { + Self { + id, + materialized_path, + is_dir, + name, + extension, + cas_id, + object: Some(file_path_for_media_processor::object::Data { + id: object_id, + pub_id: object_pub_id, + }), + } + } +} + +async fn get_direct_children_files_by_extensions( + parent_iso_file_path: &IsolatedFilePathData<'_>, + extensions: &[Extension], + db: &PrismaClient, +) -> Result, Error> { + // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite + // We have no data coming from the user, so this is sql injection safe + let unique_by_object_id = db + ._query_raw::(raw!( + &format!( + "SELECT + file_path.id, + file_path.materialized_path, + file_path.is_dir, + file_path.name, + file_path.extension, + file_path.cas_id, + object.id as 'object_id', + object.pub_id as 'object_pub_id' + FROM file_path + INNER JOIN object ON object.id = file_path.object_id + WHERE + location_id={{}} + AND cas_id IS NOT NULL + AND LOWER(extension) IN ({}) + AND materialized_path = {{}} + ORDER BY name ASC", + extensions + .iter() + .map(|ext| format!("LOWER('{ext}')")) + .collect::>() + .join(",") + ), + PrismaValue::Int(parent_iso_file_path.location_id()), + PrismaValue::String( + parent_iso_file_path + .materialized_path_for_children() + .expect("sub path iso_file_path must be a directory") + ) + )) + .exec() + .await? 
+ .into_iter() + .map(|raw_file_path| (raw_file_path.object_id, raw_file_path)) + .collect::>(); + + Ok(unique_by_object_id.into_values().map(Into::into).collect()) +} diff --git a/core/crates/heavy-lifting/src/media_processor/shallow.rs b/core/crates/heavy-lifting/src/media_processor/shallow.rs index 2f3cd8322..b74c8c063 100644 --- a/core/crates/heavy-lifting/src/media_processor/shallow.rs +++ b/core/crates/heavy-lifting/src/media_processor/shallow.rs @@ -4,9 +4,8 @@ use crate::{ }; use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_media_processor; +use sd_core_sync::Manager as SyncManager; -use sd_file_ext::extensions::Extension; use sd_prisma::prisma::{location, PrismaClient}; use sd_task_system::{ BaseTaskDispatcher, CancelTaskOnDrop, IntoTask, TaskDispatcher, TaskHandle, TaskOutput, @@ -19,15 +18,18 @@ use std::{ sync::Arc, }; -use futures::StreamExt; -use futures_concurrency::future::{FutureGroup, TryJoin}; +use futures::{stream::FuturesUnordered, StreamExt}; +use futures_concurrency::future::TryJoin; use itertools::Itertools; -use prisma_client_rust::{raw, PrismaValue}; use tracing::{debug, warn}; use super::{ + get_direct_children_files_by_extensions, helpers::{self, exif_media_data, ffmpeg_media_data, thumbnailer::THUMBNAIL_CACHE_DIR_NAME}, - tasks::{self, media_data_extractor, thumbnailer}, + tasks::{ + self, media_data_extractor, + thumbnailer::{self, NewThumbnailReporter}, + }, NewThumbnailsReporter, BATCH_SIZE, }; @@ -35,8 +37,8 @@ use super::{ pub async fn shallow( location: location::Data, sub_path: impl AsRef + Send, - dispatcher: BaseTaskDispatcher, - ctx: impl OuterContext, + dispatcher: &BaseTaskDispatcher, + ctx: &impl OuterContext, ) -> Result, Error> { let sub_path = sub_path.as_ref(); @@ -47,14 +49,13 @@ pub async fn shallow( let location = Arc::new(location); - let sub_iso_file_path = maybe_get_iso_file_path_from_sub_path( + let sub_iso_file_path = maybe_get_iso_file_path_from_sub_path::( location.id, - &Some(sub_path), + Some(sub_path), &*location_path, ctx.db(), ) - .await - .map_err(media_processor::Error::from)? + .await? .map_or_else( || { IsolatedFilePathData::new(location.id, &*location_path, &*location_path, true) @@ -65,37 +66,70 @@ pub async fn shallow( let mut errors = vec![]; - let mut futures = dispatch_media_data_extractor_tasks( + let media_data_extraction_tasks = dispatch_media_data_extractor_tasks( ctx.db(), + ctx.sync(), &sub_iso_file_path, &location_path, - &dispatcher, + dispatcher, ) - .await? - .into_iter() - .map(CancelTaskOnDrop) - .chain( - dispatch_thumbnailer_tasks(&sub_iso_file_path, false, &location_path, &dispatcher, &ctx) - .await? 
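// A simplified sketch of the two ideas in `get_direct_children_files_by_extensions`
// above: the `IN (...)` list is interpolated with `format!` only because
// prisma-client-rust cannot bind a `Vec` for SQLite's `IN`, and it stays injection
// safe because the extensions come from a hard-coded allow-list rather than user
// input; the returned rows are then deduplicated per object by collecting into a
// `HashMap` keyed on object id. `Row` is a placeholder for the real raw-query result
// type, and no actual database call is made here.
use std::collections::HashMap;

struct Row {
    file_path_id: i32,
    object_id: i32,
}

fn build_in_clause(extensions: &[&str]) -> String {
    extensions
        .iter()
        .map(|ext| format!("LOWER('{ext}')"))
        .collect::<Vec<_>>()
        .join(",")
}

fn unique_by_object_id(rows: Vec<Row>) -> Vec<Row> {
    rows.into_iter()
        .map(|row| (row.object_id, row))
        // Later rows with the same object id overwrite earlier ones.
        .collect::<HashMap<_, _>>()
        .into_values()
        .collect()
}

fn main() {
    let clause = build_in_clause(&["jpg", "png", "heic"]);
    let query = format!(
        "SELECT file_path.id, object.id AS object_id \
         FROM file_path \
         INNER JOIN object ON object.id = file_path.object_id \
         WHERE LOWER(extension) IN ({clause}) AND materialized_path = ?"
    );
    println!("{query}");

    let rows = vec![
        Row { file_path_id: 1, object_id: 7 },
        Row { file_path_id: 2, object_id: 7 },
    ];
    assert_eq!(unique_by_object_id(rows).len(), 1);
}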
- .into_iter() - .map(CancelTaskOnDrop), - ) - .collect::>(); + .await?; + + let total_media_data_extraction_tasks = media_data_extraction_tasks.len(); + + let thumbnailer_tasks = + dispatch_thumbnailer_tasks(&sub_iso_file_path, false, &location_path, dispatcher, ctx) + .await?; + + let total_thumbnailer_tasks = thumbnailer_tasks.len(); + + let mut futures = media_data_extraction_tasks + .into_iter() + .chain(thumbnailer_tasks.into_iter()) + .map(CancelTaskOnDrop::new) + .collect::>(); + + let mut completed_media_data_extraction_tasks = 0; + let mut completed_thumbnailer_tasks = 0; while let Some(res) = futures.next().await { match res { Ok(TaskStatus::Done((_, TaskOutput::Out(out)))) => { if out.is::() { - errors.extend( - out.downcast::() - .expect("just checked") - .errors, + let media_data_extractor::Output { + db_read_time, + filtering_time, + extraction_time, + db_write_time, + errors: new_errors, + .. + } = *out + .downcast::() + .expect("just checked"); + + errors.extend(new_errors); + + completed_media_data_extraction_tasks += 1; + + debug!( + "Media data extraction task ({completed_media_data_extraction_tasks}/\ + {total_media_data_extraction_tasks}) completed in {:?};", + db_read_time + filtering_time + extraction_time + db_write_time ); } else if out.is::() { - errors.extend( - out.downcast::() - .expect("just checked") - .errors, + let thumbnailer::Output { + total_time, + errors: new_errors, + .. + } = *out.downcast::().expect("just checked"); + + errors.extend(new_errors); + + completed_thumbnailer_tasks += 1; + + debug!( + "Thumbnailer task ({completed_thumbnailer_tasks}/{total_thumbnailer_tasks}) \ + completed in {total_time:?};", ); } else { unreachable!( @@ -120,20 +154,21 @@ pub async fn shallow( async fn dispatch_media_data_extractor_tasks( db: &Arc, + sync: &Arc, parent_iso_file_path: &IsolatedFilePathData<'_>, location_path: &Arc, dispatcher: &BaseTaskDispatcher, -) -> Result>, media_processor::Error> { +) -> Result>, Error> { let (extract_exif_file_paths, extract_ffmpeg_file_paths) = ( - get_files_by_extensions( - db, + get_direct_children_files_by_extensions( parent_iso_file_path, &exif_media_data::AVAILABLE_EXTENSIONS, - ), - get_files_by_extensions( db, + ), + get_direct_children_files_by_extensions( parent_iso_file_path, &ffmpeg_media_data::AVAILABLE_EXTENSIONS, + db, ), ) .try_join() @@ -150,6 +185,7 @@ async fn dispatch_media_data_extractor_tasks( parent_iso_file_path.location_id(), Arc::clone(location_path), Arc::clone(db), + Arc::clone(sync), ) }) .map(IntoTask::into_task) @@ -165,47 +201,20 @@ async fn dispatch_media_data_extractor_tasks( parent_iso_file_path.location_id(), Arc::clone(location_path), Arc::clone(db), + Arc::clone(sync), ) }) .map(IntoTask::into_task), ) .collect::>(); - Ok(dispatcher.dispatch_many_boxed(tasks).await) -} - -async fn get_files_by_extensions( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - extensions: &[Extension], -) -> Result, media_processor::Error> { - // FIXME: Had to use format! 
macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path = {{}}", - extensions - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(",") - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String( - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - ) - )) - .exec() - .await - .map_err(Into::into) + dispatcher.dispatch_many_boxed(tasks).await.map_or_else( + |_| { + debug!("Task system is shutting down while a shallow media processor was in progress"); + Ok(vec![]) + }, + Ok, + ) } async fn dispatch_thumbnailer_tasks( @@ -214,18 +223,19 @@ async fn dispatch_thumbnailer_tasks( location_path: &PathBuf, dispatcher: &BaseTaskDispatcher, ctx: &impl OuterContext, -) -> Result>, media_processor::Error> { +) -> Result>, Error> { let thumbnails_directory_path = Arc::new(ctx.get_data_directory().join(THUMBNAIL_CACHE_DIR_NAME)); let location_id = parent_iso_file_path.location_id(); let library_id = ctx.id(); let db = ctx.db(); - let reporter = Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); + let reporter: Arc = + Arc::new(NewThumbnailsReporter { ctx: ctx.clone() }); - let file_paths = get_files_by_extensions( - db, + let file_paths = get_direct_children_files_by_extensions( parent_iso_file_path, &helpers::thumbnailer::ALL_THUMBNAILABLE_EXTENSIONS, + db, ) .await?; @@ -249,10 +259,13 @@ async fn dispatch_thumbnailer_tasks( .map(IntoTask::into_task) .collect::>(); - debug!( - "Dispatching {thumbs_count} thumbnails to be processed, in {} priority tasks", - tasks.len(), - ); + debug!(%thumbs_count, priority_tasks_count = tasks.len(), "Dispatching thumbnails to be processed;"); - Ok(dispatcher.dispatch_many_boxed(tasks).await) + dispatcher.dispatch_many_boxed(tasks).await.map_or_else( + |_| { + debug!("Task system is shutting down while a shallow media processor was in progress"); + Ok(vec![]) + }, + Ok, + ) } diff --git a/core/crates/heavy-lifting/src/media_processor/tasks/media_data_extractor.rs b/core/crates/heavy-lifting/src/media_processor/tasks/media_data_extractor.rs index 4a5f6661f..30072b1c1 100644 --- a/core/crates/heavy-lifting/src/media_processor/tasks/media_data_extractor.rs +++ b/core/crates/heavy-lifting/src/media_processor/tasks/media_data_extractor.rs @@ -7,7 +7,8 @@ use crate::{ }; use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_media_processor; +use sd_core_prisma_helpers::{file_path_for_media_processor, ObjectPubId}; +use sd_core_sync::Manager as SyncManager; use sd_media_metadata::{ExifMetadata, FFmpegMetadata}; use sd_prisma::prisma::{exif_data, ffmpeg_data, file_path, location, object, PrismaClient}; @@ -26,11 +27,22 @@ use std::{ time::Duration, }; -use futures::{FutureExt, StreamExt}; -use futures_concurrency::future::{FutureGroup, Race}; +use futures::{stream::FuturesUnordered, FutureExt, StreamExt}; +use futures_concurrency::future::Race; use serde::{Deserialize, Serialize}; use specta::Type; use tokio::time::Instant; +use tracing::{debug, instrument, trace, Level}; + +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +pub enum NonCriticalMediaDataExtractorError { + #[error("failed to extract 
media data from : {1}", .0.display())] + FailedToExtractImageMediaData(PathBuf, String), + #[error("file path missing object id: ")] + FilePathMissingObjectId(file_path::id::Type), + #[error("failed to construct isolated file path data: : {1}")] + FailedToConstructIsolatedFilePathData(file_path::id::Type, String), +} #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum Kind { @@ -40,14 +52,24 @@ enum Kind { #[derive(Debug)] pub struct MediaDataExtractor { + // Task control id: TaskId, kind: Kind, + + // Received input args file_paths: Vec, location_id: location::id::Type, location_path: Arc, + + // Inner state stage: Stage, - db: Arc, + + // Out collector output: Output, + + // Dependencies + db: Arc, + sync: Arc, } #[derive(Debug, Serialize, Deserialize)] @@ -55,74 +77,34 @@ enum Stage { Starting, FetchedObjectsAlreadyWithMediaData(Vec), ExtractingMediaData { - paths_by_id: HashMap, - exif_media_datas: Vec<(ExifMetadata, object::id::Type)>, + paths_by_id: HashMap, + exif_media_datas: Vec<(ExifMetadata, object::id::Type, ObjectPubId)>, ffmpeg_media_datas: Vec<(FFmpegMetadata, object::id::Type)>, extract_ids_to_remove_from_map: Vec, }, SaveMediaData { - exif_media_datas: Vec<(ExifMetadata, object::id::Type)>, + exif_media_datas: Vec<(ExifMetadata, object::id::Type, ObjectPubId)>, ffmpeg_media_datas: Vec<(FFmpegMetadata, object::id::Type)>, }, } -impl MediaDataExtractor { - fn new( - kind: Kind, - file_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: Arc, - db: Arc, - ) -> Self { - let mut output = Output::default(); - - Self { - id: TaskId::new_v4(), - kind, - file_paths: file_paths - .iter() - .filter(|file_path| { - if file_path.object_id.is_some() { - true - } else { - output.errors.push( - media_processor::NonCriticalError::from( - NonCriticalError::FilePathMissingObjectId(file_path.id), - ) - .into(), - ); - false - } - }) - .cloned() - .collect(), - location_id, - location_path, - stage: Stage::Starting, - db, - output, - } - } - - #[must_use] - pub fn new_exif( - file_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: Arc, - db: Arc, - ) -> Self { - Self::new(Kind::Exif, file_paths, location_id, location_path, db) - } - - #[must_use] - pub fn new_ffmpeg( - file_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: Arc, - db: Arc, - ) -> Self { - Self::new(Kind::FFmpeg, file_paths, location_id, location_path, db) - } +/// [`MediaDataExtractor`] task output +#[derive(Serialize, Deserialize, Default, Debug)] +pub struct Output { + /// How many files were successfully processed + pub extracted: u64, + /// How many files were skipped + pub skipped: u64, + /// Time spent reading data from database + pub db_read_time: Duration, + /// Time spent filtering files to extract media data and files to skip + pub filtering_time: Duration, + /// Time spent extracting media data + pub extraction_time: Duration, + /// Time spent writing media data to database + pub db_write_time: Duration, + /// Errors encountered during the task + pub errors: Vec, } #[async_trait::async_trait] @@ -138,6 +120,20 @@ impl Task for MediaDataExtractor { false } + #[instrument( + skip_all, + fields( + task_id = %self.id, + kind = ?self.kind, + location_id = %self.location_id, + location_path = %self.location_path.display(), + file_paths_count = %self.file_paths.len(), + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due 
to `err` on `instrument` macro above + #[allow(clippy::too_many_lines)] async fn run(&mut self, interrupter: &Interrupter) -> Result { loop { match &mut self.stage { @@ -150,18 +146,22 @@ impl Task for MediaDataExtractor { ) .await?; self.output.db_read_time = db_read_start.elapsed(); + trace!( + object_ids_count = object_ids.len(), + "Fetched objects already with media data;", + ); self.stage = Stage::FetchedObjectsAlreadyWithMediaData(object_ids); } Stage::FetchedObjectsAlreadyWithMediaData(objects_already_with_media_data) => { - let filtering_start = Instant::now(); if self.file_paths.len() == objects_already_with_media_data.len() { - // All files already have media data, skipping - self.output.skipped = self.file_paths.len() as u64; - + self.output.skipped = self.file_paths.len() as u64; // Files already have media data, skipping + debug!("Skipped all files as they already have media data"); break; } + + let filtering_start = Instant::now(); let paths_by_id = filter_files_to_extract_media_data( mem::take(objects_already_with_media_data), self.location_id, @@ -169,9 +169,13 @@ impl Task for MediaDataExtractor { &mut self.file_paths, &mut self.output, ); - self.output.filtering_time = filtering_start.elapsed(); + trace!( + paths_needing_media_data_extraction_count = paths_by_id.len(), + "Filtered files to extract media data;", + ); + self.stage = Stage::ExtractingMediaData { extract_ids_to_remove_from_map: Vec::with_capacity(paths_by_id.len()), exif_media_datas: if self.kind == Kind::Exif { @@ -241,8 +245,14 @@ impl Task for MediaDataExtractor { ffmpeg_media_datas, } => { let db_write_start = Instant::now(); - self.output.extracted = - save(self.kind, exif_media_datas, ffmpeg_media_datas, &self.db).await?; + self.output.extracted = save( + self.kind, + exif_media_datas, + ffmpeg_media_datas, + &self.db, + &self.sync, + ) + .await?; self.output.db_write_time = db_write_start.elapsed(); self.output.skipped += self.output.errors.len() as u64; @@ -258,91 +268,74 @@ impl Task for MediaDataExtractor { } } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] -pub enum NonCriticalError { - #[error("failed to extract media data from : {1}", .0.display())] - FailedToExtractImageMediaData(PathBuf, String), - #[error("file path missing object id: ")] - FilePathMissingObjectId(file_path::id::Type), - #[error("failed to construct isolated file path data: : {1}")] - FailedToConstructIsolatedFilePathData(file_path::id::Type, String), -} +impl MediaDataExtractor { + fn new( + kind: Kind, + file_paths: &[file_path_for_media_processor::Data], + location_id: location::id::Type, + location_path: Arc, + db: Arc, + sync: Arc, + ) -> Self { + let mut output = Output::default(); -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct Output { - pub extracted: u64, - pub skipped: u64, - pub db_read_time: Duration, - pub filtering_time: Duration, - pub extraction_time: Duration, - pub db_write_time: Duration, - pub errors: Vec, -} - -#[derive(Debug, Serialize, Deserialize)] -struct SaveState { - id: TaskId, - kind: Kind, - file_paths: Vec, - location_id: location::id::Type, - location_path: Arc, - stage: Stage, - output: Output, -} - -impl SerializableTask for MediaDataExtractor { - type SerializeError = rmp_serde::encode::Error; - - type DeserializeError = rmp_serde::decode::Error; - - type DeserializeCtx = Arc; - - async fn serialize(self) -> Result, Self::SerializeError> { - let Self { - id, + Self { + id: TaskId::new_v4(), kind, - file_paths, + file_paths: file_paths + .iter() + 
.filter(|file_path| { + if file_path.object.is_some() { + true + } else { + output.errors.push( + media_processor::NonCriticalMediaProcessorError::from( + NonCriticalMediaDataExtractorError::FilePathMissingObjectId( + file_path.id, + ), + ) + .into(), + ); + false + } + }) + .cloned() + .collect(), location_id, location_path, - stage, + stage: Stage::Starting, + db, + sync, output, - .. - } = self; - - rmp_serde::to_vec_named(&SaveState { - id, - kind, - file_paths, - location_id, - location_path, - stage, - output, - }) + } } - async fn deserialize( - data: &[u8], - db: Self::DeserializeCtx, - ) -> Result { - rmp_serde::from_slice(data).map( - |SaveState { - id, - kind, - file_paths, - location_id, - location_path, - stage, - output, - }| Self { - id, - kind, - file_paths, - location_id, - location_path, - stage, - db, - output, - }, + #[must_use] + pub fn new_exif( + file_paths: &[file_path_for_media_processor::Data], + location_id: location::id::Type, + location_path: Arc, + db: Arc, + sync: Arc, + ) -> Self { + Self::new(Kind::Exif, file_paths, location_id, location_path, db, sync) + } + + #[must_use] + pub fn new_ffmpeg( + file_paths: &[file_path_for_media_processor::Data], + location_id: location::id::Type, + location_path: Arc, + db: Arc, + sync: Arc, + ) -> Self { + Self::new( + Kind::FFmpeg, + file_paths, + location_id, + location_path, + db, + sync, ) } } @@ -355,7 +348,7 @@ async fn fetch_objects_already_with_media_data( ) -> Result, media_processor::Error> { let object_ids = file_paths .iter() - .filter_map(|file_path| file_path.object_id) + .filter_map(|file_path| file_path.object.as_ref().map(|object| object.id)) .collect(); match kind { @@ -388,7 +381,7 @@ fn filter_files_to_extract_media_data( Output { skipped, errors, .. }: &mut Output, -) -> HashMap { +) -> HashMap { let unique_objects_already_with_media_data = objects_already_with_media_data .into_iter() .collect::>(); @@ -397,7 +390,7 @@ fn filter_files_to_extract_media_data( file_paths.retain(|file_path| { !unique_objects_already_with_media_data - .contains(&file_path.object_id.expect("already checked")) + .contains(&file_path.object.as_ref().expect("already checked").id) }); file_paths @@ -406,8 +399,8 @@ fn filter_files_to_extract_media_data( IsolatedFilePathData::try_from((location_id, file_path)) .map_err(|e| { errors.push( - media_processor::NonCriticalError::from( - NonCriticalError::FailedToConstructIsolatedFilePathData( + media_processor::NonCriticalMediaProcessorError::from( + NonCriticalMediaDataExtractorError::FailedToConstructIsolatedFilePathData( file_path.id, e.to_string(), ), @@ -416,11 +409,14 @@ fn filter_files_to_extract_media_data( ); }) .map(|iso_file_path| { + let object = file_path.object.as_ref().expect("already checked"); + ( file_path.id, ( location_path.join(iso_file_path), - file_path.object_id.expect("already checked"), + object.id, + object.pub_id.as_slice().into(), ), ) }) @@ -430,13 +426,14 @@ fn filter_files_to_extract_media_data( } enum ExtractionOutputKind { - Exif(Result, media_processor::NonCriticalError>), - FFmpeg(Result), + Exif(Result, media_processor::NonCriticalMediaProcessorError>), + FFmpeg(Result), } struct ExtractionOutput { file_path_id: file_path::id::Type, object_id: object::id::Type, + object_pub_id: ObjectPubId, kind: ExtractionOutputKind, } @@ -453,23 +450,28 @@ enum InterruptRace { #[inline] fn prepare_extraction_futures<'a>( kind: Kind, - paths_by_id: &'a HashMap, + paths_by_id: &'a HashMap, interrupter: &'a Interrupter, -) -> FutureGroup + 'a> { +) -> 
FuturesUnordered + 'a> { paths_by_id .iter() - .map(|(file_path_id, (path, object_id))| async move { - InterruptRace::Processed(ExtractionOutput { - file_path_id: *file_path_id, - object_id: *object_id, - kind: match kind { - Kind::Exif => ExtractionOutputKind::Exif(exif_media_data::extract(path).await), - Kind::FFmpeg => { - ExtractionOutputKind::FFmpeg(ffmpeg_media_data::extract(path).await) - } - }, - }) - }) + .map( + |(file_path_id, (path, object_id, object_pub_id))| async move { + InterruptRace::Processed(ExtractionOutput { + file_path_id: *file_path_id, + object_id: *object_id, + object_pub_id: object_pub_id.clone(), + kind: match kind { + Kind::Exif => { + ExtractionOutputKind::Exif(exif_media_data::extract(path).await) + } + Kind::FFmpeg => { + ExtractionOutputKind::FFmpeg(ffmpeg_media_data::extract(path).await) + } + }, + }) + }, + ) .map(|fut| { ( fut, @@ -477,24 +479,28 @@ fn prepare_extraction_futures<'a>( ) .race() }) - .collect::>() + .collect::>() } +#[instrument(skip_all, fields(%file_path_id, %object_id))] #[inline] fn process_output( ExtractionOutput { file_path_id, object_id, + object_pub_id, kind, }: ExtractionOutput, - exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type)>, + exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type, ObjectPubId)>, ffmpeg_media_datas: &mut Vec<(FFmpegMetadata, object::id::Type)>, extract_ids_to_remove_from_map: &mut Vec, output: &mut Output, ) { + trace!("Processing extracted media data"); + match kind { ExtractionOutputKind::Exif(Ok(Some(exif_data))) => { - exif_media_datas.push((exif_data, object_id)); + exif_media_datas.push((exif_data, object_id, object_pub_id)); } ExtractionOutputKind::Exif(Ok(None)) => { // No exif media data found @@ -514,12 +520,85 @@ fn process_output( #[inline] async fn save( kind: Kind, - exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type)>, + exif_media_datas: &mut Vec<(ExifMetadata, object::id::Type, ObjectPubId)>, ffmpeg_media_datas: &mut Vec<(FFmpegMetadata, object::id::Type)>, db: &PrismaClient, + sync: &SyncManager, ) -> Result { + trace!("Saving media data on database"); + match kind { - Kind::Exif => exif_media_data::save(mem::take(exif_media_datas), db).await, + Kind::Exif => exif_media_data::save(mem::take(exif_media_datas), db, sync).await, Kind::FFmpeg => ffmpeg_media_data::save(mem::take(ffmpeg_media_datas), db).await, } + .map_err(Into::into) +} + +#[derive(Debug, Serialize, Deserialize)] +struct SaveState { + id: TaskId, + kind: Kind, + file_paths: Vec, + location_id: location::id::Type, + location_path: Arc, + stage: Stage, + output: Output, +} + +impl SerializableTask for MediaDataExtractor { + type SerializeError = rmp_serde::encode::Error; + + type DeserializeError = rmp_serde::decode::Error; + + type DeserializeCtx = (Arc, Arc); + + async fn serialize(self) -> Result, Self::SerializeError> { + let Self { + id, + kind, + file_paths, + location_id, + location_path, + stage, + output, + .. 
+ } = self; + + rmp_serde::to_vec_named(&SaveState { + id, + kind, + file_paths, + location_id, + location_path, + stage, + output, + }) + } + + async fn deserialize( + data: &[u8], + (db, sync): Self::DeserializeCtx, + ) -> Result { + rmp_serde::from_slice(data).map( + |SaveState { + id, + kind, + file_paths, + location_id, + location_path, + stage, + output, + }| Self { + id, + kind, + file_paths, + location_id, + location_path, + stage, + output, + db, + sync, + }, + ) + } } diff --git a/core/crates/heavy-lifting/src/media_processor/tasks/thumbnailer.rs b/core/crates/heavy-lifting/src/media_processor/tasks/thumbnailer.rs index c04fb6c55..0180014a9 100644 --- a/core/crates/heavy-lifting/src/media_processor/tasks/thumbnailer.rs +++ b/core/crates/heavy-lifting/src/media_processor/tasks/thumbnailer.rs @@ -12,8 +12,7 @@ use crate::{ media_processor::{ self, helpers::thumbnailer::{ - can_generate_thumbnail_for_document, can_generate_thumbnail_for_image, get_shard_hex, - EPHEMERAL_DIR, TARGET_PX, TARGET_QUALITY, THUMBNAIL_GENERATION_TIMEOUT, WEBP_EXTENSION, + generate_thumbnail, GenerateThumbnailArgs, GenerationStatus, THUMBNAILER_TASK_TIMEOUT, }, ThumbKey, ThumbnailKind, }, @@ -21,61 +20,31 @@ use crate::{ }; use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_media_processor; +use sd_core_prisma_helpers::{file_path_for_media_processor, CasId}; -use sd_file_ext::extensions::{DocumentExtension, ImageExtension}; -use sd_images::{format_image, scale_dimensions, ConvertibleExtension}; -use sd_media_metadata::exif::Orientation; use sd_prisma::prisma::{file_path, location}; use sd_task_system::{ ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput, SerializableTask, Task, TaskId, }; -use sd_utils::error::FileIOError; use std::{ collections::HashMap, fmt, future::IntoFuture, mem, - ops::Deref, path::{Path, PathBuf}, pin::pin, - str::FromStr, sync::Arc, time::Duration, }; -use futures::{FutureExt, StreamExt}; -use futures_concurrency::future::{FutureGroup, Race}; -use image::{imageops, DynamicImage, GenericImageView}; +use futures::{stream::FuturesUnordered, FutureExt, StreamExt}; +use futures_concurrency::future::Race; use serde::{Deserialize, Serialize}; use specta::Type; -use tokio::{ - fs, io, - task::spawn_blocking, - time::{sleep, Instant}, -}; -use tracing::{error, info, trace}; +use tokio::time::Instant; +use tracing::{error, instrument, trace, Level}; use uuid::Uuid; -use webp::Encoder; - -#[derive(Debug, Serialize, Deserialize)] -pub struct GenerateThumbnailArgs { - pub extension: String, - pub cas_id: String, - pub path: PathBuf, -} - -impl GenerateThumbnailArgs { - #[must_use] - pub const fn new(extension: String, cas_id: String, path: PathBuf) -> Self { - Self { - extension, - cas_id, - path, - } - } -} pub type ThumbnailId = u32; @@ -84,20 +53,29 @@ pub trait NewThumbnailReporter: Send + Sync + fmt::Debug + 'static { } #[derive(Debug)] -pub struct Thumbnailer { +pub struct Thumbnailer { + // Task control id: TaskId, - reporter: Arc, + with_priority: bool, + + // Received input args thumbs_kind: ThumbnailKind, thumbnails_directory_path: Arc, - thumbnails_to_generate: HashMap, - already_processed_ids: Vec, + thumbnails_to_generate: HashMap>, should_regenerate: bool, - with_priority: bool, + + // Inner state + already_processed_ids: Vec, + + // Out collector output: Output, + + // Dependencies + reporter: Arc, } #[async_trait::async_trait] -impl Task for Thumbnailer { +impl Task for Thumbnailer { fn id(&self) -> TaskId { self.id } @@ 
-107,9 +85,23 @@ impl Task for Thumbnailer { } fn with_timeout(&self) -> Option { - Some(Duration::from_secs(60 * 5)) // The entire task must not take more than 5 minutes + Some(THUMBNAILER_TASK_TIMEOUT) // The entire task must not take more than this constant } + #[instrument( + skip_all, + fields( + task_id = %self.id, + thumbs_kind = ?self.thumbs_kind, + should_regenerate = self.should_regenerate, + thumbnails_to_generate_count = self.thumbnails_to_generate.len(), + already_processed_ids_count = self.already_processed_ids.len(), + with_priority = self.with_priority, + ), + ret(level = Level::TRACE), + err, + )] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on `instrument` macro above async fn run(&mut self, interrupter: &Interrupter) -> Result { enum InterruptRace { Interrupted(InterruptionKind), @@ -135,38 +127,27 @@ impl Task for Thumbnailer { let start = Instant::now(); - let mut futures = pin!(thumbnails_to_generate + let futures = thumbnails_to_generate .iter() .map(|(id, generate_args)| { - let path = &generate_args.path; - + generate_thumbnail( + thumbnails_directory_path, + generate_args, + thumbs_kind, + *should_regenerate, + ) + .map(|res| InterruptRace::Processed((*id, res))) + }) + .map(|fut| { ( - generate_thumbnail( - thumbnails_directory_path, - generate_args, - thumbs_kind, - *should_regenerate, - ) - .map(|res| (*id, res)), - sleep(THUMBNAIL_GENERATION_TIMEOUT).map(|()| { - ( - *id, - ( - THUMBNAIL_GENERATION_TIMEOUT, - Err(NonCriticalError::ThumbnailGenerationTimeout(path.clone())), - ), - ) - }), + fut, + interrupter.into_future().map(InterruptRace::Interrupted), ) .race() - .map(InterruptRace::Processed) }) - .map(|fut| ( - fut, - interrupter.into_future().map(InterruptRace::Interrupted) - ) - .race()) - .collect::>()); + .collect::>(); + + let mut futures = pin!(futures); while let Some(race_output) = futures.next().await { match race_output { @@ -190,25 +171,25 @@ impl Task for Thumbnailer { output.total_time += start.elapsed(); - #[allow(clippy::cast_precision_loss)] - // SAFETY: we're probably won't have 2^52 thumbnails being generated on a single task for this cast to have - // a precision loss issue - let total = (output.generated + output.skipped) as f64; + if output.generated > 1 { + #[allow(clippy::cast_precision_loss)] + // SAFETY: we're probably won't have 2^52 thumbnails being generated on a single task for this cast to have + // a precision loss issue + let total = (output.generated + output.skipped) as f64; + let mean_generation_time_f64 = output.mean_time_acc / total; - let mean_generation_time = output.mean_time_acc / total; - - let generation_time_std_dev = Duration::from_secs_f64( - (mean_generation_time.mul_add(-mean_generation_time, output.std_dev_acc / total)) - .sqrt(), - ); - - info!( - "{{generated: {generated}, skipped: {skipped}}} thumbnails; \ - mean generation time: {mean_generation_time:?} ± {generation_time_std_dev:?}", - generated = output.generated, - skipped = output.skipped, - mean_generation_time = Duration::from_secs_f64(mean_generation_time) - ); + trace!( + generated = output.generated, + skipped = output.skipped, + "mean generation time: {mean_generation_time:?} ± {generation_time_std_dev:?};", + mean_generation_time = Duration::from_secs_f64(mean_generation_time_f64), + generation_time_std_dev = Duration::from_secs_f64( + (mean_generation_time_f64 + .mul_add(-mean_generation_time_f64, output.std_dev_acc / total)) + .sqrt(), + ) + ); + } Ok(ExecStatus::Done(mem::take(output).into_output())) } @@ -224,8 +205,8 @@ pub 
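// A minimal sketch (using tokio, futures, and futures-concurrency, the same crates the
// thumbnailer task above relies on) of its interruption pattern: every unit of work is
// mapped into a shared enum and raced against an "interrupter" future, so whichever
// side completes first decides whether the result is kept or the loop stops early.
// The sleep-based interrupter and `do_work` are stand-ins for the real task-system
// `Interrupter` and thumbnail generation.
use std::time::Duration;

use futures::FutureExt;
use futures_concurrency::future::Race;

enum RaceOutcome {
    Processed(u32),
    Interrupted,
}

async fn do_work(id: u32) -> u32 {
    // Pretend the work gets slower for each item so the last one gets interrupted.
    tokio::time::sleep(Duration::from_millis(10 * u64::from(id + 1))).await;
    id
}

#[tokio::main]
async fn main() {
    let interrupt_after = Duration::from_millis(25);

    for id in 0..3_u32 {
        // Both sides are mapped into the same enum so the tuple can be raced.
        let outcome = (
            do_work(id).map(RaceOutcome::Processed),
            tokio::time::sleep(interrupt_after).map(|()| RaceOutcome::Interrupted),
        )
            .race()
            .await;

        match outcome {
            RaceOutcome::Processed(id) => println!("item {id} finished"),
            RaceOutcome::Interrupted => {
                println!("interrupted, stopping early");
                break;
            }
        }
    }
}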
struct Output { pub std_dev_acc: f64, } -#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type)] -pub enum NonCriticalError { +#[derive(thiserror::Error, Debug, Serialize, Deserialize, Type, Clone)] +pub enum NonCriticalThumbnailerError { #[error("file path has no cas_id")] MissingCasId(file_path::id::Type), #[error("failed to extract isolated file path data from file path : {1}")] @@ -242,19 +223,19 @@ pub enum NonCriticalError { CreateShardDirectory(String), #[error("failed to save thumbnail : {1}", .0.display())] SaveThumbnail(PathBuf, String), - #[error("thumbnail generation timed out ", .0.display())] - ThumbnailGenerationTimeout(PathBuf), + #[error("task timed out: {0}")] + TaskTimeout(TaskId), } -impl Thumbnailer { +impl Thumbnailer { fn new( thumbs_kind: ThumbnailKind, thumbnails_directory_path: Arc, - thumbnails_to_generate: HashMap, + thumbnails_to_generate: HashMap>, errors: Vec, should_regenerate: bool, with_priority: bool, - reporter: Arc, + reporter: Arc, ) -> Self { Self { id: TaskId::new_v4(), @@ -275,8 +256,8 @@ impl Thumbnailer { #[must_use] pub fn new_ephemeral( thumbnails_directory_path: Arc, - thumbnails_to_generate: Vec, - reporter: Arc, + thumbnails_to_generate: Vec>, + reporter: Arc, ) -> Self { Self::new( ThumbnailKind::Ephemeral, @@ -308,7 +289,7 @@ impl Thumbnailer { library_id: Uuid, should_regenerate: bool, with_priority: bool, - reporter: Arc, + reporter: Arc, ) -> Self { let mut errors = Vec::new(); @@ -318,13 +299,18 @@ impl Thumbnailer { file_paths .iter() .filter_map(|file_path| { - if let Some(cas_id) = file_path.cas_id.as_ref() { + if let Some(cas_id) = file_path + .cas_id + .as_ref() + .map(CasId::from) + .map(CasId::into_owned) + { let file_path_id = file_path.id; IsolatedFilePathData::try_from((location_id, file_path)) .map_err(|e| { errors.push( - media_processor::NonCriticalError::from( - NonCriticalError::FailedToExtractIsolatedFilePathData( + media_processor::NonCriticalMediaProcessorError::from( + NonCriticalThumbnailerError::FailedToExtractIsolatedFilePathData( file_path_id, e.to_string(), ), @@ -336,8 +322,8 @@ impl Thumbnailer { .map(|iso_file_path| (file_path_id, cas_id, iso_file_path)) } else { errors.push( - media_processor::NonCriticalError::from( - NonCriticalError::MissingCasId(file_path.id), + media_processor::NonCriticalMediaProcessorError::from( + NonCriticalThumbnailerError::MissingCasId(file_path.id), ) .into(), ); @@ -354,7 +340,7 @@ impl Thumbnailer { file_path_id as u32, GenerateThumbnailArgs::new( iso_file_path.extension().to_string(), - cas_id.clone(), + cas_id, full_path, ), ) @@ -369,23 +355,74 @@ impl Thumbnailer { } } +#[instrument(skip_all, fields(thumb_id = id, %generated, %skipped, ?elapsed_time, ?res))] +fn process_thumbnail_generation_output( + (id, (elapsed_time, res)): ThumbnailGenerationOutput, + with_priority: bool, + reporter: &dyn NewThumbnailReporter, + already_processed_ids: &mut Vec, + Output { + generated, + skipped, + errors, + mean_time_acc: mean_generation_time_accumulator, + std_dev_acc: std_dev_accumulator, + .. 
+ }: &mut Output, +) { + let elapsed_time = elapsed_time.as_secs_f64(); + *mean_generation_time_accumulator += elapsed_time; + *std_dev_accumulator += elapsed_time * elapsed_time; + + match res { + Ok((thumb_key, status)) => { + match status { + GenerationStatus::Generated => { + *generated += 1; + } + GenerationStatus::Skipped => { + *skipped += 1; + } + } + + // This if is REALLY needed, due to the sheer performance of the thumbnailer, + // I restricted to only send events notifying for thumbnails in the current + // opened directory, sending events for the entire location turns into a + // humongous bottleneck in the frontend lol, since it doesn't even knows + // what to do with thumbnails for inner directories lol + // - fogodev + if with_priority { + reporter.new_thumbnail(thumb_key); + } + } + Err(e) => { + errors.push(media_processor::NonCriticalMediaProcessorError::from(e).into()); + *skipped += 1; + } + } + + already_processed_ids.push(id); + + trace!("Thumbnail processed"); +} + #[derive(Debug, Serialize, Deserialize)] struct SaveState { id: TaskId, thumbs_kind: ThumbnailKind, thumbnails_directory_path: Arc, - thumbnails_to_generate: HashMap, + thumbnails_to_generate: HashMap>, should_regenerate: bool, with_priority: bool, output: Output, } -impl SerializableTask for Thumbnailer { +impl SerializableTask for Thumbnailer { type SerializeError = rmp_serde::encode::Error; type DeserializeError = rmp_serde::decode::Error; - type DeserializeCtx = Arc; + type DeserializeCtx = Arc; async fn serialize(self) -> Result, Self::SerializeError> { let Self { @@ -443,235 +480,10 @@ impl SerializableTask for Thumbnailer, + Result<(ThumbKey, GenerationStatus), NonCriticalThumbnailerError>, ), ); - -fn process_thumbnail_generation_output( - (id, (elapsed_time, res)): ThumbnailGenerationOutput, - with_priority: bool, - reporter: &impl NewThumbnailReporter, - already_processed_ids: &mut Vec, - Output { - generated, - skipped, - errors, - mean_time_acc: mean_generation_time_accumulator, - std_dev_acc: std_dev_accumulator, - .. 
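// A small self-contained sketch of the running-statistics trick used by
// `process_thumbnail_generation_output` above: only two accumulators are kept, the sum
// of generation times and the sum of their squares, and the summary derives
// mean = sum / n and std_dev = sqrt(sum_sq / n - mean^2). `TimingStats` is an
// illustrative stand-in for the `mean_time_acc` / `std_dev_acc` fields on `Output`.
#[derive(Default)]
struct TimingStats {
    mean_time_acc: f64, // sum of observed times, in seconds
    std_dev_acc: f64,   // sum of squared times, in seconds^2
    count: u64,
}

impl TimingStats {
    fn record(&mut self, secs: f64) {
        self.mean_time_acc += secs;
        self.std_dev_acc += secs * secs;
        self.count += 1;
    }

    fn summary(&self) -> Option<(f64, f64)> {
        if self.count == 0 {
            return None;
        }
        let n = self.count as f64;
        let mean = self.mean_time_acc / n;
        // E[t^2] - E[t]^2, clamped so float error cannot produce a negative radicand.
        let variance = (self.std_dev_acc / n - mean * mean).max(0.0);
        Some((mean, variance.sqrt()))
    }
}

fn main() {
    let mut stats = TimingStats::default();
    for secs in [0.8, 1.0, 1.2] {
        stats.record(secs);
    }
    let (mean, std_dev) = stats.summary().expect("three samples were recorded");
    assert!((mean - 1.0).abs() < 1e-9);
    assert!((std_dev - (0.08_f64 / 3.0).sqrt()).abs() < 1e-9);
}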
- }: &mut Output, -) { - let elapsed_time = elapsed_time.as_secs_f64(); - *mean_generation_time_accumulator += elapsed_time; - *std_dev_accumulator += elapsed_time * elapsed_time; - - match res { - Ok((thumb_key, status)) => { - match status { - GenerationStatus::Generated => { - *generated += 1; - } - GenerationStatus::Skipped => { - *skipped += 1; - } - } - - // This if is REALLY needed, due to the sheer performance of the thumbnailer, - // I restricted to only send events notifying for thumbnails in the current - // opened directory, sending events for the entire location turns into a - // humongous bottleneck in the frontend lol, since it doesn't even knows - // what to do with thumbnails for inner directories lol - // - fogodev - if with_priority { - reporter.new_thumbnail(thumb_key); - } - } - Err(e) => { - errors.push(media_processor::NonCriticalError::from(e).into()); - *skipped += 1; - } - } - - already_processed_ids.push(id); -} - -async fn generate_thumbnail( - thumbnails_directory: &Path, - GenerateThumbnailArgs { - extension, - cas_id, - path, - }: &GenerateThumbnailArgs, - kind: &ThumbnailKind, - should_regenerate: bool, -) -> ( - Duration, - Result<(ThumbKey, GenerationStatus), NonCriticalError>, -) { - trace!("Generating thumbnail for {}", path.display()); - let start = Instant::now(); - - let mut output_path = match kind { - ThumbnailKind::Ephemeral => thumbnails_directory.join(EPHEMERAL_DIR), - ThumbnailKind::Indexed(library_id) => thumbnails_directory.join(library_id.to_string()), - }; - - output_path.push(get_shard_hex(cas_id)); - output_path.push(cas_id); - output_path.set_extension(WEBP_EXTENSION); - - if let Err(e) = fs::metadata(&*output_path).await { - if e.kind() != io::ErrorKind::NotFound { - error!( - "Failed to check if thumbnail exists, but we will try to generate it anyway: {e:#?}" - ); - } - // Otherwise we good, thumbnail doesn't exist so we can generate it - } else if !should_regenerate { - trace!( - "Skipping thumbnail generation for {} because it already exists", - path.display() - ); - return ( - start.elapsed(), - Ok((ThumbKey::new(cas_id, kind), GenerationStatus::Skipped)), - ); - } - - if let Ok(extension) = ImageExtension::from_str(extension) { - if can_generate_thumbnail_for_image(extension) { - if let Err(e) = generate_image_thumbnail(&path, &output_path).await { - return (start.elapsed(), Err(e)); - } - } - } else if let Ok(extension) = DocumentExtension::from_str(extension) { - if can_generate_thumbnail_for_document(extension) { - if let Err(e) = generate_image_thumbnail(&path, &output_path).await { - return (start.elapsed(), Err(e)); - } - } - } - - #[cfg(feature = "ffmpeg")] - { - use crate::media_processor::helpers::thumbnailer::can_generate_thumbnail_for_video; - use sd_file_ext::extensions::VideoExtension; - - if let Ok(extension) = VideoExtension::from_str(extension) { - if can_generate_thumbnail_for_video(extension) { - if let Err(e) = generate_video_thumbnail(&path, &output_path).await { - return (start.elapsed(), Err(e)); - } - } - } - } - - trace!("Generated thumbnail for {}", path.display()); - - ( - start.elapsed(), - Ok((ThumbKey::new(cas_id, kind), GenerationStatus::Generated)), - ) -} - -async fn generate_image_thumbnail( - file_path: impl AsRef + Send, - output_path: impl AsRef + Send, -) -> Result<(), NonCriticalError> { - let file_path = file_path.as_ref().to_path_buf(); - - let webp = spawn_blocking({ - let file_path = file_path.clone(); - - move || -> Result<_, NonCriticalError> { - let mut img = format_image(&file_path) - 
.map_err(|e| NonCriticalError::FormatImage(file_path.clone(), e.to_string()))?; - - let (w, h) = img.dimensions(); - - #[allow(clippy::cast_precision_loss)] - let (w_scaled, h_scaled) = scale_dimensions(w as f32, h as f32, TARGET_PX); - - // Optionally, resize the existing photo and convert back into DynamicImage - if w != w_scaled && h != h_scaled { - img = DynamicImage::ImageRgba8(imageops::resize( - &img, - w_scaled, - h_scaled, - imageops::FilterType::Triangle, - )); - } - - // this corrects the rotation/flip of the image based on the *available* exif data - // not all images have exif data, so we don't error. we also don't rotate HEIF as that's against the spec - if let Some(orientation) = Orientation::from_path(&file_path) { - if ConvertibleExtension::try_from(file_path.as_ref()) - .expect("we already checked if the image was convertible") - .should_rotate() - { - img = orientation.correct_thumbnail(img); - } - } - - // Create the WebP encoder for the above image - let encoder = Encoder::from_image(&img) - .map_err(|reason| NonCriticalError::WebPEncoding(file_path, reason.to_string()))?; - - // Type `WebPMemory` is !Send, which makes the `Future` in this function `!Send`, - // this make us `deref` to have a `&[u8]` and then `to_owned` to make a `Vec` - // which implies on a unwanted clone... - Ok(encoder.encode(TARGET_QUALITY).deref().to_owned()) - } - }) - .await - .map_err(|e| { - NonCriticalError::PanicWhileGeneratingThumbnail(file_path.clone(), e.to_string()) - })??; - - let output_path = output_path.as_ref(); - - if let Some(shard_dir) = output_path.parent() { - fs::create_dir_all(shard_dir).await.map_err(|e| { - NonCriticalError::CreateShardDirectory(FileIOError::from((shard_dir, e)).to_string()) - })?; - } else { - error!( - "Failed to get parent directory of '{}' for sharding parent directory", - output_path.display() - ); - } - - fs::write(output_path, &webp).await.map_err(|e| { - NonCriticalError::SaveThumbnail(file_path, FileIOError::from((output_path, e)).to_string()) - }) -} - -#[cfg(feature = "ffmpeg")] -async fn generate_video_thumbnail( - file_path: impl AsRef + Send, - output_path: impl AsRef + Send, -) -> Result<(), NonCriticalError> { - use sd_ffmpeg::{to_thumbnail, ThumbnailSize}; - - let file_path = file_path.as_ref(); - - to_thumbnail( - file_path, - output_path, - ThumbnailSize::Scale(1024), - TARGET_QUALITY, - ) - .await - .map_err(|e| { - NonCriticalError::VideoThumbnailGenerationFailed(file_path.to_path_buf(), e.to_string()) - }) -} diff --git a/core/crates/heavy-lifting/src/utils/sub_path.rs b/core/crates/heavy-lifting/src/utils/sub_path.rs index f9e607b41..ea188e133 100644 --- a/core/crates/heavy-lifting/src/utils/sub_path.rs +++ b/core/crates/heavy-lifting/src/utils/sub_path.rs @@ -1,4 +1,3 @@ -use rspc::ErrorCode; use sd_core_file_path_helper::{ ensure_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, FilePathError, IsolatedFilePathData, @@ -9,6 +8,7 @@ use sd_prisma::prisma::{location, PrismaClient}; use std::path::{Path, PathBuf}; use prisma_client_rust::QueryError; +use rspc::ErrorCode; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -23,66 +23,91 @@ pub enum Error { } impl From for rspc::Error { - fn from(err: Error) -> Self { - match err { - Error::SubPathNotFound(_) => { - Self::with_cause(ErrorCode::NotFound, err.to_string(), err) + fn from(e: Error) -> Self { + match e { + Error::SubPathNotFound(_) => Self::with_cause(ErrorCode::NotFound, e.to_string(), e), + + _ => 
Self::with_cause(ErrorCode::InternalServerError, e.to_string(), e), + } + } +} + +pub async fn get_full_path_from_sub_path>( + location_id: location::id::Type, + sub_path: Option + Send + Sync>, + location_path: impl AsRef + Send, + db: &PrismaClient, +) -> Result { + async fn inner( + location_id: location::id::Type, + sub_path: Option<&Path>, + location_path: &Path, + db: &PrismaClient, + ) -> Result { + match sub_path { + Some(sub_path) if sub_path != Path::new("") => { + let full_path = ensure_sub_path_is_in_location(location_path, sub_path).await?; + + ensure_sub_path_is_directory(location_path, sub_path).await?; + + ensure_file_path_exists( + sub_path, + &IsolatedFilePathData::new(location_id, location_path, &full_path, true)?, + db, + Error::SubPathNotFound, + ) + .await?; + + Ok(full_path) } - - _ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => Ok(location_path.to_path_buf()), } } + + inner( + location_id, + sub_path.as_ref().map(AsRef::as_ref), + location_path.as_ref(), + db, + ) + .await + .map_err(E::from) } -pub async fn get_full_path_from_sub_path( +pub async fn maybe_get_iso_file_path_from_sub_path>( location_id: location::id::Type, - sub_path: &Option + Send + Sync>, + sub_path: Option + Send + Sync>, location_path: impl AsRef + Send, db: &PrismaClient, -) -> Result { - let location_path = location_path.as_ref(); +) -> Result>, E> { + async fn inner( + location_id: location::id::Type, + sub_path: Option<&Path>, + location_path: &Path, + db: &PrismaClient, + ) -> Result>, Error> { + match sub_path { + Some(sub_path) if sub_path != Path::new("") => { + let full_path = ensure_sub_path_is_in_location(location_path, sub_path).await?; + ensure_sub_path_is_directory(location_path, sub_path).await?; - match sub_path { - Some(sub_path) if sub_path.as_ref() != Path::new("") => { - let sub_path = sub_path.as_ref(); - let full_path = ensure_sub_path_is_in_location(location_path, sub_path).await?; + let sub_iso_file_path = + IsolatedFilePathData::new(location_id, location_path, &full_path, true)?; - ensure_sub_path_is_directory(location_path, sub_path).await?; - - ensure_file_path_exists( - sub_path, - &IsolatedFilePathData::new(location_id, location_path, &full_path, true)?, - db, - Error::SubPathNotFound, - ) - .await?; - - Ok(full_path) + ensure_file_path_exists(sub_path, &sub_iso_file_path, db, Error::SubPathNotFound) + .await + .map(|()| Some(sub_iso_file_path)) + } + _ => Ok(None), } - _ => Ok(location_path.to_path_buf()), - } -} - -pub async fn maybe_get_iso_file_path_from_sub_path( - location_id: location::id::Type, - sub_path: &Option + Send + Sync>, - location_path: impl AsRef + Send, - db: &PrismaClient, -) -> Result>, Error> { - let location_path = location_path.as_ref(); - - match sub_path { - Some(sub_path) if sub_path.as_ref() != Path::new("") => { - let full_path = ensure_sub_path_is_in_location(location_path, sub_path).await?; - ensure_sub_path_is_directory(location_path, sub_path).await?; - - let sub_iso_file_path = - IsolatedFilePathData::new(location_id, location_path, &full_path, true)?; - - ensure_file_path_exists(sub_path, &sub_iso_file_path, db, Error::SubPathNotFound) - .await - .map(|()| Some(sub_iso_file_path)) - } - _ => Ok(None), } + + inner( + location_id, + sub_path.as_ref().map(AsRef::as_ref), + location_path.as_ref(), + db, + ) + .await + .map_err(E::from) } diff --git a/core/crates/indexer-rules/src/lib.rs b/core/crates/indexer-rules/src/lib.rs index e74198e4e..f239e6a27 100644 --- a/core/crates/indexer-rules/src/lib.rs 
+++ b/core/crates/indexer-rules/src/lib.rs @@ -51,15 +51,15 @@ use rspc::ErrorCode; use specta::Type; use thiserror::Error; -use tokio::{fs, sync::RwLock}; -use tracing::debug; +use tokio::fs; +use tracing::{debug, instrument, trace}; use uuid::Uuid; pub mod seed; mod serde_impl; #[derive(Error, Debug)] -pub enum IndexerRuleError { +pub enum Error { // User errors #[error("invalid indexer rule kind integer: {0}")] InvalidRuleKindInt(i32), @@ -83,16 +83,14 @@ pub enum IndexerRuleError { MissingField(#[from] MissingFieldError), } -impl From for rspc::Error { - fn from(err: IndexerRuleError) -> Self { - match err { - IndexerRuleError::InvalidRuleKindInt(_) - | IndexerRuleError::Glob(_) - | IndexerRuleError::NonUtf8Path(_) => { - Self::with_cause(ErrorCode::BadRequest, err.to_string(), err) +impl From for rspc::Error { + fn from(e: Error) -> Self { + match e { + Error::InvalidRuleKindInt(_) | Error::Glob(_) | Error::NonUtf8Path(_) => { + Self::with_cause(ErrorCode::BadRequest, e.to_string(), e) } - _ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => Self::with_cause(ErrorCode::InternalServerError, e.to_string(), e), } } } @@ -113,21 +111,17 @@ pub struct IndexerRuleCreateArgs { } impl IndexerRuleCreateArgs { - pub async fn create( - self, - db: &PrismaClient, - ) -> Result, IndexerRuleError> { + #[instrument(skip_all, fields(name = %self.name, rules = ?self.rules), err)] + pub async fn create(self, db: &PrismaClient) -> Result, Error> { use indexer_rule::{date_created, date_modified, name, rules_per_kind}; debug!( - "{} a new indexer rule (name = {}, params = {:?})", + "{} a new indexer rule", if self.dry_run { "Dry run: Would create" } else { "Trying to create" }, - self.name, - self.rules ); let rules_data = rmp_serde::to_vec_named( @@ -167,7 +161,7 @@ impl IndexerRuleCreateArgs { Ok(Some( db.indexer_rule() .create( - sd_utils::uuid_to_bytes(generate_pub_id()), + sd_utils::uuid_to_bytes(&generate_pub_id()), vec![ name::set(Some(self.name)), rules_per_kind::set(Some(rules_data)), @@ -224,7 +218,7 @@ impl RulePerKind { fn new_files_by_globs_str_and_kind( globs_str: impl IntoIterator>, kind_fn: impl Fn(Vec, GlobSet) -> Self, - ) -> Result { + ) -> Result { globs_str .into_iter() .map(|s| s.as_ref().parse::()) @@ -245,13 +239,13 @@ impl RulePerKind { pub fn new_accept_files_by_globs_str( globs_str: impl IntoIterator>, - ) -> Result { + ) -> Result { Self::new_files_by_globs_str_and_kind(globs_str, Self::AcceptFilesByGlob) } pub fn new_reject_files_by_globs_str( globs_str: impl IntoIterator>, - ) -> Result { + ) -> Result { Self::new_files_by_globs_str_and_kind(globs_str, Self::RejectFilesByGlob) } } @@ -267,51 +261,19 @@ impl MetadataForIndexerRules for Metadata { } impl RulePerKind { - #[deprecated = "Use `[apply_with_metadata]` instead"] async fn apply( &self, source: impl AsRef + Send, - ) -> Result<(RuleKind, bool), IndexerRuleError> { - match self { - Self::AcceptIfChildrenDirectoriesArePresent(children) => { - accept_dir_for_its_children(source, children) - .await - .map(|accepted| (RuleKind::AcceptIfChildrenDirectoriesArePresent, accepted)) - } - Self::RejectIfChildrenDirectoriesArePresent(children) => { - reject_dir_for_its_children(source, children) - .await - .map(|rejected| (RuleKind::RejectIfChildrenDirectoriesArePresent, rejected)) - } - - Self::AcceptFilesByGlob(_globs, accept_glob_set) => Ok(( - RuleKind::AcceptFilesByGlob, - accept_by_glob(source, accept_glob_set), - )), - Self::RejectFilesByGlob(_globs, reject_glob_set) => Ok(( - 
RuleKind::RejectFilesByGlob, - reject_by_glob(source, reject_glob_set), - )), - Self::IgnoredByGit(git_repo, patterns) => Ok(( - RuleKind::IgnoredByGit, - accept_by_gitpattern(source.as_ref(), git_repo, patterns), - )), - } - } - - async fn apply_with_metadata( - &self, - source: impl AsRef + Send, metadata: &impl MetadataForIndexerRules, - ) -> Result<(RuleKind, bool), IndexerRuleError> { + ) -> Result<(RuleKind, bool), Error> { match self { Self::AcceptIfChildrenDirectoriesArePresent(children) => { - accept_dir_for_its_children_with_metadata(source, metadata, children) + accept_dir_for_its_children(source, metadata, children) .await .map(|accepted| (RuleKind::AcceptIfChildrenDirectoriesArePresent, accepted)) } Self::RejectIfChildrenDirectoriesArePresent(children) => { - reject_dir_for_its_children_with_metadata(source, metadata, children) + reject_dir_for_its_children(source, metadata, children) .await .map(|rejected| (RuleKind::RejectIfChildrenDirectoriesArePresent, rejected)) } @@ -326,24 +288,32 @@ impl RulePerKind { )), Self::IgnoredByGit(base_dir, patterns) => Ok(( RuleKind::IgnoredByGit, - accept_by_gitpattern(source.as_ref(), base_dir, patterns), + accept_by_git_pattern(source, base_dir, patterns), )), } } } -fn accept_by_gitpattern(source: &Path, base_dir: &Path, search: &Search) -> bool { - let relative = source - .strip_prefix(base_dir) - .expect("`base_dir` should be our git repo, and `source` should be inside of it"); +fn accept_by_git_pattern( + source: impl AsRef, + base_dir: impl AsRef, + search: &Search, +) -> bool { + fn inner(source: &Path, base_dir: &Path, search: &Search) -> bool { + let relative = source + .strip_prefix(base_dir) + .expect("`base_dir` should be our git repo, and `source` should be inside of it"); - let Some(src) = relative.to_str().map(|s| s.as_bytes().into()) else { - return false; - }; + let Some(src) = relative.to_str().map(|s| s.as_bytes().into()) else { + return false; + }; - search - .pattern_matching_relative_path(src, Some(source.is_dir()), Case::Fold) - .map_or(true, |rule| rule.pattern.is_negative()) + search + .pattern_matching_relative_path(src, Some(source.is_dir()), Case::Fold) + .map_or(true, |rule| rule.pattern.is_negative()) + } + + inner(source.as_ref(), base_dir.as_ref(), search) } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -357,32 +327,19 @@ pub struct IndexerRule { } impl IndexerRule { - #[deprecated = "Use `[apply_with_metadata]` instead"] pub async fn apply( &self, source: impl AsRef + Send, - ) -> Result, IndexerRuleError> { - self.rules - .iter() - .map(|rule| rule.apply(source.as_ref())) - .collect::>() - .try_join() - .await - } - - pub async fn apply_with_metadata( - &self, - source: impl AsRef + Send, metadata: &impl MetadataForIndexerRules, - ) -> Result, IndexerRuleError> { + ) -> Result, Error> { async fn inner( rules: &[RulePerKind], source: &Path, metadata: &impl MetadataForIndexerRules, - ) -> Result, IndexerRuleError> { + ) -> Result, Error> { rules .iter() - .map(|rule| rule.apply_with_metadata(source, metadata)) + .map(|rule| rule.apply(source, metadata)) .collect::>() .try_join() .await @@ -390,64 +347,79 @@ impl IndexerRule { inner(&self.rules, source.as_ref(), metadata).await } - - #[deprecated = "Use `[IndexerRuler::apply_all]` instead"] - pub async fn apply_all( - rules: &[Self], - source: impl AsRef + Send, - ) -> Result>, IndexerRuleError> { - rules - .iter() - .map(|rule| rule.apply(source.as_ref())) - .collect::>() - .try_join() - .await - .map(|results| { - results.into_iter().flatten().fold( - 
HashMap::<_, Vec<_>>::with_capacity(RuleKind::variant_count()), - |mut map, (kind, result)| { - map.entry(kind).or_default().push(result); - map - }, - ) - }) - } } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RulerDecision { + Accept, + Reject, +} + +#[derive(Debug, Default, Serialize, Deserialize)] pub struct IndexerRuler { - rules: Arc>>, + base: Arc>, + extra: Vec, +} + +impl Clone for IndexerRuler { + fn clone(&self) -> Self { + Self { + base: Arc::clone(&self.base), + // Each instance of IndexerRules MUST have its own extra rules no clones allowed! + extra: Vec::new(), + } + } } impl IndexerRuler { #[must_use] pub fn new(rules: Vec) -> Self { Self { - rules: Arc::new(RwLock::new(rules)), + base: Arc::new(rules), + extra: Vec::new(), } } - pub async fn serialize(&self) -> Result, encode::Error> { - rmp_serde::to_vec_named(&*self.rules.read().await) - } + pub async fn evaluate_path( + &self, + source: impl AsRef + Send, + metadata: &impl MetadataForIndexerRules, + ) -> Result { + async fn inner( + this: &IndexerRuler, + source: &Path, + metadata: &impl MetadataForIndexerRules, + ) -> Result { + Ok( + if IndexerRuler::reject_path( + source, + metadata.is_dir(), + &this.apply_all(source, metadata).await?, + ) { + RulerDecision::Reject + } else { + RulerDecision::Accept + }, + ) + } - pub fn deserialize(data: &[u8]) -> Result { - rmp_serde::from_slice(data).map(Self::new) + inner(self, source.as_ref(), metadata).await } pub async fn apply_all( &self, source: impl AsRef + Send, metadata: &impl MetadataForIndexerRules, - ) -> Result>, IndexerRuleError> { + ) -> Result>, Error> { async fn inner( - rules: &[IndexerRule], + base: &[IndexerRule], + extra: &[IndexerRule], source: &Path, metadata: &impl MetadataForIndexerRules, - ) -> Result>, IndexerRuleError> { - rules - .iter() - .map(|rule| rule.apply_with_metadata(source, metadata)) + ) -> Result>, Error> { + base.iter() + .chain(extra.iter()) + .map(|rule| rule.apply(source, metadata)) .collect::>() .try_join() .await @@ -462,24 +434,99 @@ impl IndexerRuler { }) } - inner(&self.rules.read().await, source.as_ref(), metadata).await + inner(&self.base, &self.extra, source.as_ref(), metadata).await } /// Extend the indexer rules with the contents from an iterator of rules - pub async fn extend(&self, iter: impl IntoIterator + Send) { - let mut indexer = self.rules.write().await; - indexer.extend(iter); + pub fn extend(&mut self, iter: impl IntoIterator + Send) { + self.extra.extend(iter); } - pub async fn has_system(&self, rule: &SystemIndexerRule) -> bool { - let rules = self.rules.read().await; + #[must_use] + pub fn has_system(&self, rule: &SystemIndexerRule) -> bool { + self.base + .iter() + .chain(self.extra.iter()) + .any(|inner_rule| rule == inner_rule) + } - rules.iter().any(|inner_rule| rule == inner_rule) + #[instrument(skip_all, fields(current_path = %current_path.display()))] + fn reject_path( + current_path: &Path, + is_dir: bool, + acceptance_per_rule_kind: &HashMap>, + ) -> bool { + Self::rejected_by_reject_glob(acceptance_per_rule_kind) + || Self::rejected_by_git_ignore(acceptance_per_rule_kind) + || (is_dir && Self::rejected_by_children_directories(acceptance_per_rule_kind)) + || Self::rejected_by_accept_glob(acceptance_per_rule_kind) + } + + pub fn rejected_by_accept_glob( + acceptance_per_rule_kind: &HashMap>, + ) -> bool { + let res = acceptance_per_rule_kind + .get(&RuleKind::AcceptFilesByGlob) + .map_or(false, |accept_rules| { + accept_rules.iter().all(|accept| !accept) + }); + 
+ if res { + trace!("Reject because it didn't passed in any `RuleKind::AcceptFilesByGlob` rules"); + } + + res + } + + pub fn rejected_by_children_directories( + acceptance_per_rule_kind: &HashMap>, + ) -> bool { + let res = acceptance_per_rule_kind + .get(&RuleKind::RejectIfChildrenDirectoriesArePresent) + .map_or(false, |reject_results| { + reject_results.iter().any(|reject| !reject) + }); + + if res { + trace!("Rejected by rule `RuleKind::RejectIfChildrenDirectoriesArePresent`"); + } + + res + } + + pub fn rejected_by_reject_glob( + acceptance_per_rule_kind: &HashMap>, + ) -> bool { + let res = acceptance_per_rule_kind + .get(&RuleKind::RejectFilesByGlob) + .map_or(false, |reject_results| { + reject_results.iter().any(|reject| !reject) + }); + + if res { + trace!("Rejected by `RuleKind::RejectFilesByGlob`"); + } + + res + } + + pub fn rejected_by_git_ignore(acceptance_per_rule_kind: &HashMap>) -> bool { + let res = acceptance_per_rule_kind + .get(&RuleKind::IgnoredByGit) + .map_or(false, |reject_results| { + reject_results.iter().any(|reject| !reject) + }); + + if res { + trace!("Rejected by `RuleKind::IgnoredByGit`"); + } + + res } } impl TryFrom<&indexer_rule::Data> for IndexerRule { - type Error = IndexerRuleError; + type Error = Error; fn try_from(data: &indexer_rule::Data) -> Result { Ok(Self { @@ -497,7 +544,7 @@ impl TryFrom<&indexer_rule::Data> for IndexerRule { } impl TryFrom for IndexerRule { - type Error = IndexerRuleError; + type Error = Error; fn try_from(data: indexer_rule::Data) -> Result { Self::try_from(&data) @@ -512,140 +559,56 @@ fn reject_by_glob(source: impl AsRef, reject_glob_set: &GlobSet) -> bool { !accept_by_glob(source.as_ref(), reject_glob_set) } -#[deprecated = "Use `[accept_dir_for_its_children_with_metadata]` instead"] async fn accept_dir_for_its_children( - source: impl AsRef + Send, - children: &HashSet, -) -> Result { - let source = source.as_ref(); - - // FIXME(fogodev): Just check for io::ErrorKind::NotADirectory error instead (feature = "io_error_more", issue = "86442") - if !fs::metadata(source) - .await - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? - .is_dir() - { - return Ok(false); - } - - let mut read_dir = fs::read_dir(source) - .await // TODO: Check NotADirectory error here when available - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))?; - while let Some(entry) = read_dir - .next_entry() - .await - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? - { - let entry_name = entry - .file_name() - .to_str() - .ok_or_else(|| NonUtf8PathError(entry.path().into()))? - .to_string(); - - if entry - .metadata() - .await - .map_err(|e| { - IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))) - })? 
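// A condensed sketch of how the `rejected_by_*` helpers above combine inside
// `IndexerRuler::reject_path`: a path is rejected when any reject-glob or gitignore rule fails,
// when a directory trips a reject-if-children rule, or when accept-glob rules exist and none of
// them matched. In the per-rule vectors, `true` means "this rule accepted the path".
// `RuleKind` is redeclared here only so the sketch is self-contained:
use std::collections::HashMap;

#[derive(Hash, PartialEq, Eq)]
enum RuleKind {
    AcceptFilesByGlob,
    RejectFilesByGlob,
    RejectIfChildrenDirectoriesArePresent,
    IgnoredByGit,
}

fn reject_path(is_dir: bool, results: &HashMap<RuleKind, Vec<bool>>) -> bool {
    let any_failed = |kind: RuleKind| {
        results
            .get(&kind)
            .map_or(false, |r| r.iter().any(|accepted| !*accepted))
    };
    let none_accepted = results
        .get(&RuleKind::AcceptFilesByGlob)
        .map_or(false, |r| r.iter().all(|accepted| !*accepted));

    any_failed(RuleKind::RejectFilesByGlob)
        || any_failed(RuleKind::IgnoredByGit)
        || (is_dir && any_failed(RuleKind::RejectIfChildrenDirectoriesArePresent))
        || none_accepted
}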
- .is_dir() && children.contains(&entry_name) - { - return Ok(true); - } - } - - Ok(false) -} - -async fn accept_dir_for_its_children_with_metadata( source: impl AsRef + Send, metadata: &impl MetadataForIndexerRules, children: &HashSet, -) -> Result { - let source = source.as_ref(); - - // FIXME(fogodev): Just check for io::ErrorKind::NotADirectory error instead (feature = "io_error_more", issue = "86442") - if !metadata.is_dir() { - return Ok(false); - } - - let mut read_dir = fs::read_dir(source) - .await // TODO: Check NotADirectory error here when available - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))?; - while let Some(entry) = read_dir - .next_entry() - .await - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? - { - let entry_name = entry - .file_name() - .to_str() - .ok_or_else(|| NonUtf8PathError(entry.path().into()))? - .to_string(); - - if entry - .metadata() - .await - .map_err(|e| { - IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))) - })? - .is_dir() && children.contains(&entry_name) - { - return Ok(true); - } - } - - Ok(false) -} - -#[deprecated = "Use `[reject_dir_for_its_children_with_metadata]` instead"] -async fn reject_dir_for_its_children( - source: impl AsRef + Send, - children: &HashSet, -) -> Result { - let source = source.as_ref(); - - // FIXME(fogodev): Just check for io::ErrorKind::NotADirectory error instead (feature = "io_error_more", issue = "86442") - if !fs::metadata(source) - .await - .map_err(|e| IndexerRuleError::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? - .is_dir() - { - return Ok(true); - } - - let mut read_dir = fs::read_dir(source) - .await // TODO: Check NotADirectory error here when available - .map_err(|e| IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))))?; - while let Some(entry) = read_dir - .next_entry() - .await - .map_err(|e| IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))))? - { - if entry - .metadata() - .await - .map_err(|e| { - IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))) - })? - .is_dir() && children.contains( - entry - .file_name() - .to_str() - .ok_or_else(|| NonUtf8PathError(entry.path().into()))?, - ) { +) -> Result { + async fn inner( + source: &Path, + metadata: &impl MetadataForIndexerRules, + children: &HashSet, + ) -> Result { + // FIXME(fogodev): Just check for io::ErrorKind::NotADirectory error instead (feature = "io_error_more", issue = "86442") + if !metadata.is_dir() { return Ok(false); } + + let mut read_dir = fs::read_dir(source) + .await // TODO: Check NotADirectory error here when available + .map_err(|e| Error::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))?; + while let Some(entry) = read_dir + .next_entry() + .await + .map_err(|e| Error::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? + { + let entry_name = entry + .file_name() + .to_str() + .ok_or_else(|| NonUtf8PathError(entry.path().into()))? + .to_string(); + + if entry + .metadata() + .await + .map_err(|e| Error::AcceptByItsChildrenFileIO(FileIOError::from((source, e))))? 
+ .is_dir() && children.contains(&entry_name) + { + return Ok(true); + } + } + + Ok(false) } - Ok(true) + inner(source.as_ref(), metadata, children).await } -async fn reject_dir_for_its_children_with_metadata( +async fn reject_dir_for_its_children( source: impl AsRef + Send, metadata: &impl MetadataForIndexerRules, children: &HashSet, -) -> Result { +) -> Result { let source = source.as_ref(); // FIXME(fogodev): Just check for io::ErrorKind::NotADirectory error instead (feature = "io_error_more", issue = "86442") @@ -655,18 +618,16 @@ async fn reject_dir_for_its_children_with_metadata( let mut read_dir = fs::read_dir(source) .await // TODO: Check NotADirectory error here when available - .map_err(|e| IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))))?; + .map_err(|e| Error::RejectByItsChildrenFileIO(FileIOError::from((source, e))))?; while let Some(entry) = read_dir .next_entry() .await - .map_err(|e| IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))))? + .map_err(|e| Error::RejectByItsChildrenFileIO(FileIOError::from((source, e))))? { if entry .metadata() .await - .map_err(|e| { - IndexerRuleError::RejectByItsChildrenFileIO(FileIOError::from((source, e))) - })? + .map_err(|e| Error::RejectByItsChildrenFileIO(FileIOError::from((source, e))))? .is_dir() && children.contains( entry .file_name() @@ -710,9 +671,37 @@ mod tests { } } - async fn check_rule(indexer_rule: &IndexerRule, path: impl AsRef + Send) -> bool { + fn check_rule(indexer_rule: &IndexerRule, path: impl AsRef) -> bool { + let path = path.as_ref(); indexer_rule - .apply(path) + .rules + .iter() + .map(|rule| match rule { + RulePerKind::AcceptFilesByGlob(_globs, accept_glob_set) => ( + RuleKind::AcceptFilesByGlob, + accept_by_glob(path, accept_glob_set), + ), + RulePerKind::RejectFilesByGlob(_globs, reject_glob_set) => ( + RuleKind::RejectFilesByGlob, + reject_by_glob(path, reject_glob_set), + ), + RulePerKind::IgnoredByGit(git_repo, patterns) => ( + RuleKind::IgnoredByGit, + accept_by_git_pattern(path, git_repo, patterns), + ), + + _ => unimplemented!("can't use simple `apply` for this rule: {:?}", rule), + }) + .all(|(_kind, res)| res) + } + + async fn check_rule_with_metadata( + indexer_rule: &IndexerRule, + path: impl AsRef + Send, + metadata: &impl MetadataForIndexerRules, + ) -> bool { + indexer_rule + .apply(path.as_ref(), metadata) .await .unwrap() .into_iter() @@ -739,12 +728,12 @@ mod tests { )], ); - assert!(!check_rule(&rule, hidden).await); - assert!(check_rule(&rule, normal).await); - assert!(!check_rule(&rule, hidden_inner_dir).await); - assert!(!check_rule(&rule, hidden_inner_file).await); - assert!(check_rule(&rule, normal_inner_dir).await); - assert!(check_rule(&rule, normal_inner_file).await); + assert!(!check_rule(&rule, hidden)); + assert!(check_rule(&rule, normal)); + assert!(!check_rule(&rule, hidden_inner_dir)); + assert!(!check_rule(&rule, hidden_inner_file)); + assert!(check_rule(&rule, normal_inner_dir)); + assert!(check_rule(&rule, normal_inner_file)); } #[tokio::test] @@ -765,9 +754,9 @@ mod tests { )], ); - assert!(check_rule(&rule, project_file).await); - assert!(!check_rule(&rule, project_build_dir).await); - assert!(!check_rule(&rule, project_build_dir_inner).await); + assert!(check_rule(&rule, project_file)); + assert!(!check_rule(&rule, project_build_dir)); + assert!(!check_rule(&rule, project_build_dir_inner)); } #[tokio::test] @@ -795,16 +784,16 @@ mod tests { )], ); - assert!(!check_rule(&rule, text).await); - assert!(check_rule(&rule, 
png).await); - assert!(check_rule(&rule, jpg).await); - assert!(check_rule(&rule, jpeg).await); - assert!(!check_rule(&rule, inner_text).await); - assert!(check_rule(&rule, inner_png).await); - assert!(check_rule(&rule, inner_jpg).await); - assert!(check_rule(&rule, inner_jpeg).await); - assert!(!check_rule(&rule, many_inner_dirs_text).await); - assert!(check_rule(&rule, many_inner_dirs_png).await); + assert!(!check_rule(&rule, text)); + assert!(check_rule(&rule, png)); + assert!(check_rule(&rule, jpg)); + assert!(check_rule(&rule, jpeg)); + assert!(!check_rule(&rule, inner_text)); + assert!(check_rule(&rule, inner_png)); + assert!(check_rule(&rule, inner_jpg)); + assert!(check_rule(&rule, inner_jpeg)); + assert!(!check_rule(&rule, many_inner_dirs_text)); + assert!(check_rule(&rule, many_inner_dirs_png)); } #[tokio::test] @@ -833,9 +822,22 @@ mod tests { )], ); - assert!(check_rule(&rule, project1).await); - assert!(check_rule(&rule, project2).await); - assert!(!check_rule(&rule, not_project).await); + assert!( + !check_rule_with_metadata(&rule, &project1, &fs::metadata(&project1).await.unwrap()) + .await + ); + assert!( + !check_rule_with_metadata(&rule, &project2, &fs::metadata(&project2).await.unwrap()) + .await + ); + assert!( + check_rule_with_metadata( + &rule, + ¬_project, + &fs::metadata(¬_project).await.unwrap() + ) + .await + ); } #[tokio::test] @@ -864,9 +866,22 @@ mod tests { )], ); - assert!(!check_rule(&rule, project1).await); - assert!(!check_rule(&rule, project2).await); - assert!(check_rule(&rule, not_project).await); + assert!( + !check_rule_with_metadata(&rule, &project1, &fs::metadata(&project1).await.unwrap()) + .await + ); + assert!( + !check_rule_with_metadata(&rule, &project2, &fs::metadata(&project2).await.unwrap()) + .await + ); + assert!( + check_rule_with_metadata( + &rule, + ¬_project, + &fs::metadata(¬_project).await.unwrap() + ) + .await + ); } impl PartialEq for RulePerKind { diff --git a/core/crates/indexer-rules/src/seed.rs b/core/crates/indexer-rules/src/seed.rs index 960e344db..173514630 100644 --- a/core/crates/indexer-rules/src/seed.rs +++ b/core/crates/indexer-rules/src/seed.rs @@ -1,25 +1,24 @@ -use std::path::{Path, PathBuf}; - -use futures_concurrency::future::Join; -use gix_ignore::{glob::search::pattern::List, search::Ignore, Search}; use sd_prisma::prisma::{indexer_rule, PrismaClient}; +use std::path::{Path, PathBuf}; + use chrono::Utc; -use thiserror::Error; +use futures_concurrency::future::Join; +use gix_ignore::{glob::search::pattern::List, search::Ignore, Search}; +use once_cell::sync::Lazy; use tokio::fs; use uuid::Uuid; -use super::{IndexerRule, IndexerRuleError, RulePerKind}; -use once_cell::sync::Lazy; +use super::{Error, IndexerRule, RulePerKind}; -#[derive(Error, Debug)] +#[derive(thiserror::Error, Debug)] pub enum SeederError { #[error("Failed to run indexer rules seeder: {0}")] - IndexerRules(#[from] IndexerRuleError), + IndexerRules(#[from] Error), #[error("An error occurred with the database while applying migrations: {0}")] DatabaseError(#[from] prisma_client_rust::QueryError), #[error("Failed to parse indexer rules based on external system")] - InhirentedExternalRules, + InheritedExternalRules, } #[derive(Debug)] @@ -29,7 +28,7 @@ pub struct GitIgnoreRules { impl GitIgnoreRules { pub async fn get_rules_if_in_git_repo( - library_root: &Path, + location_root: &Path, current: &Path, ) -> Option> { let mut git_repo = None; @@ -38,7 +37,7 @@ impl GitIgnoreRules { for ancestor in current .ancestors() - .take_while(|&path| 
path.starts_with(library_root)) + .take_while(|&path| path.starts_with(location_root)) { let git_ignore = ancestor.join(".gitignore"); @@ -54,13 +53,16 @@ impl GitIgnoreRules { } let git_repo = git_repo?; - Some(Self::parse_gitrepo(git_repo, ignores).await) + Some(Self::parse_git_repo(git_repo, ignores).await) } - async fn parse_gitrepo(git_repo: &Path, gitignores: Vec) -> Result { + async fn parse_git_repo( + git_repo: &Path, + git_ignores: Vec, + ) -> Result { let mut search = Search::default(); - let gitignores = gitignores + let git_ignores = git_ignores .into_iter() .map(Self::parse_git_ignore) .collect::>() @@ -68,7 +70,7 @@ impl GitIgnoreRules { .await; search .patterns - .extend(gitignores.into_iter().filter_map(Result::ok)); + .extend(git_ignores.into_iter().filter_map(Result::ok)); let git_exclude_rules = Self::parse_git_exclude(git_repo.join(".git")).await; if let Ok(rules) = git_exclude_rules { @@ -86,11 +88,11 @@ impl GitIgnoreRules { if let Ok(Some(patterns)) = List::from_file(gitignore, None, true, &mut buf) { Ok(patterns) } else { - Err(SeederError::InhirentedExternalRules) + Err(SeederError::InheritedExternalRules) } }) .await - .map_err(|_| SeederError::InhirentedExternalRules)? + .map_err(|_| SeederError::InheritedExternalRules)? } async fn parse_git_exclude(dot_git: PathBuf) -> Result>, SeederError> { @@ -98,10 +100,10 @@ impl GitIgnoreRules { let mut buf = Vec::new(); Search::from_git_dir(dot_git.as_ref(), None, &mut buf) .map(|search| search.patterns) - .map_err(|_| SeederError::InhirentedExternalRules) + .map_err(|_| SeederError::InheritedExternalRules) }) .await - .map_err(|_| SeederError::InhirentedExternalRules)? + .map_err(|_| SeederError::InheritedExternalRules)? } async fn is_git_repo(path: &Path) -> bool { @@ -179,8 +181,8 @@ pub async fn new_or_existing_library(db: &PrismaClient) -> Result<(), SeederErro .into_iter() .enumerate() { - let pub_id = sd_utils::uuid_to_bytes(Uuid::from_u128(i as u128)); - let rules = rmp_serde::to_vec_named(&rule.rules).map_err(IndexerRuleError::from)?; + let pub_id = sd_utils::uuid_to_bytes(&Uuid::from_u128(i as u128)); + let rules = rmp_serde::to_vec_named(&rule.rules).map_err(Error::from)?; let data = vec![ name::set(Some(rule.name.to_string())), diff --git a/core/crates/prisma-helpers/Cargo.toml b/core/crates/prisma-helpers/Cargo.toml index 7c32c6db2..66d1be763 100644 --- a/core/crates/prisma-helpers/Cargo.toml +++ b/core/crates/prisma-helpers/Cargo.toml @@ -9,7 +9,10 @@ edition = { workspace = true } [dependencies] # Spacedrive Sub-crates sd-prisma = { path = "../../../crates/prisma" } +sd-utils = { path = "../../../crates/utils" } # Workspace dependencies prisma-client-rust = { workspace = true } -serde = { workspace = true } +serde = { workspace = true, features = ["derive"] } +specta = { workspace = true } +uuid = { workspace = true, features = ["v4", "serde"] } diff --git a/core/crates/prisma-helpers/src/lib.rs b/core/crates/prisma-helpers/src/lib.rs index 8f1fe20b9..958bb7703 100644 --- a/core/crates/prisma-helpers/src/lib.rs +++ b/core/crates/prisma-helpers/src/lib.rs @@ -29,8 +29,16 @@ #![allow(clippy::missing_errors_doc, clippy::module_name_repetitions)] use sd_prisma::prisma::{file_path, job, label, location, object}; +use sd_utils::{from_bytes_to_uuid, uuid_to_bytes}; + +use std::{borrow::Cow, fmt}; + +use serde::{Deserialize, Serialize}; +use specta::Type; +use uuid::Uuid; // File Path selectables! 
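// A rough, synchronous sketch of the ancestor walk in `GitIgnoreRules::get_rules_if_in_git_repo`
// above: collect every `.gitignore` between the scanned path and the location root, stopping once
// the enclosing git repository is found. Checking for a `.git` directory is an assumption of this
// sketch standing in for the crate's `is_git_repo` helper:
use std::path::{Path, PathBuf};

fn collect_gitignores(location_root: &Path, current: &Path) -> Option<(PathBuf, Vec<PathBuf>)> {
    let mut ignores = Vec::new();

    for ancestor in current
        .ancestors()
        .take_while(|path| path.starts_with(location_root))
    {
        let gitignore = ancestor.join(".gitignore");
        if gitignore.is_file() {
            ignores.push(gitignore);
        }

        if ancestor.join(".git").is_dir() {
            // Repository root found: these are the ignore rules governing `current`.
            return Some((ancestor.to_path_buf(), ignores));
        }
    }

    // `current` is not inside a git repository within this location; nothing to inherit.
    None
}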
+file_path::select!(file_path_id { id }); file_path::select!(file_path_pub_id { pub_id }); file_path::select!(file_path_pub_and_cas_ids { id pub_id cas_id }); file_path::select!(file_path_just_pub_id_materialized_path { @@ -62,7 +70,10 @@ file_path::select!(file_path_for_media_processor { name extension cas_id - object_id + object: select { + id + pub_id + } }); file_path::select!(file_path_to_isolate { location_id @@ -137,6 +148,11 @@ file_path::select!(file_path_to_full_path { path } }); +file_path::select!(file_path_to_create_object { + id + pub_id + date_created +}); // File Path includes! file_path::include!(file_path_with_object { object }); @@ -157,6 +173,7 @@ file_path::include!(file_path_for_frontend { }); // Object selectables! +object::select!(object_ids { id pub_id }); object::select!(object_for_file_identifier { pub_id file_paths: select { pub_id cas_id extension is_dir materialized_path name } @@ -222,6 +239,14 @@ job::select!(job_without_data { date_estimated_completion }); +// Location selectables! +location::select!(location_ids_and_path { + id + pub_id + instance_id + path +}); + // Location includes! location::include!(location_with_indexer_rules { indexer_rules: select { indexer_rule } @@ -284,3 +309,220 @@ label::include!((take: i64) => label_with_objects { } } }); + +#[derive(Debug, Serialize, Deserialize, Hash, PartialEq, Eq, Type)] +#[serde(transparent)] +pub struct CasId<'cas_id>(Cow<'cas_id, str>); + +impl Clone for CasId<'_> { + fn clone(&self) -> CasId<'static> { + CasId(Cow::Owned(self.0.clone().into_owned())) + } +} + +impl<'cas_id> CasId<'cas_id> { + #[must_use] + pub fn as_str(&self) -> &str { + self.0.as_ref() + } + + #[must_use] + pub fn to_owned(&self) -> CasId<'static> { + CasId(Cow::Owned(self.0.clone().into_owned())) + } + + #[must_use] + pub fn into_owned(self) -> CasId<'static> { + CasId(Cow::Owned(self.0.clone().into_owned())) + } +} + +impl From<&CasId<'_>> for file_path::cas_id::Type { + fn from(CasId(cas_id): &CasId<'_>) -> Self { + Some(cas_id.clone().into_owned()) + } +} + +impl<'cas_id> From<&'cas_id str> for CasId<'cas_id> { + fn from(cas_id: &'cas_id str) -> Self { + Self(Cow::Borrowed(cas_id)) + } +} + +impl<'cas_id> From<&'cas_id String> for CasId<'cas_id> { + fn from(cas_id: &'cas_id String) -> Self { + Self(Cow::Borrowed(cas_id)) + } +} + +impl From for CasId<'static> { + fn from(cas_id: String) -> Self { + Self(cas_id.into()) + } +} + +impl From> for String { + fn from(CasId(cas_id): CasId<'_>) -> Self { + cas_id.into_owned() + } +} + +impl From<&CasId<'_>> for String { + fn from(CasId(cas_id): &CasId<'_>) -> Self { + cas_id.clone().into_owned() + } +} + +#[derive(Debug, Serialize, Deserialize, Hash, PartialEq, Eq, Clone)] +#[serde(transparent)] +#[repr(transparent)] +pub struct FilePathPubId(PubId); + +#[derive(Debug, Serialize, Deserialize, Hash, PartialEq, Eq, Clone)] +#[serde(transparent)] +#[repr(transparent)] +pub struct ObjectPubId(PubId); + +#[derive(Debug, Serialize, Deserialize, Hash, PartialEq, Eq, Clone)] +enum PubId { + Uuid(Uuid), + Vec(Vec), +} + +impl PubId { + fn new() -> Self { + Self::Uuid(Uuid::new_v4()) + } + + fn to_db(&self) -> Vec { + match self { + Self::Uuid(uuid) => uuid_to_bytes(uuid), + Self::Vec(bytes) => bytes.clone(), + } + } +} + +impl Default for PubId { + fn default() -> Self { + Self::new() + } +} + +impl From for PubId { + fn from(uuid: Uuid) -> Self { + Self::Uuid(uuid) + } +} + +impl From> for PubId { + fn from(bytes: Vec) -> Self { + Self::Vec(bytes) + } +} + +impl From<&Vec> for PubId { + fn 
from(bytes: &Vec) -> Self { + Self::Vec(bytes.clone()) + } +} + +impl From<&[u8]> for PubId { + fn from(bytes: &[u8]) -> Self { + Self::Vec(bytes.to_vec()) + } +} + +impl From for Vec { + fn from(pub_id: PubId) -> Self { + match pub_id { + PubId::Uuid(uuid) => uuid_to_bytes(&uuid), + PubId::Vec(bytes) => bytes, + } + } +} + +impl From for Uuid { + fn from(pub_id: PubId) -> Self { + match pub_id { + PubId::Uuid(uuid) => uuid, + PubId::Vec(bytes) => from_bytes_to_uuid(&bytes), + } + } +} + +impl fmt::Display for PubId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Uuid(uuid) => write!(f, "{uuid}"), + Self::Vec(bytes) => write!(f, "{}", from_bytes_to_uuid(bytes)), + } + } +} + +macro_rules! delegate_pub_id { + ($($type_name:ty),+ $(,)?) => { + $( + impl From<::uuid::Uuid> for $type_name { + fn from(uuid: ::uuid::Uuid) -> Self { + Self(uuid.into()) + } + } + + impl From> for $type_name { + fn from(bytes: Vec) -> Self { + Self(bytes.into()) + } + } + + impl From<&Vec> for $type_name { + fn from(bytes: &Vec) -> Self { + Self(bytes.into()) + } + } + + impl From<&[u8]> for $type_name { + fn from(bytes: &[u8]) -> Self { + Self(bytes.into()) + } + } + + impl From<$type_name> for Vec { + fn from(pub_id: $type_name) -> Self { + pub_id.0.into() + } + } + + impl From<$type_name> for ::uuid::Uuid { + fn from(pub_id: $type_name) -> Self { + pub_id.0.into() + } + } + + impl ::std::fmt::Display for $type_name { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + write!(f, "{}", self.0) + } + } + + impl $type_name { + #[must_use] + pub fn new() -> Self { + Self(PubId::new()) + } + + #[must_use] + pub fn to_db(&self) -> Vec { + self.0.to_db() + } + } + + impl Default for $type_name { + fn default() -> Self { + Self::new() + } + } + )+ + }; +} + +delegate_pub_id!(FilePathPubId, ObjectPubId); diff --git a/core/crates/sync/src/ingest.rs b/core/crates/sync/src/ingest.rs index 779fef741..176a004c4 100644 --- a/core/crates/sync/src/ingest.rs +++ b/core/crates/sync/src/ingest.rs @@ -114,10 +114,10 @@ impl Actor { } State::Ingesting(event) => { debug!( - "ingesting {} operations: {} to {}", - event.messages.len(), - event.messages.first().unwrap().3.timestamp.as_u64(), - event.messages.last().unwrap().3.timestamp.as_u64(), + messages_count = event.messages.len(), + first_message = event.messages.first().unwrap().3.timestamp.as_u64(), + last_message = event.messages.last().unwrap().3.timestamp.as_u64(), + "Ingesting operations;", ); for (instance, data) in event.messages.0 { diff --git a/core/crates/sync/src/manager.rs b/core/crates/sync/src/manager.rs index e75a90c08..c06029210 100644 --- a/core/crates/sync/src/manager.rs +++ b/core/crates/sync/src/manager.rs @@ -175,7 +175,7 @@ impl Manager { .crdt_operation() .find_many(vec![ crdt_operation::instance::is(vec![instance::pub_id::equals(uuid_to_bytes( - instance_uuid, + &instance_uuid, ))]), crdt_operation::timestamp::gt(timestamp.as_u64() as i64), ]) @@ -204,7 +204,7 @@ impl Manager { .map(|(instance_id, timestamp)| { prisma_client_rust::and![ $op::instance::is(vec![instance::pub_id::equals(uuid_to_bytes( - *instance_id + instance_id ))]), $op::timestamp::gt(timestamp.as_u64() as i64) ] @@ -216,7 +216,7 @@ impl Manager { .clocks .iter() .map(|(instance_id, _)| { - uuid_to_bytes(*instance_id) + uuid_to_bytes(instance_id) }) .collect() ) @@ -263,7 +263,7 @@ impl Manager { .map(|(instance_id, timestamp)| { prisma_client_rust::and![ $op::instance::is(vec![instance::pub_id::equals(uuid_to_bytes( - 
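// A simplified, self-contained stand-in for the `PubId` enum above, which backs `FilePathPubId`
// and `ObjectPubId`: it holds either a parsed `Uuid` (for new rows) or the raw bytes exactly as
// they came out of the database, converting only on demand. Plain 16-byte UUID bytes are assumed
// here in place of `sd_utils::uuid_to_bytes`:
use uuid::Uuid;

enum PubId {
    Uuid(Uuid),
    Vec(Vec<u8>),
}

impl PubId {
    fn new() -> Self {
        Self::Uuid(Uuid::new_v4())
    }

    fn to_db(&self) -> Vec<u8> {
        match self {
            Self::Uuid(uuid) => uuid.as_bytes().to_vec(),
            Self::Vec(bytes) => bytes.clone(),
        }
    }
}

fn main() {
    let fresh = PubId::new(); // a brand new row gets a v4 UUID
    let from_db = PubId::Vec(fresh.to_db()); // a loaded row keeps the database bytes as-is
    assert_eq!(fresh.to_db(), from_db.to_db());
}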
*instance_id + instance_id ))]), $op::timestamp::gt(timestamp.as_u64() as i64) ] @@ -275,7 +275,7 @@ impl Manager { .clocks .iter() .map(|(instance_id, _)| { - uuid_to_bytes(*instance_id) + uuid_to_bytes(instance_id) }) .collect() ) diff --git a/core/crates/sync/tests/lib.rs b/core/crates/sync/tests/lib.rs index fa5ee0e2f..03555953f 100644 --- a/core/crates/sync/tests/lib.rs +++ b/core/crates/sync/tests/lib.rs @@ -30,11 +30,11 @@ async fn write_test_location( ( instance.sync.shared_create( prisma_sync::location::SyncId { - pub_id: uuid_to_bytes(id), + pub_id: uuid_to_bytes(&id), }, sync_ops, ), - instance.db.location().create(uuid_to_bytes(id), db_ops), + instance.db.location().create(uuid_to_bytes(&id), db_ops), ) }) .await?) diff --git a/core/crates/sync/tests/mock_instance.rs b/core/crates/sync/tests/mock_instance.rs index 513833dfc..5a4408c43 100644 --- a/core/crates/sync/tests/mock_instance.rs +++ b/core/crates/sync/tests/mock_instance.rs @@ -36,7 +36,7 @@ impl Instance { db.instance() .create( - uuid_to_bytes(id), + uuid_to_bytes(&id), vec![], vec![], Utc::now().into(), @@ -73,7 +73,7 @@ impl Instance { left.db .instance() .create( - uuid_to_bytes(right.id), + uuid_to_bytes(&right.id), vec![], vec![], Utc::now().into(), diff --git a/core/src/api/backups.rs b/core/src/api/backups.rs index 50db04ad2..de3bb1deb 100644 --- a/core/src/api/backups.rs +++ b/core/src/api/backups.rs @@ -150,15 +150,19 @@ async fn start_backup(node: Arc, library: Arc) -> Uuid { match do_backup(bkp_id, &node, &library).await { Ok(path) => { info!( - "Backup '{bkp_id}' for library '{}' created at '{path:?}'!", - library.id + backup_id = %bkp_id, + library_id = %library.id, + path = %path.display(), + "Backup created!;", ); invalidate_query!(library, "backups.getAll"); } Err(e) => { error!( - "Error with backup '{bkp_id}' for library '{}': {e:?}", - library.id + backup_id = %bkp_id, + library_id = %library.id, + ?e, + "Error with backup for library;", ); // TODO: Alert user something went wrong @@ -282,10 +286,10 @@ async fn do_backup(id: Uuid, node: &Node, library: &Library) -> Result, path: PathBuf) { match restore_backup(&node, &path).await { Ok(Header { id, library_id, .. 
}) => { - info!("Restored to '{id}' for library '{library_id}'!",); + info!(%id, %library_id, "Restored backup for library!"); } Err(e) => { - error!("Error restoring backup '{}': {e:#?}", path.display()); + error!(path = %path.display(), ?e, "Error restoring backup;"); // TODO: Alert user something went wrong } diff --git a/core/src/api/cloud.rs b/core/src/api/cloud.rs index 3bd40fb23..ecffa5fe8 100644 --- a/core/src/api/cloud.rs +++ b/core/src/api/cloud.rs @@ -155,9 +155,9 @@ mod library { &library.db, &library.sync, &node.libraries, - instance.uuid, + &instance.uuid, instance.identity, - instance.node_id, + &instance.node_id, RemoteIdentity::from_str(&instance.node_remote_identity) .expect("malformed remote identity in the DB"), instance.metadata, @@ -304,8 +304,8 @@ mod locations { .body(ByteStream::from_body_0_4(Full::from("Hello, world!"))) .send() .await - .map_err(|err| { - tracing::error!("S3 error: {err:?}"); + .map_err(|e| { + tracing::error!(?e, "S3 error;"); rspc::Error::new( rspc::ErrorCode::InternalServerError, "Failed to upload to S3".to_string(), diff --git a/core/src/api/ephemeral_files.rs b/core/src/api/ephemeral_files.rs index f349d8578..c2cc85a52 100644 --- a/core/src/api/ephemeral_files.rs +++ b/core/src/api/ephemeral_files.rs @@ -7,11 +7,13 @@ use crate::{ library::Library, object::{ fs::{error::FileSystemJobsError, find_available_filename_for_duplicate}, - media::exif_metadata_extractor::{can_extract_exif_data_for_image, extract_exif_data}, + // media::exif_metadata_extractor::{can_extract_exif_data_for_image, extract_exif_data}, }, }; use sd_core_file_path_helper::IsolatedFilePathData; +use sd_core_heavy_lifting::media_processor::exif_media_data; + use sd_file_ext::{ extensions::{Extension, ImageExtension}, kind::ObjectKind, @@ -64,18 +66,18 @@ pub(crate) fn mount() -> AlphaRouter { }; let image_extension = ImageExtension::from_str(extension).map_err(|e| { - error!("Failed to parse image extension: {e:#?}"); + error!(?e, "Failed to parse image extension;"); rspc::Error::new( ErrorCode::BadRequest, "Invalid image extension".to_string(), ) })?; - if !can_extract_exif_data_for_image(&image_extension) { + if !exif_media_data::can_extract(image_extension) { return Ok(None); } - let exif_data = extract_exif_data(full_path) + let exif_data = exif_media_data::extract(full_path) .await .map_err(|e| { rspc::Error::with_cause( @@ -91,7 +93,7 @@ pub(crate) fn mount() -> AlphaRouter { Some(v) if v == ObjectKind::Audio || v == ObjectKind::Video => { let ffmpeg_data = MediaData::FFmpeg( FFmpegMetadata::from_path(full_path).await.map_err(|e| { - error!("{e:#?}"); + error!(?e, "Failed to extract ffmpeg metadata;"); rspc::Error::with_cause( ErrorCode::InternalServerError, e.to_string(), @@ -206,14 +208,15 @@ pub(crate) fn mount() -> AlphaRouter { )) })?; - Ok(()) + Ok::<_, rspc::Error>(()) } Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()), Err(e) => Err(FileIOError::from(( path, e, "Failed to get file metadata for deletion", - ))), + )) + .into()), } }) .collect::>() @@ -384,9 +387,10 @@ pub(crate) fn mount() -> AlphaRouter { fs::rename(&old_path, &new_path).await.map_err(|e| { error!( - "Failed to rename file from: '{}' to: '{}'; Error: {e:#?}", - old_path.display(), - new_path.display() + old_path = %old_path.display(), + new_path = %new_path.display(), + ?e, + "Failed to rename file;", ); let e = FileIOError::from((old_path, e, "Failed to rename file")); rspc::Error::with_cause(ErrorCode::Conflict, e.to_string(), e) @@ -493,7 +497,7 @@ impl EphemeralFileSystemOps { let 
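// A minimal before/after sketch of the logging migration in these hunks: values move out of the
// message string and into `tracing` structured fields, where `%` captures with `Display` and `?`
// captures with `Debug`. The function and message text are illustrative only:
use std::{io, path::Path};
use tracing::error;

fn log_failure(path: &Path, e: &io::Error) {
    // Before: everything baked into the message text.
    error!("Failed to read '{}': {e:#?}", path.display());

    // After: the path and error travel as structured fields that subscribers can filter on.
    error!(path = %path.display(), ?e, "Failed to read file;");
}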
target = target_dir.join(name); Some((source, target)) } else { - warn!("Skipping file with no name: '{}'", source.display()); + warn!(source = %source.display(), "Skipping file with no name;"); None } }) @@ -615,7 +619,7 @@ impl EphemeralFileSystemOps { let target = target_dir.join(name); Some((source, target)) } else { - warn!("Skipping file with no name: '{}'", source.display()); + warn!(source = %source.display(), "Skipping file with no name;"); None } }) diff --git a/core/src/api/files.rs b/core/src/api/files.rs index bd6f6eab8..9a512fddc 100644 --- a/core/src/api/files.rs +++ b/core/src/api/files.rs @@ -9,12 +9,13 @@ use crate::{ old_copy::OldFileCopierJobInit, old_cut::OldFileCutterJobInit, old_delete::OldFileDeleterJobInit, old_erase::OldFileEraserJobInit, }, - media::{exif_media_data_from_prisma_data, ffmpeg_data_from_prisma_data}, + // media::{exif_media_data_from_prisma_data, ffmpeg_data_from_prisma_data}, }, - old_job::Job, + old_job::OldJob, }; use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData}; +use sd_core_heavy_lifting::media_processor::{exif_media_data, ffmpeg_media_data}; use sd_core_prisma_helpers::{ file_path_to_isolate, file_path_to_isolate_with_id, object_with_file_paths, object_with_media_data, @@ -127,13 +128,13 @@ pub(crate) fn mount() -> AlphaRouter { .and_then(|obj| { Some(match obj.kind { Some(v) if v == ObjectKind::Image as i32 => MediaData::Exif( - exif_media_data_from_prisma_data(obj.exif_data?), + exif_media_data::from_prisma_data(obj.exif_data?), ), Some(v) if v == ObjectKind::Audio as i32 || v == ObjectKind::Video as i32 => { - MediaData::FFmpeg(ffmpeg_data_from_prisma_data( + MediaData::FFmpeg(ffmpeg_media_data::from_prisma_data( obj.ffmpeg_data?, )) } @@ -476,8 +477,8 @@ pub(crate) fn mount() -> AlphaRouter { Ok(()) => Ok(()), Err(e) if e.kind() == io::ErrorKind::NotFound => { warn!( - "File not found in the file system, will remove from database: {}", - full_path.display() + path = %full_path.display(), + "File not found in the file system, will remove from database;", ); library .db @@ -495,7 +496,7 @@ pub(crate) fn mount() -> AlphaRouter { } } } - _ => Job::new(args) + _ => OldJob::new(args) .spawn(&node, &library) .await .map_err(Into::into), @@ -560,7 +561,7 @@ pub(crate) fn mount() -> AlphaRouter { Ok(()) } - _ => Job::new(args) + _ => OldJob::new(args) .spawn(&node, &library) .await .map_err(Into::into), @@ -642,10 +643,11 @@ pub(crate) fn mount() -> AlphaRouter { }) .await .map_err(|e| { - error!("{e:#?}"); - rspc::Error::new( + error!(?e, "Failed to convert image;"); + rspc::Error::with_cause( ErrorCode::InternalServerError, "Had an internal problem converting image".to_string(), + e, ) })??; @@ -706,7 +708,7 @@ pub(crate) fn mount() -> AlphaRouter { .procedure("eraseFiles", { R.with2(library()) .mutation(|(node, library), args: OldFileEraserJobInit| async move { - Job::new(args) + OldJob::new(args) .spawn(&node, &library) .await .map_err(Into::into) @@ -715,7 +717,7 @@ pub(crate) fn mount() -> AlphaRouter { .procedure("copyFiles", { R.with2(library()) .mutation(|(node, library), args: OldFileCopierJobInit| async move { - Job::new(args) + OldJob::new(args) .spawn(&node, &library) .await .map_err(Into::into) @@ -724,7 +726,7 @@ pub(crate) fn mount() -> AlphaRouter { .procedure("cutFiles", { R.with2(library()) .mutation(|(node, library), args: OldFileCutterJobInit| async move { - Job::new(args) + OldJob::new(args) .spawn(&node, &library) .await .map_err(Into::into) @@ -878,10 +880,11 @@ pub(crate) fn mount() -> AlphaRouter { } 
else { fs::rename(&from, &to).await.map_err(|e| { error!( - "Failed to rename file from: '{}' to: '{}'; Error: {e:#?}", - from.display(), - to.display() - ); + from = %from.display(), + to = %to.display(), + ?e, + "Failed to rename file;", + ); rspc::Error::with_cause( ErrorCode::Conflict, "Failed to rename file".to_string(), diff --git a/core/src/api/jobs.rs b/core/src/api/jobs.rs index 6c3f57d6b..12a95b6ae 100644 --- a/core/src/api/jobs.rs +++ b/core/src/api/jobs.rs @@ -1,21 +1,22 @@ use crate::{ + context::NodeContext, invalidate_query, location::{find_location, LocationError}, - object::{ - media::OldMediaProcessorJobInit, - old_file_identifier::old_file_identifier_job::OldFileIdentifierJobInit, - validation::old_validator_job::OldObjectValidatorJobInit, - }, - old_job::{Job, JobReport, JobStatus, OldJobs}, + object::validation::old_validator_job::OldObjectValidatorJobInit, + old_job::{JobStatus, OldJob, OldJobReport}, }; -use sd_core_prisma_helpers::job_without_data; +use sd_core_heavy_lifting::{ + file_identifier::FileIdentifier, job_system::report, media_processor::job::MediaProcessor, + JobId, JobSystemError, Report, +}; use sd_prisma::prisma::{job, location, SortOrder}; use std::{ collections::{hash_map::Entry, BTreeMap, HashMap, VecDeque}, path::PathBuf, + sync::Arc, time::Instant, }; @@ -30,6 +31,8 @@ use uuid::Uuid; use super::{utils::library, CoreEvent, Ctx, R}; +const TEN_MINUTES: Duration = Duration::from_secs(60 * 10); + pub(crate) fn mount() -> AlphaRouter { R.router() .procedure("progress", { @@ -41,7 +44,7 @@ pub(crate) fn mount() -> AlphaRouter { .subscription(|(node, _), _: ()| async move { let mut event_bus_rx = node.event_bus.0.subscribe(); // debounce per-job - let mut intervals = BTreeMap::::new(); + let mut intervals = BTreeMap::::new(); async_stream::stream! { loop { @@ -62,6 +65,9 @@ pub(crate) fn mount() -> AlphaRouter { yield progress_event; *instant = Instant::now(); + + // remove stale jobs that didn't receive a progress for more than 10 minutes + intervals.retain(|_, instant| instant.elapsed() < TEN_MINUTES); } } }) @@ -73,44 +79,53 @@ pub(crate) fn mount() -> AlphaRouter { // this is to ensure the client will always get the correct initial state // - jobs are sorted in to groups by their action // - TODO: refactor grouping system to a many-to-many table - #[derive(Debug, Clone, Serialize, Deserialize, Type)] + #[derive(Debug, Clone, Serialize, Type)] pub struct JobGroup { - id: Uuid, + id: JobId, + running_job_id: Option, action: Option, - status: JobStatus, + status: report::Status, created_at: DateTime, - jobs: VecDeque, + jobs: VecDeque, } R.with2(library()) .query(|(node, library), _: ()| async move { let mut groups: HashMap = HashMap::new(); - let job_reports: Vec = library + let job_reports: Vec = library .db .job() .find_many(vec![]) .order_by(job::date_created::order(SortOrder::Desc)) .take(100) - .select(job_without_data::select()) .exec() .await? 
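// Roughly the shape of the per-job debounce in the `jobs.progress` subscription above, including
// the new cleanup of entries that stayed silent for more than `TEN_MINUTES`. A sketch only:
// `u64` stands in for `JobId` and the debounce window is made an explicit parameter:
use std::{
    collections::BTreeMap,
    time::{Duration, Instant},
};

const TEN_MINUTES: Duration = Duration::from_secs(60 * 10);

fn should_forward(intervals: &mut BTreeMap<u64, Instant>, job_id: u64, debounce: Duration) -> bool {
    let forward = intervals
        .get(&job_id)
        .map_or(true, |last| last.elapsed() > debounce);

    if forward {
        intervals.insert(job_id, Instant::now());
    }

    // Remove stale jobs that didn't report progress for more than ten minutes.
    intervals.retain(|_, last| last.elapsed() < TEN_MINUTES);

    forward
}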
.into_iter() - .flat_map(JobReport::try_from) + .flat_map(|job| { + if let Ok(report) = Report::try_from(job.clone()) { + Some(report) + } else { + // TODO(fogodev): this is a temporary fix for the old job system + OldJobReport::try_from(job).map(Into::into).ok() + } + }) .collect(); - let active_reports_by_id = node.old_jobs.get_active_reports_with_id().await; + let mut active_reports_by_id = node.job_system.get_active_reports().await; + active_reports_by_id.extend( + node.old_jobs + .get_active_reports_with_id() + .await + .into_iter() + .map(|(id, old_report)| (id, old_report.into())), + ); for job in job_reports { // action name and group key are computed from the job data - let (action_name, group_key) = job.get_meta(); + let (action_name, group_key) = job.get_action_name_and_group_key(); - trace!( - "job {:#?}, action_name {}, group_key {:?}", - job, - action_name, - group_key - ); + trace!(?job, %action_name, ?group_key); // if the job is running, use the in-memory report let report = active_reports_by_id.get(&job.id).unwrap_or(&job); @@ -122,7 +137,10 @@ pub(crate) fn mount() -> AlphaRouter { Entry::Vacant(entry) => { entry.insert(JobGroup { id: job.parent_id.unwrap_or(job.id), - action: Some(action_name.clone()), + running_job_id: (job.status == report::Status::Running + || job.status == report::Status::Paused) + .then_some(job.id), + action: Some(action_name), status: job.status, jobs: [report.clone()].into_iter().collect(), created_at: job.created_at.unwrap_or(Utc::now()), @@ -132,8 +150,10 @@ pub(crate) fn mount() -> AlphaRouter { Entry::Occupied(mut entry) => { let group = entry.get_mut(); - // protect paused status from being overwritten - if report.status != JobStatus::Paused { + if report.status == report::Status::Running + || report.status == report::Status::Paused + { + group.running_job_id = Some(report.id); group.status = report.status; } @@ -146,6 +166,7 @@ pub(crate) fn mount() -> AlphaRouter { job.id.to_string(), JobGroup { id: job.id, + running_job_id: Some(job.id), action: None, status: job.status, jobs: [report.clone()].into_iter().collect(), @@ -164,7 +185,14 @@ pub(crate) fn mount() -> AlphaRouter { .procedure("isActive", { R.with2(library()) .query(|(node, library), _: ()| async move { - Ok(node.old_jobs.has_active_workers(library.id).await) + let library_id = library.id; + Ok(node + .job_system + .has_active_jobs(NodeContext { + node: Arc::clone(&node), + library, + }) + .await || node.old_jobs.has_active_workers(library_id).await) }) }) .procedure("clear", { @@ -204,30 +232,56 @@ pub(crate) fn mount() -> AlphaRouter { // pause job .procedure("pause", { R.with2(library()) - .mutation(|(node, library), id: Uuid| async move { - let ret = OldJobs::pause(&node.old_jobs, id).await.map_err(Into::into); + .mutation(|(node, library), job_id: JobId| async move { + if let Err(e) = node.job_system.pause(job_id).await { + if matches!(e, JobSystemError::NotFound(_)) { + // If the job is not found, it can be a job from the old job system + node.old_jobs.pause(job_id).await?; + } else { + return Err(e.into()); + } + } + + invalidate_query!(library, "jobs.isActive"); invalidate_query!(library, "jobs.reports"); - ret + + Ok(()) }) }) .procedure("resume", { R.with2(library()) - .mutation(|(node, library), id: Uuid| async move { - let ret = OldJobs::resume(&node.old_jobs, id) - .await - .map_err(Into::into); + .mutation(|(node, library), job_id: JobId| async move { + if let Err(e) = node.job_system.resume(job_id).await { + if matches!(e, JobSystemError::NotFound(_)) { + // If 
the job is not found, it can be a job from the old job system + node.old_jobs.resume(job_id).await?; + } else { + return Err(e.into()); + } + } + + invalidate_query!(library, "jobs.isActive"); invalidate_query!(library, "jobs.reports"); - ret + + Ok(()) }) }) .procedure("cancel", { R.with2(library()) - .mutation(|(node, library), id: Uuid| async move { - let ret = OldJobs::cancel(&node.old_jobs, id) - .await - .map_err(Into::into); + .mutation(|(node, library), job_id: JobId| async move { + if let Err(e) = node.job_system.cancel(job_id).await { + if matches!(e, JobSystemError::NotFound(_)) { + // If the job is not found, it can be a job from the old job system + node.old_jobs.cancel(job_id).await?; + } else { + return Err(e.into()); + } + } + + invalidate_query!(library, "jobs.isActive"); invalidate_query!(library, "jobs.reports"); - ret + + Ok(()) }) }) .procedure("generateThumbsForLocation", { @@ -250,50 +304,50 @@ pub(crate) fn mount() -> AlphaRouter { return Err(LocationError::IdNotFound(id).into()); }; - Job::new(OldMediaProcessorJobInit { - location, - sub_path: Some(path), - regenerate_thumbnails: regenerate, - regenerate_labels: false, - }) - .spawn(&node, &library) - .await - .map_err(Into::into) - }, - ) - }) - .procedure("generateLabelsForLocation", { - #[derive(Type, Deserialize)] - pub struct GenerateLabelsForLocationArgs { - pub id: location::id::Type, - pub path: PathBuf, - #[serde(default)] - pub regenerate: bool, - } - - R.with2(library()).mutation( - |(node, library), - GenerateLabelsForLocationArgs { - id, - path, - regenerate, - }: GenerateLabelsForLocationArgs| async move { - let Some(location) = find_location(&library, id).exec().await? else { - return Err(LocationError::IdNotFound(id).into()); - }; - - Job::new(OldMediaProcessorJobInit { - location, - sub_path: Some(path), - regenerate_thumbnails: false, - regenerate_labels: regenerate, - }) - .spawn(&node, &library) - .await - .map_err(Into::into) + node.job_system + .dispatch( + MediaProcessor::new(location, Some(path), regenerate)?, + id, + NodeContext { + node: Arc::clone(&node), + library, + }, + ) + .await + .map_err(Into::into) }, ) }) + // .procedure("generateLabelsForLocation", { + // #[derive(Type, Deserialize)] + // pub struct GenerateLabelsForLocationArgs { + // pub id: location::id::Type, + // pub path: PathBuf, + // #[serde(default)] + // pub regenerate: bool, + // } + // R.with2(library()).mutation( + // |(node, library), + // GenerateLabelsForLocationArgs { + // id, + // path, + // regenerate, + // }: GenerateLabelsForLocationArgs| async move { + // let Some(location) = find_location(&library, id).exec().await? else { + // return Err(LocationError::IdNotFound(id).into()); + // }; + // OldJob::new(OldMediaProcessorJobInit { + // location, + // sub_path: Some(path), + // regenerate_thumbnails: false, + // regenerate_labels: regenerate, + // }) + // .spawn(&node, &library) + // .await + // .map_err(Into::into) + // }, + // ) + // }) .procedure("objectValidator", { #[derive(Type, Deserialize)] pub struct ObjectValidatorArgs { @@ -307,7 +361,7 @@ pub(crate) fn mount() -> AlphaRouter { return Err(LocationError::IdNotFound(args.id).into()); }; - Job::new(OldObjectValidatorJobInit { + OldJob::new(OldObjectValidatorJobInit { location, sub_path: Some(args.path), }) @@ -324,18 +378,22 @@ pub(crate) fn mount() -> AlphaRouter { } R.with2(library()).mutation( - |(node, library), args: IdentifyUniqueFilesArgs| async move { - let Some(location) = find_location(&library, args.id).exec().await? 
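// A sketch of the transition pattern shared by `pause`, `resume` and `cancel` above while both
// job systems coexist: ask the new job system first and fall back to the old one only when the
// id is unknown to it. The types below are hypothetical stand-ins; only the control flow mirrors
// the patch:
enum JobSystemError {
    NotFound(u64),
    Other(String),
}

struct NewJobSystem;
struct OldJobs;

impl NewJobSystem {
    fn pause(&self, job_id: u64) -> Result<(), JobSystemError> {
        // Pretend this job predates the new system.
        Err(JobSystemError::NotFound(job_id))
    }
}

impl OldJobs {
    fn pause(&self, _job_id: u64) -> Result<(), String> {
        Ok(())
    }
}

fn pause_any(new: &NewJobSystem, old: &OldJobs, job_id: u64) -> Result<(), String> {
    match new.pause(job_id) {
        Ok(()) => Ok(()),
        // Unknown to the new system: it may be a job from the old job system.
        Err(JobSystemError::NotFound(_)) => old.pause(job_id),
        Err(JobSystemError::Other(e)) => Err(e),
    }
}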
else { - return Err(LocationError::IdNotFound(args.id).into()); + |(node, library), IdentifyUniqueFilesArgs { id, path }: IdentifyUniqueFilesArgs| async move { + let Some(location) = find_location(&library, id).exec().await? else { + return Err(LocationError::IdNotFound(id).into()); }; - Job::new(OldFileIdentifierJobInit { - location, - sub_path: Some(args.path), - }) - .spawn(&node, &library) - .await - .map_err(Into::into) + node.job_system + .dispatch( + FileIdentifier::new(location, Some(path))?, + id, + NodeContext { + node: Arc::clone(&node), + library, + }, + ) + .await + .map_err(Into::into) }, ) }) diff --git a/core/src/api/labels.rs b/core/src/api/labels.rs index 0e5249c59..9aaaf30e3 100644 --- a/core/src/api/labels.rs +++ b/core/src/api/labels.rs @@ -1,8 +1,7 @@ -use crate::{ - invalidate_query, library::Library, object::media::old_thumbnail::get_indexed_thumb_key, -}; +use crate::{invalidate_query, library::Library}; -use sd_core_prisma_helpers::label_with_objects; +use sd_core_heavy_lifting::media_processor::ThumbKey; +use sd_core_prisma_helpers::{label_with_objects, CasId}; use sd_prisma::{ prisma::{label, label_on_object, object, SortOrder}, @@ -49,7 +48,9 @@ pub(crate) fn mount() -> AlphaRouter { file_path_data .cas_id .as_ref() - .map(|cas_id| get_indexed_thumb_key(cas_id, library.id)) + .map(CasId::from) + .map(CasId::into_owned) + .map(|cas_id| ThumbKey::new_indexed(cas_id, library.id)) }) // Filter out None values and transform each element to Vec> .collect::>(), // Collect into Vec>> }) diff --git a/core/src/api/libraries.rs b/core/src/api/libraries.rs index 4d63b9b69..49d964cb0 100644 --- a/core/src/api/libraries.rs +++ b/core/src/api/libraries.rs @@ -8,6 +8,7 @@ use crate::{ use futures::StreamExt; use prisma_client_rust::raw; +use sd_core_heavy_lifting::JobId; use sd_file_ext::kind::ObjectKind; use sd_p2p::RemoteIdentity; use sd_prisma::prisma::{indexer_rule, object, statistics}; @@ -106,7 +107,7 @@ pub(crate) fn mount() -> AlphaRouter { match STATISTICS_UPDATERS.lock().await.entry(library.id) { Entry::Occupied(entry) => { if entry.get().send(Instant::now()).await.is_err() { - error!("Failed to send statistics update request"); + error!("Failed to send statistics update request;"); } } Entry::Vacant(entry) => { @@ -181,13 +182,13 @@ pub(crate) fn mount() -> AlphaRouter { }: DefaultLocations, node: Arc, library: Arc, - ) -> Result<(), rspc::Error> { + ) -> Result, rspc::Error> { // If all of them are false, we skip if [!desktop, !documents, !downloads, !pictures, !music, !videos] .into_iter() .all(identity) { - return Ok(()); + return Ok(None); } let Some(default_locations_paths) = UserDirs::new() else { @@ -242,7 +243,7 @@ pub(crate) fn mount() -> AlphaRouter { .await .map_err(rspc::Error::from)? 
else { - return Ok(()); + return Ok(None); }; let scan_state = ScanState::try_from(location.scan_state)?; @@ -271,7 +272,7 @@ pub(crate) fn mount() -> AlphaRouter { }) .fold(&mut maybe_error, |maybe_error, res| { if let Err(e) = res { - error!("Failed to create default location: {e:#?}"); + error!(?e, "Failed to create default location;"); *maybe_error = Some(e); } maybe_error @@ -283,7 +284,7 @@ pub(crate) fn mount() -> AlphaRouter { debug!("Created default locations"); - Ok(()) + Ok(None) } R.mutation( @@ -296,7 +297,7 @@ pub(crate) fn mount() -> AlphaRouter { let library = node.libraries.create(name, None, &node).await?; - debug!("Created library {}", library.id); + debug!(%library.id, "Created library;"); if let Some(locations) = default_locations { create_default_locations_on_library_creation( @@ -381,16 +382,19 @@ pub(crate) fn mount() -> AlphaRouter { for _ in 0..5 { match library.db._execute_raw(raw!("VACUUM;")).exec().await { Ok(_) => break, - Err(err) => { + Err(e) => { warn!( - "Failed to vacuum DB for library '{}', retrying...: {err:#?}", - library.id + %library.id, + ?e, + "Failed to vacuum DB for library, retrying...;", ); tokio::time::sleep(Duration::from_millis(500)).await; } } } - info!("Successfully vacuumed DB for library '{}'", library.id); + + info!(%library.id, "Successfully vacuumed DB;"); + Ok(()) }), ) @@ -421,7 +425,7 @@ async fn update_statistics_loop( Message::Tick => { if last_received_at.elapsed() < FIVE_MINUTES { if let Err(e) = update_library_statistics(&node, &library).await { - error!("Failed to update library statistics: {e:#?}"); + error!(?e, "Failed to update library statistics;"); } else { invalidate_query!(&library, "library.statistics"); } diff --git a/core/src/api/locations.rs b/core/src/api/locations.rs index 24d56e52b..a5d11fa3a 100644 --- a/core/src/api/locations.rs +++ b/core/src/api/locations.rs @@ -1,16 +1,15 @@ use crate::{ invalidate_query, location::{ - delete_location, find_location, indexer::OldIndexerJobInit, light_scan_location, - non_indexed::NonIndexedPathItem, relink_location, scan_location, scan_location_sub_path, - LocationCreateArgs, LocationError, LocationUpdateArgs, ScanState, + delete_location, find_location, light_scan_location, non_indexed::NonIndexedPathItem, + relink_location, scan_location, scan_location_sub_path, LocationCreateArgs, LocationError, + LocationUpdateArgs, ScanState, }, - object::old_file_identifier::old_file_identifier_job::OldFileIdentifierJobInit, - old_job::StatefulJob, p2p::PeerMetadata, util::AbortOnDrop, }; +use sd_core_heavy_lifting::{media_processor::ThumbKey, JobName}; use sd_core_indexer_rules::IndexerRuleCreateArgs; use sd_core_prisma_helpers::{ file_path_for_frontend, label_with_objects, location_with_indexer_rules, object_with_file_paths, @@ -29,28 +28,24 @@ use tracing::{debug, error}; use super::{utils::library, Ctx, R}; -// it includes the shard hex formatted as ([["f02", "cab34a76fbf3469f"]]) -// Will be None if no thumbnail exists -pub type ThumbnailKey = Vec; - #[derive(Serialize, Type, Debug)] #[serde(tag = "type")] pub enum ExplorerItem { Path { // provide the frontend with the thumbnail key explicitly - thumbnail: Option, + thumbnail: Option, // this tells the frontend if a thumbnail actually exists or not has_created_thumbnail: bool, // we can't actually modify data from PCR types, thats why computed properties are used on ExplorerItem item: Box, }, Object { - thumbnail: Option, + thumbnail: Option, has_created_thumbnail: bool, item: object_with_file_paths::Data, }, NonIndexedPath { - 
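// The shape of the bounded retry loop around `VACUUM;` above: up to five attempts with a 500 ms
// pause between failures. A synchronous sketch with `run_vacuum` as a hypothetical stand-in for
// `library.db._execute_raw(raw!("VACUUM;")).exec().await`:
use std::{thread, time::Duration};

fn vacuum_with_retries(mut run_vacuum: impl FnMut() -> Result<(), String>) {
    for attempt in 1..=5 {
        match run_vacuum() {
            Ok(()) => break,
            Err(e) => {
                eprintln!("vacuum attempt {attempt} failed: {e}; retrying...");
                thread::sleep(Duration::from_millis(500));
            }
        }
    }
}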
thumbnail: Option, + thumbnail: Option, has_created_thumbnail: bool, item: NonIndexedPathItem, }, @@ -61,7 +56,7 @@ pub enum ExplorerItem { item: PeerMetadata, }, Label { - thumbnails: Vec, + thumbnails: Vec, item: label_with_objects::Data, }, } @@ -347,7 +342,7 @@ pub(crate) fn mount() -> AlphaRouter { .exec() .await?; - debug!("Disconnected {count} file paths from objects"); + debug!(%count, "Disconnected file paths from objects;"); // library.orphan_remover.invoke().await; } @@ -409,13 +404,15 @@ pub(crate) fn mount() -> AlphaRouter { sub_path, }: LightScanArgs| async move { if node - .old_jobs - .has_job_running(|job_identity| { - job_identity.target_location == location_id - && (job_identity.name == ::NAME - || job_identity.name - == ::NAME) - }) + .job_system + .check_running_jobs( + vec![ + JobName::Indexer, + JobName::FileIdentifier, + JobName::MediaProcessor, + ], + location_id, + ) .await { return Err(rspc::Error::new( @@ -433,7 +430,7 @@ pub(crate) fn mount() -> AlphaRouter { let handle = tokio::spawn(async move { if let Err(e) = light_scan_location(node, library, location, sub_path).await { - error!("light scan error: {e:#?}"); + error!(?e, "Light scan error;"); } }); diff --git a/core/src/api/mod.rs b/core/src/api/mod.rs index eb5ab5fc9..7330cbd2b 100644 --- a/core/src/api/mod.rs +++ b/core/src/api/mod.rs @@ -8,6 +8,7 @@ use crate::{ Node, }; +use sd_core_heavy_lifting::media_processor::ThumbKey; use sd_p2p::RemoteIdentity; use sd_prisma::prisma::file_path; @@ -54,7 +55,7 @@ pub type Router = rspc::Router; #[derive(Debug, Clone, Serialize, Type)] pub enum CoreEvent { NewThumbnail { - thumb_key: Vec, + thumb_key: ThumbKey, }, NewIdentifiedObjects { file_path_ids: Vec, @@ -175,7 +176,7 @@ pub(crate) fn mount() -> Arc { .await .map(|_| true) } - .map_err(|err| rspc::Error::new(ErrorCode::InternalServerError, err.to_string()))?; + .map_err(|e| rspc::Error::new(ErrorCode::InternalServerError, e.to_string()))?; match feature { BackendFeature::CloudSync => { diff --git a/core/src/api/nodes.rs b/core/src/api/nodes.rs index 083055298..996f8a502 100644 --- a/core/src/api/nodes.rs +++ b/core/src/api/nodes.rs @@ -82,8 +82,9 @@ pub(crate) fn mount() -> AlphaRouter { new_model = sd_ai::old_image_labeler::YoloV8::model(Some(&version)) .map_err(|e| { error!( - "Failed to crate image_detection model: '{}'; Error: {e:#?}", - &version, + %version, + ?e, + "Failed to crate image_detection model;", ); }) .ok(); @@ -94,8 +95,8 @@ pub(crate) fn mount() -> AlphaRouter { } }) .await - .map_err(|err| { - error!("Failed to write config: {}", err); + .map_err(|e| { + error!(?e, "Failed to write config;"); rspc::Error::new( ErrorCode::InternalServerError, "error updating config".into(), @@ -186,21 +187,14 @@ pub(crate) fn mount() -> AlphaRouter { pub background_processing_percentage: u8, // 0-100 } R.mutation( - |node, - UpdateThumbnailerPreferences { - background_processing_percentage, - }: UpdateThumbnailerPreferences| async move { + |node, UpdateThumbnailerPreferences { .. 
}: UpdateThumbnailerPreferences| async move { node.config - .update_preferences(|preferences| { - preferences - .thumbnailer - .set_background_processing_percentage( - background_processing_percentage, - ); + .update_preferences(|_| { + // TODO(fogodev): introduce configurable workers count to task system }) .await .map_err(|e| { - error!("failed to update thumbnailer preferences: {e:#?}"); + error!(?e, "Failed to update thumbnailer preferences;"); rspc::Error::with_cause( ErrorCode::InternalServerError, "Failed to update thumbnailer preferences".to_string(), diff --git a/core/src/api/notifications.rs b/core/src/api/notifications.rs index 738523dfd..ff8b7b076 100644 --- a/core/src/api/notifications.rs +++ b/core/src/api/notifications.rs @@ -56,12 +56,12 @@ pub(crate) fn mount() -> AlphaRouter { .find_many(vec![]) .exec() .await - .map_err(|err| { + .map_err(|e| { rspc::Error::new( ErrorCode::InternalServerError, format!( "Failed to get notifications for library '{}': {}", - library.id, err + library.id, e ), ) })? @@ -69,12 +69,12 @@ pub(crate) fn mount() -> AlphaRouter { .map(|n| { Ok(Notification { id: NotificationId::Library(library.id, n.id as u32), - data: rmp_serde::from_slice(&n.data).map_err(|err| { + data: rmp_serde::from_slice(&n.data).map_err(|e| { rspc::Error::new( ErrorCode::InternalServerError, format!( "Failed to get notifications for library '{}': {}", - library.id, err + library.id, e ), ) })?, @@ -108,8 +108,8 @@ pub(crate) fn mount() -> AlphaRouter { .delete_many(vec![notification::id::equals(id as i32)]) .exec() .await - .map_err(|err| { - rspc::Error::new(ErrorCode::InternalServerError, err.to_string()) + .map_err(|e| { + rspc::Error::new(ErrorCode::InternalServerError, e.to_string()) })?; } NotificationId::Node(id) => { @@ -119,8 +119,8 @@ pub(crate) fn mount() -> AlphaRouter { .retain(|n| n.id != NotificationId::Node(id)); }) .await - .map_err(|err| { - rspc::Error::new(ErrorCode::InternalServerError, err.to_string()) + .map_err(|e| { + rspc::Error::new(ErrorCode::InternalServerError, e.to_string()) })?; } } @@ -135,9 +135,7 @@ pub(crate) fn mount() -> AlphaRouter { cfg.notifications = vec![]; }) .await - .map_err(|err| { - rspc::Error::new(ErrorCode::InternalServerError, err.to_string()) - })?; + .map_err(|e| rspc::Error::new(ErrorCode::InternalServerError, e.to_string()))?; join_all( node.libraries diff --git a/core/src/api/p2p.rs b/core/src/api/p2p.rs index 3d23d0374..472e0e464 100644 --- a/core/src/api/p2p.rs +++ b/core/src/api/p2p.rs @@ -89,20 +89,20 @@ pub(crate) fn mount() -> AlphaRouter { ))? 
.new_stream() .await - .map_err(|err| { + .map_err(|e| { rspc::Error::new( ErrorCode::InternalServerError, - format!("error in peer.new_stream: {:?}", err), + format!("error in peer.new_stream: {:?}", e), ) })?; stream .write_all(&Header::Ping.to_bytes()) .await - .map_err(|err| { + .map_err(|e| { rspc::Error::new( ErrorCode::InternalServerError, - format!("error sending ping header: {:?}", err), + format!("error sending ping header: {:?}", e), ) })?; diff --git a/core/src/api/search/mod.rs b/core/src/api/search/mod.rs index a17d97997..90abc8a1d 100644 --- a/core/src/api/search/mod.rs +++ b/core/src/api/search/mod.rs @@ -2,12 +2,12 @@ use crate::{ api::{locations::ExplorerItem, utils::library}, library::Library, location::{non_indexed, LocationError}, - object::media::old_thumbnail::get_indexed_thumb_key, util::{unsafe_streamed_query, BatchedStream}, }; use prisma_client_rust::Operator; -use sd_core_prisma_helpers::{file_path_for_frontend, object_with_file_paths}; +use sd_core_heavy_lifting::media_processor::ThumbKey; +use sd_core_prisma_helpers::{file_path_for_frontend, object_with_file_paths, CasId}; use sd_prisma::prisma::{self, PrismaClient}; use std::path::PathBuf; @@ -217,21 +217,23 @@ pub fn mount() -> AlphaRouter { let mut items = Vec::with_capacity(file_paths.len()); for file_path in file_paths { - let has_created_thumbnail = if let Some(cas_id) = &file_path.cas_id { - library - .thumbnail_exists(&node, cas_id) - .await - .map_err(LocationError::from)? - } else { - false - }; + let has_created_thumbnail = + if let Some(cas_id) = file_path.cas_id.as_ref().map(CasId::from) { + library + .thumbnail_exists(&node, &cas_id) + .await + .map_err(LocationError::from)? + } else { + false + }; items.push(ExplorerItem::Path { thumbnail: file_path .cas_id .as_ref() - // .filter(|_| thumbnail_exists_locally) - .map(|i| get_indexed_thumb_key(i, library.id)), + .map(CasId::from) + .map(CasId::into_owned) + .map(|cas_id| ThumbKey::new_indexed(cas_id, library.id)), has_created_thumbnail, item: Box::new(file_path), }) @@ -332,9 +334,11 @@ pub fn mount() -> AlphaRouter { .file_paths .iter() .map(|fp| fp.cas_id.as_ref()) - .find_map(|c| c); + .find_map(|c| c) + .map(CasId::from) + .map(|cas_id| cas_id.to_owned()); - let has_created_thumbnail = if let Some(cas_id) = cas_id { + let has_created_thumbnail = if let Some(cas_id) = &cas_id { library.thumbnail_exists(&node, cas_id).await.map_err(|e| { rspc::Error::with_cause( ErrorCode::InternalServerError, @@ -348,8 +352,7 @@ pub fn mount() -> AlphaRouter { items.push(ExplorerItem::Object { thumbnail: cas_id - // .filter(|_| thumbnail_exists_locally) - .map(|cas_id| get_indexed_thumb_key(cas_id, library.id)), + .map(|cas_id| ThumbKey::new_indexed(cas_id, library.id)), item: object, has_created_thumbnail, }); diff --git a/core/src/api/search/saved.rs b/core/src/api/search/saved.rs index 266455a64..4836fef82 100644 --- a/core/src/api/search/saved.rs +++ b/core/src/api/search/saved.rs @@ -82,7 +82,7 @@ pub(crate) fn mount() -> AlphaRouter { // https://docs.rs/serde/latest/serde/de/struct.IgnoredAny.html if let Err(e) = serde_json::from_str::(&s) { - error!("failed to parse filters: {e:#?}"); + error!(?e, "Failed to parse filters;"); None } else { Some(s) diff --git a/core/src/api/tags.rs b/core/src/api/tags.rs index 4e534c832..b951368f2 100644 --- a/core/src/api/tags.rs +++ b/core/src/api/tags.rs @@ -221,7 +221,7 @@ pub(crate) fn mount() -> AlphaRouter { .iter() .filter(|fp| fp.is_dir.unwrap_or_default() && fp.object.is_none()) .map(|fp| { - let id = 
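The search hunk above now wraps raw `cas_id` strings in `CasId` and builds thumbnails with `ThumbKey::new_indexed(cas_id, library.id)`. The definitions below are illustrative guesses at the borrowed-then-owned shape implied by `.map(CasId::from).map(CasId::into_owned)`, not the real `sd_core_prisma_helpers` / `media_processor` types; `uuid` with the `v4` feature is assumed:

```rust
use std::borrow::Cow;
use uuid::Uuid;

#[derive(Debug, Clone)]
struct CasId<'a>(Cow<'a, str>);

impl<'a> From<&'a String> for CasId<'a> {
    fn from(s: &'a String) -> Self {
        Self(Cow::Borrowed(s))
    }
}

impl CasId<'_> {
    // Detach from the borrowed query result so the key can outlive it.
    fn into_owned(self) -> CasId<'static> {
        CasId(Cow::Owned(self.0.into_owned()))
    }
}

#[derive(Debug)]
struct ThumbKey {
    cas_id: CasId<'static>,
    library_id: Uuid,
}

impl ThumbKey {
    fn new_indexed(cas_id: CasId<'static>, library_id: Uuid) -> Self {
        Self { cas_id, library_id }
    }
}

fn main() {
    let cas_id_column: Option<String> = Some("b3f1c2".to_string());
    let library_id = Uuid::new_v4();

    let thumbnail = cas_id_column
        .as_ref()
        .map(CasId::from)
        .map(CasId::into_owned)
        .map(|cas_id| ThumbKey::new_indexed(cas_id, library_id));

    println!("{thumbnail:?}");
}
```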
uuid_to_bytes(Uuid::new_v4()); + let id = uuid_to_bytes(&Uuid::new_v4()); sync_params.extend(sync.shared_create( prisma_sync::object::SyncId { pub_id: id.clone() }, diff --git a/core/src/api/utils/invalidate.rs b/core/src/api/utils/invalidate.rs index e29cb0994..8df2eea6d 100644 --- a/core/src/api/utils/invalidate.rs +++ b/core/src/api/utils/invalidate.rs @@ -132,6 +132,19 @@ impl InvalidRequests { #[macro_export] // #[allow(clippy::crate_in_macro_def)] macro_rules! invalidate_query { + + ($ctx:expr, $query:ident) => {{ + let ctx: &$crate::library::Library = &$ctx; // Assert the context is the correct type + let query: &'static str = $query; + + ::tracing::trace!(target: "sd_core::invalidate-query", "invalidate_query!(\"{}\") at {}", query, concat!(file!(), ":", line!())); + + // The error are ignored here because they aren't mission critical. If they fail the UI might be outdated for a bit. + ctx.emit($crate::api::CoreEvent::InvalidateOperation( + $crate::api::utils::InvalidateOperationEvent::dangerously_create(query, serde_json::Value::Null, None) + )) + }}; + ($ctx:expr, $key:literal) => {{ let ctx: &$crate::library::Library = &$ctx; // Assert the context is the correct type @@ -324,8 +337,12 @@ pub(crate) fn mount_invalidate() -> AlphaRouter { ) => { let key = match to_key(&(key, arg)) { Ok(key) => key, - Err(err) => { - warn!("Error deriving key for invalidate operation '{:?}': {:?}", first_event, err); + Err(e) => { + warn!( + ?first_event, + ?e, + "Error deriving key for invalidate operation;" + ); continue; } }; @@ -345,7 +362,10 @@ pub(crate) fn mount_invalidate() -> AlphaRouter { } event = event_bus_rx.recv() => { let Ok(event) = event else { - warn!("Shutting down invalidation manager thread due to the core event bus being dropped!"); + warn!( + "Shutting down invalidation manager thread \ + due to the core event bus being dropped!" + ); break; }; @@ -359,8 +379,12 @@ pub(crate) fn mount_invalidate() -> AlphaRouter { Ok(key) => { buf.insert(key, op); }, - Err(err) => { - warn!("Error deriving key for invalidate operation '{:?}': {:?}", op, err); + Err(e) => { + warn!( + ?op, + ?e, + "Error deriving key for invalidate operation;", + ); }, } }, @@ -383,7 +407,10 @@ pub(crate) fn mount_invalidate() -> AlphaRouter { Ok(_) => {} // All receivers are shutdown means that all clients are disconnected. Err(_) => { - debug!("Shutting down invalidation manager! This is normal if all clients disconnects."); + debug!( + "Shutting down invalidation manager! \ + This is normal if all clients disconnects." + ); manager_thread_active.swap(false, Ordering::Relaxed); break; } diff --git a/core/src/cloud/sync/ingest.rs b/core/src/cloud/sync/ingest.rs index 8801ebceb..de1a98603 100644 --- a/core/src/cloud/sync/ingest.rs +++ b/core/src/cloud/sync/ingest.rs @@ -42,7 +42,6 @@ pub async fn run_actor( break; } Request::Messages { timestamps, .. 
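`invalidate_query!` gains an `$query:ident` arm above so callers can pass a variable holding a `&'static str` rather than only a string literal. A stripped-down sketch of that dispatch, with a plain `Vec` standing in for the library event bus:

```rust
// `macro_rules!` cannot bind a runtime variable with a `$key:literal` matcher,
// so callers that hold the query name in a variable need their own arm.
macro_rules! invalidate_query {
    ($bus:expr, $query:ident) => {{
        let query: &'static str = $query; // an identifier bound to a &'static str
        $bus.push(query);
    }};
    ($bus:expr, $key:literal) => {{
        let query: &'static str = $key; // a string literal, checked at compile time
        $bus.push(query);
    }};
}

fn main() {
    let mut emitted: Vec<&'static str> = Vec::new();

    invalidate_query!(emitted, "library.list");

    let dynamic_query = "search.paths";
    invalidate_query!(emitted, dynamic_query);

    assert_eq!(emitted, ["library.list", "search.paths"]);
}
```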
} => timestamps, - _ => continue, }; let (ops_ids, ops): (Vec<_>, Vec<_>) = err_break!( @@ -60,10 +59,10 @@ pub async fn run_actor( } debug!( - "Sending {} messages ({:?} to {:?}) to ingester", - ops.len(), - ops.first().map(|operation| operation.timestamp.as_u64()), - ops.last().map(|operation| operation.timestamp.as_u64()), + messages_count = ops.len(), + first_message = ?ops.first().map(|operation| operation.timestamp.as_u64()), + last_message = ?ops.last().map(|operation| operation.timestamp.as_u64()), + "Sending messages to ingester", ); let (wait_tx, wait_rx) = tokio::sync::oneshot::channel::<()>(); diff --git a/core/src/cloud/sync/mod.rs b/core/src/cloud/sync/mod.rs index 5b12e6578..95a110ef0 100644 --- a/core/src/cloud/sync/mod.rs +++ b/core/src/cloud/sync/mod.rs @@ -97,7 +97,7 @@ macro_rules! err_break { match $e { Ok(d) => d, Err(e) => { - tracing::error!("{e}"); + tracing::error!(?e); break; } } diff --git a/core/src/cloud/sync/receive.rs b/core/src/cloud/sync/receive.rs index 8c62e018c..85354e89c 100644 --- a/core/src/cloud/sync/receive.rs +++ b/core/src/cloud/sync/receive.rs @@ -56,7 +56,7 @@ pub async fn run_actor( .map(|id| { db.cloud_crdt_operation() .find_first(vec![cloud_crdt_operation::instance::is(vec![ - instance::pub_id::equals(uuid_to_bytes(*id)), + instance::pub_id::equals(uuid_to_bytes(id)), ])]) .order_by(cloud_crdt_operation::timestamp::order( SortOrder::Desc, @@ -76,8 +76,10 @@ pub async fn run_actor( let cloud_timestamp = d.map(|d| d.timestamp).unwrap_or_default() as u64; debug!( - "Instance {id}, Sync Timestamp {}, Cloud Timestamp {cloud_timestamp}", - sync_timestamp.as_u64() + instance_id = %id, + sync_timestamp = sync_timestamp.as_u64(), + %cloud_timestamp, + "Comparing sync timestamps", ); let max_timestamp = Ord::max(cloud_timestamp, sync_timestamp.as_u64()); @@ -118,7 +120,10 @@ pub async fn run_actor( .await ); - info!("Received {} collections", collections.len()); + info!( + collections_count = collections.len(), + "Received collections;", + ); if collections.is_empty() { break; @@ -165,9 +170,9 @@ pub async fn run_actor( &db, &sync, &libraries, - collection.instance_uuid, + &collection.instance_uuid, instance.identity, - instance.node_id, + &instance.node_id, RemoteIdentity::from_str(&instance.node_remote_identity) .expect("malformed remote identity in the DB"), node.p2p.peer_metadata(), @@ -185,14 +190,10 @@ pub async fn run_actor( let operations = compressed_operations.into_ops(); debug!( - "Processing collection. 
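The receive actor above compares its newest locally stored timestamp for an instance against the newest one the cloud reports and resumes from the larger of the two (`Ord::max`). The same selection in isolation, with plain `u64`s standing in for the sync timestamps:

```rust
fn resume_point(local_sync_timestamp: u64, cloud_timestamp: u64) -> u64 {
    Ord::max(cloud_timestamp, local_sync_timestamp)
}

fn main() {
    // Local database is ahead of what the cloud has acknowledged.
    assert_eq!(resume_point(120, 95), 120);
    // Cloud knows about newer operations than we have ingested.
    assert_eq!(resume_point(80, 140), 140);
}
```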
Instance {}, Start {:?}, End {:?}", - &collection.instance_uuid, - operations - .first() - .map(|operation| operation.timestamp.as_u64()), - operations - .last() - .map(|operation| operation.timestamp.as_u64()), + instance_id = %collection.instance_uuid, + start = ?operations.first().map(|operation| operation.timestamp.as_u64()), + end = ?operations.last().map(|operation| operation.timestamp.as_u64()), + "Processing collection", ); err_break!(write_cloud_ops_to_db(operations, &db).await); @@ -247,9 +248,9 @@ pub async fn upsert_instance( db: &PrismaClient, sync: &sd_core_sync::Manager, libraries: &Libraries, - uuid: Uuid, + uuid: &Uuid, identity: RemoteIdentity, - node_id: Uuid, + node_id: &Uuid, node_remote_identity: RemoteIdentity, metadata: HashMap, ) -> prisma_client_rust::Result<()> { @@ -276,7 +277,7 @@ pub async fn upsert_instance( .exec() .await?; - sync.timestamps.write().await.entry(uuid).or_default(); + sync.timestamps.write().await.entry(*uuid).or_default(); // Called again so the new instances are picked up libraries.update_instances_by_id(library_id).await; diff --git a/core/src/cloud/sync/send.rs b/core/src/cloud/sync/send.rs index e4744f306..872b92c70 100644 --- a/core/src/cloud/sync/send.rs +++ b/core/src/cloud/sync/send.rs @@ -52,8 +52,8 @@ pub async fn run_actor( use sd_cloud_api::library::message_collections::do_add; debug!( - "Preparing to send {} instances' operations to cloud", - req_adds.len() + total_operations = req_adds.len(), + "Preparing to send instance's operations to cloud;" ); // gets new operations for each instance to send to cloud @@ -84,10 +84,7 @@ pub async fn run_actor( use base64::prelude::*; - debug!( - "Instance {}: {} to {}", - req_add.instance_uuid, start_time, end_time - ); + debug!(instance_id = %req_add.instance_uuid, %start_time, %end_time); instances.push(do_add::Input { uuid: req_add.instance_uuid, diff --git a/core/src/context.rs b/core/src/context.rs new file mode 100644 index 000000000..f47ff5aae --- /dev/null +++ b/core/src/context.rs @@ -0,0 +1,229 @@ +use crate::{api::CoreEvent, invalidate_query, library::Library, old_job::JobProgressEvent, Node}; + +use sd_core_heavy_lifting::{ + job_system::report::{Report, Status}, + OuterContext, ProgressUpdate, UpdateEvent, +}; + +use std::{ + ops::{Deref, DerefMut}, + sync::{ + atomic::{AtomicU8, Ordering}, + Arc, + }, +}; + +use chrono::{DateTime, Utc}; +use tokio::{spawn, sync::RwLock}; +use tracing::{error, trace}; +use uuid::Uuid; + +#[derive(Clone)] +pub struct NodeContext { + pub node: Arc, + pub library: Arc, +} + +pub trait NodeContextExt: sealed::Sealed { + fn library(&self) -> &Arc; +} + +mod sealed { + pub trait Sealed {} +} + +impl sealed::Sealed for NodeContext {} + +impl NodeContextExt for NodeContext { + fn library(&self) -> &Arc { + &self.library + } +} + +impl OuterContext for NodeContext { + fn id(&self) -> Uuid { + self.library.id + } + + fn db(&self) -> &Arc { + &self.library.db + } + + fn sync(&self) -> &Arc { + &self.library.sync + } + + fn invalidate_query(&self, query: &'static str) { + invalidate_query!(self.library, query) + } + + fn query_invalidator(&self) -> impl Fn(&'static str) + Send + Sync { + |query| { + invalidate_query!(self.library, query); + } + } + + fn report_update(&self, update: UpdateEvent) { + // FIX-ME: Remove this conversion once we have a proper atomic updates system + let event = match update { + UpdateEvent::NewThumbnail { thumb_key } => CoreEvent::NewThumbnail { thumb_key }, + UpdateEvent::NewIdentifiedObjects { file_path_ids } => { + 
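The new `core/src/context.rs` uses the sealed-trait pattern (`NodeContextExt: sealed::Sealed`) so only this crate can implement the extension trait. A minimal standalone version of that pattern:

```rust
mod context {
    mod sealed {
        pub trait Sealed {}
    }

    pub struct NodeContext;

    impl sealed::Sealed for NodeContext {}

    // Downstream code can call this trait, but cannot implement it for new types,
    // because `sealed::Sealed` is not reachable from outside the `context` module.
    pub trait NodeContextExt: sealed::Sealed {
        fn library_name(&self) -> &'static str;
    }

    impl NodeContextExt for NodeContext {
        fn library_name(&self) -> &'static str {
            "example-library"
        }
    }
}

use context::{NodeContext, NodeContextExt};

fn main() {
    println!("{}", NodeContext.library_name());
}
```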
CoreEvent::NewIdentifiedObjects { file_path_ids } + } + }; + self.node.emit(event); + } + + fn get_data_directory(&self) -> &std::path::Path { + &self.node.data_dir + } +} + +#[derive(Clone)] +pub struct JobContext { + outer_ctx: OuterCtx, + report: Arc>, + start_time: DateTime, + report_update_counter: Arc, +} + +impl OuterContext for JobContext { + fn id(&self) -> Uuid { + self.outer_ctx.id() + } + + fn db(&self) -> &Arc { + self.outer_ctx.db() + } + + fn sync(&self) -> &Arc { + self.outer_ctx.sync() + } + + fn invalidate_query(&self, query: &'static str) { + self.outer_ctx.invalidate_query(query); + } + + fn query_invalidator(&self) -> impl Fn(&'static str) + Send + Sync { + self.outer_ctx.query_invalidator() + } + + fn report_update(&self, update: UpdateEvent) { + self.outer_ctx.report_update(update); + } + + fn get_data_directory(&self) -> &std::path::Path { + self.outer_ctx.get_data_directory() + } +} + +impl sd_core_heavy_lifting::JobContext + for JobContext +{ + fn new(report: Report, outer_ctx: OuterCtx) -> Self { + Self { + report: Arc::new(RwLock::new(report)), + outer_ctx, + start_time: Utc::now(), + report_update_counter: Arc::new(AtomicU8::new(0)), + } + } + + async fn progress(&self, updates: impl IntoIterator + Send) { + let mut report = self.report.write().await; + + // protect against updates if job is not running + if report.status != Status::Running { + return; + }; + + let mut changed_phase = false; + + for update in updates { + match update { + ProgressUpdate::TaskCount(task_count) => { + report.task_count = task_count as i32; + } + ProgressUpdate::CompletedTaskCount(completed_task_count) => { + report.completed_task_count = completed_task_count as i32; + } + + ProgressUpdate::Message(message) => { + trace!(job_id = %report.id, %message, "job message;"); + report.message = message; + } + ProgressUpdate::Phase(phase) => { + trace!( + job_id = %report.id, + "changing phase: {} -> {phase};", + report.phase + ); + report.phase = phase; + changed_phase = true; + } + } + } + + // Calculate elapsed time + let elapsed = Utc::now() - self.start_time; + + // Calculate remaining time + let task_count = report.task_count as usize; + let completed_task_count = report.completed_task_count as usize; + let remaining_task_count = task_count.saturating_sub(completed_task_count); + + // Adding 1 to avoid division by zero + let remaining_time_per_task = elapsed / (completed_task_count + 1) as i32; + + let remaining_time = remaining_time_per_task * remaining_task_count as i32; + + // Update the report with estimated remaining time + report.estimated_completion = Utc::now() + .checked_add_signed(remaining_time) + .unwrap_or(Utc::now()); + + let library = self.outer_ctx.library(); + + let counter = self.report_update_counter.fetch_add(1, Ordering::AcqRel); + + if counter == 50 || counter == 0 || changed_phase { + self.report_update_counter.store(1, Ordering::Release); + + spawn({ + let db = Arc::clone(&library.db); + let mut report = report.clone(); + async move { + if let Err(e) = report.update(&db).await { + error!( + ?e, + "Failed to update job report on debounced job progress event;" + ); + } + } + }); + } + + // emit a CoreEvent + library.emit(CoreEvent::JobProgress(JobProgressEvent { + id: report.id, + library_id: library.id, + task_count: report.task_count, + completed_task_count: report.completed_task_count, + estimated_completion: report.estimated_completion, + phase: report.phase.clone(), + message: report.message.clone(), + })); + } + + async fn report(&self) -> impl Deref { + 
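`JobContext::progress` above does two things worth calling out: it estimates completion time from elapsed time per finished task, and it debounces report persistence with an `AtomicU8` (flush on the first update, every 50th update, or on a phase change). A compact sketch of both, assuming `chrono` 0.4; the report itself is simplified away:

```rust
use std::sync::atomic::{AtomicU8, Ordering};

use chrono::{DateTime, Duration, Utc};

struct ProgressState {
    start_time: DateTime<Utc>,
    update_counter: AtomicU8,
}

impl ProgressState {
    fn estimated_completion(&self, task_count: usize, completed: usize) -> DateTime<Utc> {
        let elapsed: Duration = Utc::now() - self.start_time;
        let remaining_tasks = task_count.saturating_sub(completed);

        // +1 avoids dividing by zero before the first task finishes.
        let per_task = elapsed / (completed + 1) as i32;
        let remaining = per_task * remaining_tasks as i32;

        Utc::now().checked_add_signed(remaining).unwrap_or_else(Utc::now)
    }

    /// Returns true when this update should also be flushed to the database.
    fn should_persist(&self, changed_phase: bool) -> bool {
        let counter = self.update_counter.fetch_add(1, Ordering::AcqRel);
        if counter == 50 || counter == 0 || changed_phase {
            self.update_counter.store(1, Ordering::Release);
            true
        } else {
            false
        }
    }
}

fn main() {
    let state = ProgressState {
        start_time: Utc::now() - Duration::seconds(30),
        update_counter: AtomicU8::new(0),
    };

    println!("ETA: {}", state.estimated_completion(100, 25));
    assert!(state.should_persist(false)); // first update is always persisted
    assert!(!state.should_persist(false)); // subsequent ones are debounced
}
```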
Arc::clone(&self.report).read_owned().await + } + + async fn report_mut(&self) -> impl DerefMut { + Arc::clone(&self.report).write_owned().await + } + + fn get_outer_ctx(&self) -> OuterCtx { + self.outer_ctx.clone() + } +} diff --git a/core/src/custom_uri/mod.rs b/core/src/custom_uri/mod.rs index 7a4766dd6..33af7c727 100644 --- a/core/src/custom_uri/mod.rs +++ b/core/src/custom_uri/mod.rs @@ -1,16 +1,13 @@ use crate::{ api::{utils::InvalidateOperationEvent, CoreEvent}, library::Library, - object::media::old_thumbnail::WEBP_EXTENSION, p2p::operations::{self, request_file}, util::InfallibleResponse, Node, }; -use async_stream::stream; -use bytes::Bytes; -use mpsc_to_async_write::MpscToAsyncWrite; use sd_core_file_path_helper::IsolatedFilePathData; +use sd_core_heavy_lifting::media_processor::WEBP_EXTENSION; use sd_core_prisma_helpers::file_path_to_handle_custom_uri; use sd_file_ext::text::is_text; @@ -30,6 +27,7 @@ use std::{ sync::Arc, }; +use async_stream::stream; use axum::{ body::{self, Body, BoxBody, Full, StreamBody}, extract::{self, State}, @@ -39,6 +37,7 @@ use axum::{ routing::get, Router, }; +use bytes::Bytes; use http_body::combinators::UnsyncBoxBody; use hyper::{header, upgrade::OnUpgrade}; use mini_moka::sync::Cache; @@ -56,6 +55,8 @@ mod mpsc_to_async_write; mod serve_file; mod utils; +use mpsc_to_async_write::MpscToAsyncWrite; + type CacheKey = (Uuid, file_path::id::Type); #[derive(Debug, Clone)] @@ -74,8 +75,8 @@ pub enum ServeFrom { Local, /// Serve from a specific instance Remote { - library_identity: RemoteIdentity, - node_identity: RemoteIdentity, + library_identity: Box, + node_identity: Box, library: Arc, }, } @@ -102,8 +103,8 @@ async fn request_to_remote_node( let mut response = match operations::remote_rspc(p2p.clone(), identity, request).await { Ok(v) => v, - Err(err) => { - warn!("Error doing remote rspc query with '{identity}': {err:?}"); + Err(e) => { + warn!(%identity, ?e, "Error doing remote rspc query with;"); return StatusCode::BAD_GATEWAY.into_response(); } }; @@ -120,21 +121,21 @@ async fn request_to_remote_node( }; tokio::spawn(async move { - let Ok(mut request_upgraded) = request_upgraded.await.map_err(|err| { - warn!("Error upgrading websocket request: {err}"); + let Ok(mut request_upgraded) = request_upgraded.await.map_err(|e| { + warn!(?e, "Error upgrading websocket request;"); }) else { return; }; - let Ok(mut response_upgraded) = response_upgraded.await.map_err(|err| { - warn!("Error upgrading websocket response: {err}"); + let Ok(mut response_upgraded) = response_upgraded.await.map_err(|e| { + warn!(?e, "Error upgrading websocket response;"); }) else { return; }; copy_bidirectional(&mut request_upgraded, &mut response_upgraded) .await - .map_err(|err| { - warn!("Error upgrading websocket response: {err}"); + .map_err(|e| { + warn!(?e, "Error upgrading websocket response;"); }) .ok(); }); @@ -204,8 +205,8 @@ async fn get_or_init_lru_entry( ServeFrom::Local } else { ServeFrom::Remote { - library_identity, - node_identity, + library_identity: Box::new(library_identity), + node_identity: Box::new(node_identity), library: library.clone(), } }, @@ -237,9 +238,9 @@ pub fn base_router() -> Router { .then_some(()) .ok_or_else(|| not_found(()))?; - let file = File::open(&path).await.map_err(|err| { + let file = File::open(&path).await.map_err(|e| { InfallibleResponse::builder() - .status(if err.kind() == io::ErrorKind::NotFound { + .status(if e.kind() == io::ErrorKind::NotFound { StatusCode::NOT_FOUND } else { StatusCode::INTERNAL_SERVER_ERROR @@ -270,7 
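`ServeFrom::Remote` above now stores its identities as boxed values (`Box::new(library_identity)`, later dereferenced with `*node_identity`), presumably to keep the enum itself small. A minimal illustration of why boxing a bulky variant helps, with a stand-in `BigIdentity` type and illustrative sizes:

```rust
struct BigIdentity([u8; 128]);

enum ServeFromInline {
    Local,
    Remote { identity: BigIdentity },
}

enum ServeFromBoxed {
    Local,
    Remote { identity: Box<BigIdentity> },
}

fn main() {
    // An enum is as large as its largest variant, so the cheap `Local` case
    // pays for the inline payload but not for the boxed one.
    println!(
        "inline variant: {} bytes, boxed variant: {} bytes",
        std::mem::size_of::<ServeFromInline>(),
        std::mem::size_of::<ServeFromBoxed>(),
    );

    let serve_from = ServeFromBoxed::Remote {
        identity: Box::new(BigIdentity([0; 128])),
    };

    if let ServeFromBoxed::Remote { identity } = serve_from {
        // Deref back out when the owned value is needed, mirroring `*node_identity`.
        let _owned: BigIdentity = *identity;
    }
}
```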
+271,7 @@ pub fn base_router() -> Router { serve_from, .. }, - .. + _library, ) = get_or_init_lru_entry(&state, path).await?; match serve_from { @@ -282,24 +283,23 @@ pub fn base_router() -> Router { .then_some(()) .ok_or_else(|| not_found(()))?; - let mut file = - File::open(&file_path_full_path).await.map_err(|err| { - InfallibleResponse::builder() - .status(if err.kind() == io::ErrorKind::NotFound { - StatusCode::NOT_FOUND - } else { - StatusCode::INTERNAL_SERVER_ERROR - }) - .body(body::boxed(Full::from(""))) - })?; + let mut file = File::open(&file_path_full_path).await.map_err(|e| { + InfallibleResponse::builder() + .status(if e.kind() == io::ErrorKind::NotFound { + StatusCode::NOT_FOUND + } else { + StatusCode::INTERNAL_SERVER_ERROR + }) + .body(body::boxed(Full::from(""))) + })?; let resp = InfallibleResponse::builder().header( "Content-Type", HeaderValue::from_str( &infer_the_mime_type(&extension, &mut file, &metadata).await?, ) - .map_err(|err| { - error!("Error converting mime-type into header value: {}", err); + .map_err(|e| { + error!(?e, "Error converting mime-type into header value;"); internal_server_error(()) })?, ); @@ -316,15 +316,20 @@ pub fn base_router() -> Router { let (tx, mut rx) = tokio::sync::mpsc::channel::>(150); request_file( state.node.p2p.p2p.clone(), - node_identity, + *node_identity, &library.identity, file_path_pub_id, Range::Full, MpscToAsyncWrite::new(PollSender::new(tx)), ) .await - .map_err(|err| { - error!("Error requesting file {file_path_pub_id:?} from node {:?}: {err:?}", library.identity.to_remote_identity()); + .map_err(|e| { + error!( + %file_path_pub_id, + node_identity = ?library.identity.to_remote_identity(), + ?e, + "Error requesting file from other node;", + ); internal_server_error(()) })?; @@ -352,9 +357,9 @@ pub fn base_router() -> Router { .then_some(()) .ok_or_else(|| not_found(()))?; - let mut file = File::open(&path).await.map_err(|err| { + let mut file = File::open(&path).await.map_err(|e| { InfallibleResponse::builder() - .status(if err.kind() == io::ErrorKind::NotFound { + .status(if e.kind() == io::ErrorKind::NotFound { StatusCode::NOT_FOUND } else { StatusCode::INTERNAL_SERVER_ERROR @@ -368,8 +373,8 @@ pub fn base_router() -> Router { None => "text/plain".to_string(), Some(ext) => infer_the_mime_type(ext, &mut file, &metadata).await?, }) - .map_err(|err| { - error!("Error converting mime-type into header value: {}", err); + .map_err(|e| { + error!(?e, "Error converting mime-type into header value;"); internal_server_error(()) })?, ); @@ -423,8 +428,8 @@ pub fn router(node: Arc) -> Router<()> { mut request: Request| async move { let identity = match RemoteIdentity::from_str(&identity) { Ok(identity) => identity, - Err(err) => { - warn!("Error parsing identity '{}': {}", identity, err); + Err(e) => { + warn!(%identity, ?e, "Error parsing identity;"); return (StatusCode::BAD_REQUEST, HeaderMap::new(), vec![]) .into_response(); } diff --git a/core/src/custom_uri/utils.rs b/core/src/custom_uri/utils.rs index 70171e2eb..645da5106 100644 --- a/core/src/custom_uri/utils.rs +++ b/core/src/custom_uri/utils.rs @@ -11,8 +11,8 @@ use http_body::Full; use tracing::debug; #[track_caller] -pub(crate) fn bad_request(err: impl Debug) -> http::Response { - debug!("400: Bad Request at {}: {err:?}", Location::caller()); +pub(crate) fn bad_request(e: impl Debug) -> http::Response { + debug!(caller = %Location::caller(), ?e, "400: Bad Request;"); InfallibleResponse::builder() .status(StatusCode::BAD_REQUEST) @@ -20,8 +20,8 @@ pub(crate) fn 
bad_request(err: impl Debug) -> http::Response { } #[track_caller] -pub(crate) fn not_found(err: impl Debug) -> http::Response { - debug!("404: Not Found at {}: {err:?}", Location::caller()); +pub(crate) fn not_found(e: impl Debug) -> http::Response { + debug!(caller = %Location::caller(), ?e, "404: Not Found;"); InfallibleResponse::builder() .status(StatusCode::NOT_FOUND) @@ -29,11 +29,8 @@ pub(crate) fn not_found(err: impl Debug) -> http::Response { } #[track_caller] -pub(crate) fn internal_server_error(err: impl Debug) -> http::Response { - debug!( - "500: Internal Server Error at {}: {err:?}", - Location::caller() - ); +pub(crate) fn internal_server_error(e: impl Debug) -> http::Response { + debug!(caller = %Location::caller(), ?e, "500: Internal Server Error;"); InfallibleResponse::builder() .status(StatusCode::INTERNAL_SERVER_ERROR) @@ -41,8 +38,8 @@ pub(crate) fn internal_server_error(err: impl Debug) -> http::Response } #[track_caller] -pub(crate) fn not_implemented(err: impl Debug) -> http::Response { - debug!("501: Not Implemented at {}: {err:?}", Location::caller()); +pub(crate) fn not_implemented(e: impl Debug) -> http::Response { + debug!(caller = %Location::caller(), ?e, "501: Not Implemented;"); InfallibleResponse::builder() .status(StatusCode::NOT_IMPLEMENTED) diff --git a/core/src/lib.rs b/core/src/lib.rs index 70879d0d4..cdfda558b 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -4,18 +4,16 @@ use crate::{ api::{CoreEvent, Router}, location::LocationManagerError, - object::media::old_thumbnail::old_actor::OldThumbnailer, }; +use sd_core_heavy_lifting::{media_processor::ThumbnailKind, JobSystem}; +use sd_core_prisma_helpers::CasId; + #[cfg(feature = "ai")] use sd_ai::old_image_labeler::{DownloadModelError, OldImageLabeler, YoloV8}; -use sd_utils::error::FileIOError; -use api::notifications::{Notification, NotificationData, NotificationId}; -use chrono::{DateTime, Utc}; -use node::config; -use notifications::Notifications; -use reqwest::{RequestBuilder, Response}; +use sd_task_system::TaskSystem; +use sd_utils::error::FileIOError; use std::{ fmt, @@ -23,6 +21,9 @@ use std::{ sync::{atomic::AtomicBool, Arc}, }; +use chrono::{DateTime, Utc}; +use futures_concurrency::future::Join; +use reqwest::{RequestBuilder, Response}; use thiserror::Error; use tokio::{fs, io, sync::broadcast}; use tracing::{error, info, warn}; @@ -34,6 +35,9 @@ use tracing_subscriber::{filter::FromEnvError, prelude::*, EnvFilter}; pub mod api; mod cloud; +mod context; +#[cfg(feature = "crypto")] +pub(crate) mod crypto; pub mod custom_uri; mod env; pub mod library; @@ -50,7 +54,10 @@ pub(crate) mod volume; pub use env::Env; -use object::media::old_thumbnail::get_ephemeral_thumbnail_path; +use api::notifications::{Notification, NotificationData, NotificationId}; +use context::{JobContext, NodeContext}; +use node::config; +use notifications::Notifications; pub(crate) use sd_core_sync as sync; @@ -65,10 +72,11 @@ pub struct Node { pub p2p: Arc, pub event_bus: (broadcast::Sender, broadcast::Receiver), pub notifications: Notifications, - pub thumbnailer: OldThumbnailer, pub cloud_sync_flag: Arc, pub env: Arc, pub http: reqwest::Client, + pub task_system: TaskSystem, + pub job_system: JobSystem>, #[cfg(feature = "ai")] pub old_image_labeller: Option, } @@ -88,7 +96,7 @@ impl Node { ) -> Result<(Arc, Arc), NodeError> { let data_dir = data_dir.as_ref(); - info!("Starting core with data directory '{}'", data_dir.display()); + info!(data_directory = %data_dir.display(), "Starting core;"); let env = 
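The `custom_uri` helpers above pair `#[track_caller]` with `std::panic::Location::caller()` so the 400/404/500 log lines point at the handler that produced the error, not at the helper itself. The same pattern in a dependency-free form, with `println!` standing in for `tracing::debug!`:

```rust
use std::panic::Location;

#[track_caller]
fn bad_request(e: impl std::fmt::Debug) -> u16 {
    // With #[track_caller], Location::caller() reports the call site of
    // bad_request(), not this line inside the helper.
    println!("400: Bad Request at {}: {e:?}", Location::caller());
    400
}

fn main() {
    let status = bad_request("missing `library_id` query parameter");
    assert_eq!(status, 400);
}
```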
Arc::new(env); @@ -117,22 +125,19 @@ impl Node { let (old_jobs, jobs_actor) = old_job::OldJobs::new(); let libraries = library::Libraries::new(data_dir.join("libraries")).await?; + let task_system = TaskSystem::new(); + let (p2p, start_p2p) = p2p::P2PManager::new(config.clone(), libraries.clone()) .await .map_err(NodeError::P2PManager)?; let node = Arc::new(Node { data_dir: data_dir.to_path_buf(), + job_system: JobSystem::new(task_system.get_dispatcher(), data_dir), + task_system, old_jobs, locations, notifications: notifications::Notifications::new(), p2p, - thumbnailer: OldThumbnailer::new( - data_dir, - libraries.clone(), - event_bus.0.clone(), - config.preferences_watcher(), - ) - .await, config, event_bus, libraries, @@ -146,7 +151,10 @@ impl Node { ) .await .map_err(|e| { - error!("Failed to initialize image labeller. AI features will be disabled: {e:#?}"); + error!( + ?e, + "Failed to initialize image labeller. AI features will be disabled;" + ); }) .ok(), }); @@ -168,6 +176,27 @@ impl Node { locations_actor.start(node.clone()); node.libraries.init(&node).await?; jobs_actor.start(node.clone()); + + node.job_system + .init( + &node + .libraries + .get_all() + .await + .into_iter() + .map(|library| { + ( + library.id, + NodeContext { + library, + node: Arc::clone(&node), + }, + ) + }) + .collect(), + ) + .await?; + start_p2p( node.clone(), axum::Router::new() @@ -188,7 +217,7 @@ impl Node { .into_make_service(), ); - info!("Spacedrive online."); + info!("Spacedrive online!"); Ok((node, router)) } @@ -212,7 +241,14 @@ impl Node { std::env::set_var( "RUST_LOG", - format!("info,sd_core={level},sd_p2p=debug,sd_core::location::manager=info,sd_ai={level}"), + format!( + "info,\ + sd_core={level},\ + sd_p2p={level},\ + sd_core_heavy_lifting={level},\ + sd_task_system={level},\ + sd_ai={level}" + ), ); } @@ -259,9 +295,18 @@ impl Node { pub async fn shutdown(&self) { info!("Spacedrive shutting down..."); - self.thumbnailer.shutdown().await; - self.old_jobs.shutdown().await; - self.p2p.shutdown().await; + + // Let's shutdown the task system first, as the job system will receive tasks to save + self.task_system.shutdown().await; + + ( + self.old_jobs.shutdown(), + self.p2p.shutdown(), + self.job_system.shutdown(), + ) + .join() + .await; + #[cfg(feature = "ai")] if let Some(image_labeller) = &self.old_image_labeller { image_labeller.shutdown().await; @@ -271,12 +316,16 @@ impl Node { pub(crate) fn emit(&self, event: CoreEvent) { if let Err(e) = self.event_bus.0.send(event) { - warn!("Error sending event to event bus: {e:?}"); + warn!(?e, "Error sending event to event bus;"); } } - pub async fn ephemeral_thumbnail_exists(&self, cas_id: &str) -> Result { - let thumb_path = get_ephemeral_thumbnail_path(self, cas_id); + pub async fn ephemeral_thumbnail_exists( + &self, + cas_id: &CasId<'_>, + ) -> Result { + let thumb_path = + ThumbnailKind::Ephemeral.compute_path(self.config.data_directory(), cas_id); match fs::metadata(&thumb_path).await { Ok(_) => Ok(true), @@ -301,8 +350,8 @@ impl Node { Ok(_) => { self.notifications._internal_send(notification); } - Err(err) => { - error!("Error saving notification to config: {:?}", err); + Err(e) => { + error!(?e, "Error saving notification to config;"); } } } @@ -375,6 +424,9 @@ pub enum NodeError { InitConfig(#[from] util::debug_initializer::InitConfigError), #[error("logger error: {0}")] Logger(#[from] FromEnvError), + #[error(transparent)] + JobSystem(#[from] sd_core_heavy_lifting::JobSystemError), + #[cfg(feature = "ai")] #[error("ai error: {0}")] 
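`Node::shutdown` above now drains the task system first (so the job system can still receive tasks to save) and then shuts the remaining subsystems down concurrently by joining a tuple of futures via `futures_concurrency`. A runnable sketch of that ordering with stub subsystems; the `tokio` and `futures-concurrency` crate versions are assumptions:

```rust
use std::time::Duration;

use futures_concurrency::future::Join;
use tokio::time::sleep;

async fn shutdown_task_system() {
    sleep(Duration::from_millis(20)).await;
    println!("task system drained");
}

async fn shutdown_old_jobs() {
    sleep(Duration::from_millis(10)).await;
    println!("old jobs stopped");
}

async fn shutdown_p2p() {
    sleep(Duration::from_millis(10)).await;
    println!("p2p stopped");
}

async fn shutdown_job_system() {
    sleep(Duration::from_millis(10)).await;
    println!("job system persisted and stopped");
}

#[tokio::main]
async fn main() {
    // Sequential: nothing else should run while tasks are being drained.
    shutdown_task_system().await;

    // Concurrent: the remaining subsystems are independent, so join them as a tuple.
    (shutdown_old_jobs(), shutdown_p2p(), shutdown_job_system())
        .join()
        .await;
}
```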
AI(#[from] sd_ai::Error), diff --git a/core/src/library/config.rs b/core/src/library/config.rs index f12a2711f..863f744f9 100644 --- a/core/src/library/config.rs +++ b/core/src/library/config.rs @@ -130,7 +130,7 @@ impl LibraryConfig { db.indexer_rule().update_many( vec![indexer_rule::name::equals(Some(name))], vec![indexer_rule::pub_id::set(sd_utils::uuid_to_bytes( - Uuid::from_u128(i as u128), + &Uuid::from_u128(i as u128), ))], ) }) @@ -221,7 +221,7 @@ impl LibraryConfig { maybe_missing(path.size_in_bytes, "file_path.size_in_bytes") .map_or_else( |e| { - error!("{e:#?}"); + error!(?e); None }, Some, @@ -232,9 +232,11 @@ impl LibraryConfig { Some(size.to_be_bytes().to_vec()) } else { error!( - "File path had invalid size: '{}'", - path.id, size_in_bytes + file_path_id = %path.id, + size = %size_in_bytes, + "File path had invalid size;", ); + None }; @@ -463,7 +465,8 @@ impl LibraryConfig { } _ => { - error!("Library config version is not handled: {:?}", current); + error!(current_version = ?current, "Library config version is not handled;"); + return Err(VersionManagerError::UnexpectedMigration { current_version: current.int_value(), next_version: next.int_value(), diff --git a/core/src/library/library.rs b/core/src/library/library.rs index 57ad5ef04..795714ab8 100644 --- a/core/src/library/library.rs +++ b/core/src/library/library.rs @@ -1,9 +1,8 @@ -use crate::{ - api::CoreEvent, cloud, object::media::old_thumbnail::get_indexed_thumbnail_path, sync, Node, -}; +use crate::{api::CoreEvent, cloud, sync, Node}; use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_to_full_path; +use sd_core_heavy_lifting::media_processor::ThumbnailKind; +use sd_core_prisma_helpers::{file_path_to_full_path, CasId}; use sd_p2p::Identity; use sd_prisma::prisma::{file_path, location, PrismaClient}; @@ -121,12 +120,17 @@ impl Library { // TODO: Remove this once we replace the old invalidation system pub(crate) fn emit(&self, event: CoreEvent) { if let Err(e) = self.event_bus_tx.send(event) { - warn!("Error sending event to event bus: {e:?}"); + warn!(?e, "Error sending event to event bus;"); } } - pub async fn thumbnail_exists(&self, node: &Node, cas_id: &str) -> Result { - let thumb_path = get_indexed_thumbnail_path(node, cas_id, self.id); + pub async fn thumbnail_exists( + &self, + node: &Node, + cas_id: &CasId<'_>, + ) -> Result { + let thumb_path = + ThumbnailKind::Indexed(self.id).compute_path(node.config.data_directory(), cas_id); match fs::metadata(&thumb_path).await { Ok(_) => Ok(true), @@ -182,7 +186,7 @@ impl Library { pub fn do_cloud_sync(&self) { if let Err(e) = self.do_cloud_sync.send(()) { - warn!("Error sending cloud resync message: {e:?}"); + warn!(?e, "Error sending cloud resync message;"); } } } diff --git a/core/src/library/manager/mod.rs b/core/src/library/manager/mod.rs index 309ab003e..3c98d0a37 100644 --- a/core/src/library/manager/mod.rs +++ b/core/src/library/manager/mod.rs @@ -36,7 +36,7 @@ use tokio::{ sync::{broadcast, RwLock}, time::sleep, }; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, instrument, warn}; use uuid::Uuid; use super::{Library, LibraryConfig, LibraryName}; @@ -113,9 +113,9 @@ impl Libraries { .and_then(|v| v.to_str().map(Uuid::from_str)) else { warn!( - "Attempted to load library from path '{}' \ - but it has an invalid filename. Skipping...", - config_path.display() + config_path = %config_path.display(), + "Attempted to load library from path \ + but it has an invalid filename. 
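Both `thumbnail_exists` helpers above reduce to: compute the thumbnail path, stat it, treat `NotFound` as `false`, and surface any other I/O error. That check in isolation with `tokio::fs` (the path-computation half is omitted):

```rust
use std::{io, path::Path};

use tokio::fs;

async fn file_exists(path: impl AsRef<Path>) -> io::Result<bool> {
    match fs::metadata(path).await {
        Ok(_) => Ok(true),
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(false),
        Err(e) => Err(e),
    }
}

#[tokio::main]
async fn main() -> io::Result<()> {
    println!("exists: {}", file_exists("Cargo.toml").await?);
    println!("exists: {}", file_exists("definitely/not/here.webp").await?);
    Ok(())
}
```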
Skipping...;", ); continue; }; @@ -124,7 +124,11 @@ impl Libraries { match fs::metadata(&db_path).await { Ok(_) => {} Err(e) if e.kind() == io::ErrorKind::NotFound => { - warn!("Found library '{}' but no matching database file was found. Skipping...", config_path.display()); + warn!( + config_path = %config_path.display(), + "Found library but no matching database file was found. Skipping...;", + ); + continue; } Err(e) => return Err(FileIOError::from((db_path, e)).into()), @@ -158,6 +162,7 @@ impl Libraries { .await } + #[instrument(skip(self, instance, node), err)] #[allow(clippy::too_many_arguments)] pub(crate) async fn create_with_uuid( self: &Arc, @@ -189,9 +194,8 @@ impl Libraries { .await?; debug!( - "Created library '{}' config at '{}'", - id, - config_path.display() + config_path = %config_path.display(), + "Created library;", ); let node_cfg = node.config.get().await; @@ -225,12 +229,12 @@ impl Libraries { ) .await?; - debug!("Loaded library '{id:?}'"); + debug!("Loaded library"); if should_seed { tag::seed::new_library(&library).await?; sd_core_indexer_rules::seed::new_or_existing_library(&library.db).await?; - debug!("Seeded library '{id:?}'"); + debug!("Seeded library"); } invalidate_query!(library, "library.list"); @@ -325,7 +329,7 @@ impl Libraries { .exec() .await .map(|locations| locations.into_iter().filter_map(|location| location.path)) - .map_err(|e| error!("Failed to fetch locations for library deletion: {e:#?}")) + .map_err(|e| error!(?e, "Failed to fetch locations for library deletion;")) { location_paths .map(|location_path| async move { @@ -343,7 +347,7 @@ impl Libraries { .into_iter() .for_each(|res| { if let Err(e) = res { - error!("Failed to remove library from location metadata: {e:#?}"); + error!(?e, "Failed to remove library from location metadata;"); } }); } @@ -371,7 +375,7 @@ impl Libraries { .remove(id) .expect("we have exclusive access and checked it exists!"); - info!("Removed Library ", library.id); + info!(%library.id, "Removed Library;"); invalidate_query!(library, "library.list"); @@ -420,6 +424,16 @@ impl Libraries { self.libraries.read().await.get(library_id).is_some() } + #[instrument( + skip_all, + fields( + library_id = %id, + db_path = %db_path.as_ref().display(), + config_path = %config_path.as_ref().display(), + %should_seed, + ), + err, + )] /// load the library from a given path. pub async fn load( self: &Arc, @@ -479,8 +493,9 @@ impl Libraries { || curr_metadata != Some(node.p2p.peer_metadata()) { info!( - "Detected that the library '{}' has changed node from '{}' to '{}'. Reconciling node data...", - id, instance_node_id, node_config.id + old_node_id = %instance_node_id, + new_node_id = %node_config.id, + "Detected that the library has changed nodes. Reconciling node data...", ); // ensure @@ -593,12 +608,12 @@ impl Libraries { .await? { if let Err(e) = node.locations.add(location.id, library.clone()).await { - error!("Failed to watch location on startup: {e}"); + error!(?e, "Failed to watch location on startup;"); }; } if let Err(e) = node.old_jobs.clone().cold_resume(node, &library).await { - error!("Failed to resume jobs for library. {:#?}", e); + error!(?e, "Failed to resume jobs for library;"); } tokio::spawn({ @@ -639,20 +654,20 @@ impl Libraries { if should_update { warn!("Library instance on cloud is outdated. 
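The library manager above starts using `#[instrument]` with `skip_all`/`skip(...)`, explicit `fields(...)`, and `err`, so each call gets a span carrying the library id and paths and failures are logged automatically. A hedged, minimal example of the same attribute shape; the function body and the `tokio`/`tracing-subscriber`/`uuid` dependencies are stand-ins, not the real `load` implementation:

```rust
use tracing::instrument;
use uuid::Uuid;

// skip_all suppresses automatic argument capture; fields() re-adds the
// interesting ones, and `err` emits an error event when the function returns Err.
#[instrument(skip_all, fields(library_id = %id, %should_seed), err)]
async fn load_library(id: Uuid, should_seed: bool, raw_config: Vec<u8>) -> Result<(), String> {
    if raw_config.is_empty() {
        return Err("empty library config".into());
    }
    tracing::debug!("Loaded library");
    Ok(())
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    let _ = load_library(Uuid::new_v4(), true, Vec::new()).await;
}
```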
Updating..."); - if let Err(err) = - sd_cloud_api::library::update_instance( - node.cloud_api_config().await, - library.id, - this_instance.uuid, - Some(node_config.id), - Some(node_config.identity.to_remote_identity()), - Some(node.p2p.peer_metadata()), - ) - .await + if let Err(e) = sd_cloud_api::library::update_instance( + node.cloud_api_config().await, + library.id, + this_instance.uuid, + Some(node_config.id), + Some(node_config.identity.to_remote_identity()), + Some(node.p2p.peer_metadata()), + ) + .await { error!( - "Failed to updating instance '{}' on cloud: {:#?}", - this_instance.uuid, err + instance_uuid = %this_instance.uuid, + ?e, + "Failed to updating instance on cloud;", ); } } @@ -661,29 +676,26 @@ impl Libraries { if lib.name != *library.config().await.name { warn!("Library name on cloud is outdated. Updating..."); - if let Err(err) = sd_cloud_api::library::update( + if let Err(e) = sd_cloud_api::library::update( node.cloud_api_config().await, library.id, Some(lib.name), ) .await { - error!( - "Failed to update library name on cloud: {:#?}", - err - ); + error!(?e, "Failed to update library name on cloud;"); } } for instance in lib.instances { - if let Err(err) = cloud::sync::receive::upsert_instance( + if let Err(e) = cloud::sync::receive::upsert_instance( library.id, &library.db, &library.sync, &node.libraries, - instance.uuid, + &instance.uuid, instance.identity, - instance.node_id, + &instance.node_id, RemoteIdentity::from_str( &instance.node_remote_identity, ) @@ -692,10 +704,7 @@ impl Libraries { ) .await { - error!( - "Failed to create instance from cloud: {:#?}", - err - ); + error!(?e, "Failed to create instance on cloud;"); } } } diff --git a/core/src/library/statistics.rs b/core/src/library/statistics.rs index 529de701e..74c7fcd87 100644 --- a/core/src/library/statistics.rs +++ b/core/src/library/statistics.rs @@ -37,8 +37,8 @@ pub async fn update_library_statistics( .find_many(vec![]) .exec() .await - .unwrap_or_else(|err| { - error!("Failed to get locations: {:#?}", err); + .unwrap_or_else(|e| { + error!(?e, "Failed to get locations;"); vec![] }) .into_iter() @@ -79,7 +79,7 @@ pub async fn update_library_statistics( .exec() .await?; - info!("Updated library statistics: {:?}", stats); + info!(?stats, "Updated library statistics;"); Ok(stats) } diff --git a/core/src/location/error.rs b/core/src/location/error.rs index df1bb3c93..87d074b3f 100644 --- a/core/src/location/error.rs +++ b/core/src/location/error.rs @@ -81,35 +81,33 @@ pub enum LocationError { } impl From for rspc::Error { - fn from(err: LocationError) -> Self { + fn from(e: LocationError) -> Self { use LocationError::*; - match err { + match e { // Not found errors PathNotFound(_) | UuidNotFound(_) | IdNotFound(_) | FilePath(FilePathError::IdNotFound(_) | FilePathError::NotFound(_)) => { - Self::with_cause(ErrorCode::NotFound, err.to_string(), err) + Self::with_cause(ErrorCode::NotFound, e.to_string(), e) } // User's fault errors NotDirectory(_) | NestedLocation(_) | LocationAlreadyExists(_) => { - Self::with_cause(ErrorCode::BadRequest, err.to_string(), err) + Self::with_cause(ErrorCode::BadRequest, e.to_string(), e) } // Custom error message is used to differentiate these errors in the frontend // TODO: A better solution would be for rspc to support sending custom data alongside errors - NeedRelink { .. } => { - Self::with_cause(ErrorCode::Conflict, "NEED_RELINK".to_owned(), err) - } + NeedRelink { .. 
} => Self::with_cause(ErrorCode::Conflict, "NEED_RELINK".to_owned(), e), AddLibraryToMetadata(_) => { - Self::with_cause(ErrorCode::Conflict, "ADD_LIBRARY".to_owned(), err) + Self::with_cause(ErrorCode::Conflict, "ADD_LIBRARY".to_owned(), e) } // Internal errors MissingField(missing_error) => missing_error.into(), - _ => Self::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => Self::with_cause(ErrorCode::InternalServerError, e.to_string(), e), } } } diff --git a/core/src/location/indexer/mod.rs b/core/src/location/indexer/mod.rs deleted file mode 100644 index ef1dff558..000000000 --- a/core/src/location/indexer/mod.rs +++ /dev/null @@ -1,546 +0,0 @@ -use crate::library::Library; - -use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData, IsolatedFilePathDataParts}; -use sd_core_indexer_rules::IndexerRuleError; -use sd_core_prisma_helpers::file_path_pub_and_cas_ids; - -use sd_prisma::{ - prisma::{file_path, location, PrismaClient}, - prisma_sync, -}; -use sd_sync::*; -use sd_utils::{db::inode_to_db, error::FileIOError, from_bytes_to_uuid, msgpack}; - -use std::{collections::HashMap, path::Path}; - -use chrono::Utc; -use futures_concurrency::future::TryJoin; -use itertools::Itertools; -use prisma_client_rust::operator::or; -use rspc::ErrorCode; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tracing::{trace, warn}; - -use super::location_with_indexer_rules; - -pub mod old_indexer_job; -mod old_shallow; -mod old_walk; - -use old_walk::WalkedEntry; - -pub use old_indexer_job::OldIndexerJobInit; -pub use old_shallow::*; - -#[derive(Serialize, Deserialize, Debug)] -pub struct OldIndexerJobSaveStep { - chunk_idx: usize, - walked: Vec, -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct OldIndexerJobUpdateStep { - chunk_idx: usize, - to_update: Vec, -} - -/// Error type for the indexer module -#[derive(Error, Debug)] -pub enum IndexerError { - // Not Found errors - #[error("indexer rule not found: ")] - IndexerRuleNotFound(i32), - #[error("received sub path not in database: ", .0.display())] - SubPathNotFound(Box), - - // Internal Errors - #[error("Database Error: {}", .0.to_string())] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - FileIO(#[from] FileIOError), - #[error(transparent)] - FilePath(#[from] FilePathError), - - // Mixed errors - #[error(transparent)] - IndexerRules(#[from] IndexerRuleError), -} - -impl From for rspc::Error { - fn from(err: IndexerError) -> Self { - match err { - IndexerError::IndexerRuleNotFound(_) | IndexerError::SubPathNotFound(_) => { - rspc::Error::with_cause(ErrorCode::NotFound, err.to_string(), err) - } - - IndexerError::IndexerRules(rule_err) => rule_err.into(), - - _ => rspc::Error::with_cause(ErrorCode::InternalServerError, err.to_string(), err), - } - } -} - -async fn execute_indexer_save_step( - location: &location_with_indexer_rules::Data, - OldIndexerJobSaveStep { walked, .. }: &OldIndexerJobSaveStep, - library: &Library, -) -> Result { - let Library { sync, db, .. } = library; - - let (sync_stuff, paths): (Vec<_>, Vec<_>) = walked - .iter() - .map(|entry| { - let IsolatedFilePathDataParts { - materialized_path, - is_dir, - name, - extension, - .. 
- } = &entry.iso_file_path.to_parts(); - - use file_path::*; - - let pub_id = sd_utils::uuid_to_bytes(entry.pub_id); - - let (sync_params, db_params): (Vec<_>, Vec<_>) = [ - ( - ( - location::NAME, - msgpack!(prisma_sync::location::SyncId { - pub_id: location.pub_id.clone() - }), - ), - location_id::set(Some(location.id)), - ), - sync_db_entry!(materialized_path.to_string(), materialized_path), - sync_db_entry!(name.to_string(), name), - sync_db_entry!(*is_dir, is_dir), - sync_db_entry!(extension.to_string(), extension), - sync_db_entry!( - entry.metadata.size_in_bytes.to_be_bytes().to_vec(), - size_in_bytes_bytes - ), - sync_db_entry!(inode_to_db(entry.metadata.inode), inode), - { - let v = entry.metadata.created_at.into(); - sync_db_entry!(v, date_created) - }, - { - let v = entry.metadata.modified_at.into(); - sync_db_entry!(v, date_modified) - }, - { - let v = Utc::now().into(); - sync_db_entry!(v, date_indexed) - }, - sync_db_entry!(entry.metadata.hidden, hidden), - ] - .into_iter() - .unzip(); - - ( - sync.shared_create( - prisma_sync::file_path::SyncId { - pub_id: sd_utils::uuid_to_bytes(entry.pub_id), - }, - sync_params, - ), - file_path::create_unchecked(pub_id, db_params), - ) - }) - .unzip(); - - let count = sync - .write_ops( - db, - ( - sync_stuff.into_iter().flatten().collect(), - db.file_path().create_many(paths).skip_duplicates(), - ), - ) - .await?; - - trace!("Inserted {count} records"); - - Ok(count) -} - -async fn execute_indexer_update_step( - update_step: &OldIndexerJobUpdateStep, - Library { sync, db, .. }: &Library, -) -> Result { - let (sync_stuff, paths_to_update): (Vec<_>, Vec<_>) = update_step - .to_update - .iter() - .map(|entry| async move { - let IsolatedFilePathDataParts { is_dir, .. } = &entry.iso_file_path.to_parts(); - - let pub_id = sd_utils::uuid_to_bytes(entry.pub_id); - - let should_unlink_object = if let Some(object_id) = entry.maybe_object_id { - db.file_path() - .count(vec![file_path::object_id::equals(Some(object_id))]) - .exec() - .await? > 1 - } else { - false - }; - - use file_path::*; - - let (sync_params, db_params): (Vec<_>, Vec<_>) = [ - // As this file was updated while Spacedrive was offline, we mark the object_id and cas_id as null - // So this file_path will be updated at file identifier job - should_unlink_object - .then_some(((object_id::NAME, msgpack!(nil)), object::disconnect())), - Some(((cas_id::NAME, msgpack!(nil)), cas_id::set(None))), - Some(sync_db_entry!(*is_dir, is_dir)), - Some(sync_db_entry!( - entry.metadata.size_in_bytes.to_be_bytes().to_vec(), - size_in_bytes_bytes - )), - Some(sync_db_entry!(inode_to_db(entry.metadata.inode), inode)), - Some({ - let v = entry.metadata.created_at.into(); - sync_db_entry!(v, date_created) - }), - Some({ - let v = entry.metadata.modified_at.into(); - sync_db_entry!(v, date_modified) - }), - Some(sync_db_entry!(entry.metadata.hidden, hidden)), - ] - .into_iter() - .flatten() - .unzip(); - - Ok::<_, IndexerError>(( - sync_params - .into_iter() - .map(|(field, value)| { - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: pub_id.clone(), - }, - field, - value, - ) - }) - .collect::>(), - db.file_path() - .update(file_path::pub_id::equals(pub_id), db_params) - .select(file_path::select!({ id })), - )) - }) - .collect::>() - .try_join() - .await? 
- .into_iter() - .unzip(); - - let updated = sync - .write_ops( - db, - (sync_stuff.into_iter().flatten().collect(), paths_to_update), - ) - .await?; - - trace!("Updated {updated:?} records"); - - Ok(updated.len() as i64) -} - -fn iso_file_path_factory( - location_id: location::id::Type, - location_path: &Path, -) -> impl Fn(&Path, bool) -> Result, IndexerError> + '_ { - move |path, is_dir| { - IsolatedFilePathData::new(location_id, location_path, path, is_dir).map_err(Into::into) - } -} - -async fn remove_non_existing_file_paths( - to_remove: impl IntoIterator, - db: &PrismaClient, - sync: &sd_core_sync::Manager, -) -> Result { - let (sync_params, db_params): (Vec<_>, Vec<_>) = to_remove - .into_iter() - .map(|d| { - ( - sync.shared_delete(prisma_sync::file_path::SyncId { pub_id: d.pub_id }), - d.id, - ) - }) - .unzip(); - - sync.write_ops( - db, - ( - sync_params, - db.file_path() - .delete_many(vec![file_path::id::in_vec(db_params)]), - ), - ) - .await?; - - Ok(0) -} - -// TODO: Change this macro to a fn when we're able to return -// `impl Fn(Vec) -> impl Future, IndexerError>>` -// Maybe when TAITs arrive -#[macro_export] -macro_rules! file_paths_db_fetcher_fn { - ($db:expr) => {{ - |found_paths| async { - // Each found path is a AND with 4 terms, and SQLite has a expression tree limit of 1000 terms - // so we will use chunks of 200 just to be safe - - // FIXME: Can't pass this chunks variable direct to _batch because of lifetime issues - let chunks = found_paths - .into_iter() - .chunks(200) - .into_iter() - .map(|founds| { - $db.file_path() - .find_many(vec![::prisma_client_rust::operator::or( - founds.collect::>(), - )]) - .select(::sd_core_prisma_helpers::file_path_walker::select()) - }) - .collect::>(); - - $db._batch(chunks) - .await - .map(|fetched| fetched.into_iter().flatten().collect::>()) - .map_err(Into::into) - } - }}; -} - -// TODO: Change this macro to a fn when we're able to return -// `impl Fn(&Path, Vec) -> impl Future, IndexerError>>` -// Maybe when TAITs arrive -// FIXME: (fogodev) I was receiving this error here https://github.com/rust-lang/rust/issues/74497 -#[macro_export] -macro_rules! 
to_remove_db_fetcher_fn { - ($location_id:expr, $db:expr) => {{ - |parent_iso_file_path, unique_location_id_materialized_path_name_extension_params| async { - let location_id: ::sd_prisma::prisma::location::id::Type = $location_id; - let db: &::sd_prisma::prisma::PrismaClient = $db; - let parent_iso_file_path: ::sd_core_file_path_helper::IsolatedFilePathData< - 'static, - > = parent_iso_file_path; - let unique_location_id_materialized_path_name_extension_params: ::std::vec::Vec< - ::sd_prisma::prisma::file_path::WhereParam, - > = unique_location_id_materialized_path_name_extension_params; - - // FIXME: Can't pass this chunks variable direct to _batch because of lifetime issues - let chunks = unique_location_id_materialized_path_name_extension_params - .into_iter() - .chunks(200) - .into_iter() - .map(|unique_params| { - db.file_path() - .find_many(vec![::prisma_client_rust::operator::or( - unique_params.collect(), - )]) - .select(::sd_prisma::prisma::file_path::select!({ id })) - }) - .collect::<::std::vec::Vec<_>>(); - - let founds_ids = db._batch(chunks).await.map(|founds_chunk| { - founds_chunk - .into_iter() - .map(|file_paths| file_paths.into_iter().map(|file_path| file_path.id)) - .flatten() - .collect::<::std::collections::HashSet<_>>() - })?; - - // NOTE: This batch size can be increased if we wish to trade memory for more performance - const BATCH_SIZE: i64 = 1000; - - let mut to_remove = vec![]; - let mut cursor = 1; - - loop { - let found = $db.file_path() - .find_many(vec![ - ::sd_prisma::prisma::file_path::location_id::equals(Some(location_id)), - ::sd_prisma::prisma::file_path::materialized_path::equals(Some( - parent_iso_file_path - .materialized_path_for_children() - .expect("the received isolated file path must be from a directory"), - )), - ]) - .order_by(::sd_prisma::prisma::file_path::id::order(::sd_prisma::prisma::SortOrder::Asc)) - .take(BATCH_SIZE) - .cursor(::sd_prisma::prisma::file_path::id::equals(cursor)) - .select(::sd_prisma::prisma::file_path::select!({ id pub_id cas_id })) - .exec() - .await?; - - let should_stop = (found.len() as i64) < BATCH_SIZE; - - if let Some(last) = found.last() { - cursor = last.id; - } else { - break; - } - - to_remove.extend( - found - .into_iter() - .filter(|file_path| !founds_ids.contains(&file_path.id)) - .map(|file_path| ::sd_core_prisma_helpers::file_path_pub_and_cas_ids::Data { - id: file_path.id, - pub_id: file_path.pub_id, - cas_id: file_path.cas_id, - }), - ); - - if should_stop { - break; - } - } - - Ok(to_remove) - } - }}; -} - -pub async fn reverse_update_directories_sizes( - base_path: impl AsRef, - location_id: location::id::Type, - location_path: impl AsRef, - library: &Library, -) -> Result<(), FilePathError> { - let base_path = base_path.as_ref(); - let location_path = location_path.as_ref(); - - let Library { sync, db, .. } = library; - - let ancestors = base_path - .ancestors() - .take_while(|&ancestor| ancestor != location_path) - .map(|ancestor| IsolatedFilePathData::new(location_id, location_path, ancestor, true)) - .collect::, _>>()?; - - let chunked_queries = ancestors - .iter() - .chunks(200) - .into_iter() - .map(|ancestors_iso_file_paths_chunk| { - db.file_path() - .find_many(vec![or(ancestors_iso_file_paths_chunk - .into_iter() - .map(file_path::WhereParam::from) - .collect::>())]) - .select(file_path::select!({ pub_id materialized_path name })) - }) - .collect::>(); - - let mut pub_id_by_ancestor_materialized_path = db - ._batch(chunked_queries) - .await? 
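The removed `to_remove_db_fetcher_fn!` macro above pages through `file_path` rows with a cursor on `id` in batches of 1000, stopping once a batch comes back short. The same loop shape over an in-memory, id-sorted table instead of Prisma (the real query's cursor is inclusive; this sketch simply advances past it):

```rust
const BATCH_SIZE: usize = 1000;

#[derive(Clone)]
struct Row {
    id: i64,
}

fn fetch_page(table: &[Row], cursor: i64, limit: usize) -> Vec<Row> {
    table
        .iter()
        .filter(|row| row.id >= cursor)
        .take(limit)
        .cloned()
        .collect()
}

fn main() {
    let table: Vec<Row> = (1..=2500).map(|id| Row { id }).collect();

    let mut cursor = 1;
    let mut seen = 0usize;

    loop {
        let found = fetch_page(&table, cursor, BATCH_SIZE);
        let should_stop = found.len() < BATCH_SIZE;

        if let Some(last) = found.last() {
            // Next page starts right after where this one ended.
            cursor = last.id + 1;
            seen += found.len();
        } else {
            break;
        }

        if should_stop {
            break;
        }
    }

    assert_eq!(seen, 2500);
}
```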
- .into_iter() - .flatten() - .filter_map( - |file_path| match (file_path.materialized_path, file_path.name) { - (Some(materialized_path), Some(name)) => { - Some((format!("{materialized_path}{name}/"), (file_path.pub_id, 0))) - } - _ => { - warn!( - "Found a file_path missing its materialized_path or name: ", - from_bytes_to_uuid(&file_path.pub_id) - ); - None - } - }, - ) - .collect::>(); - - db.file_path() - .find_many(vec![ - file_path::location_id::equals(Some(location_id)), - file_path::materialized_path::in_vec( - ancestors - .iter() - .map(|ancestor_iso_file_path| { - ancestor_iso_file_path - .materialized_path_for_children() - .expect("each ancestor is a directory") - }) - .collect(), - ), - ]) - .select(file_path::select!({ materialized_path size_in_bytes_bytes })) - .exec() - .await? - .into_iter() - .for_each(|file_path| { - if let Some(materialized_path) = file_path.materialized_path { - if let Some((_, size)) = - pub_id_by_ancestor_materialized_path.get_mut(&materialized_path) - { - *size += file_path - .size_in_bytes_bytes - .map(|size_in_bytes_bytes| { - u64::from_be_bytes([ - size_in_bytes_bytes[0], - size_in_bytes_bytes[1], - size_in_bytes_bytes[2], - size_in_bytes_bytes[3], - size_in_bytes_bytes[4], - size_in_bytes_bytes[5], - size_in_bytes_bytes[6], - size_in_bytes_bytes[7], - ]) - }) - .unwrap_or_else(|| { - warn!("Got a directory missing its size in bytes"); - 0 - }); - } - } else { - warn!("Corrupt database possessing a file_path entry without materialized_path"); - } - }); - - let to_sync_and_update = ancestors - .into_iter() - .filter_map(|ancestor_iso_file_path| { - if let Some((pub_id, size)) = pub_id_by_ancestor_materialized_path.remove( - &ancestor_iso_file_path - .materialized_path_for_children() - .expect("each ancestor is a directory"), - ) { - let size_bytes = size.to_be_bytes().to_vec(); - - Some(( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: pub_id.clone(), - }, - file_path::size_in_bytes_bytes::NAME, - msgpack!(size_bytes.clone()), - ), - db.file_path().update( - file_path::pub_id::equals(pub_id), - vec![file_path::size_in_bytes_bytes::set(Some(size_bytes))], - ), - )) - } else { - warn!("Got a missing ancestor for a file_path in the database, maybe we have a corruption"); - None - } - }) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - sync.write_ops(db, to_sync_and_update).await?; - - Ok(()) -} diff --git a/core/src/location/indexer/old_indexer_job.rs b/core/src/location/indexer/old_indexer_job.rs deleted file mode 100644 index 552a62867..000000000 --- a/core/src/location/indexer/old_indexer_job.rs +++ /dev/null @@ -1,660 +0,0 @@ -use crate::{ - file_paths_db_fetcher_fn, invalidate_query, - library::Library, - location::{location_with_indexer_rules, update_location_size, ScanState}, - old_job::{ - CurrentStep, JobError, JobInitOutput, JobReportUpdate, JobResult, JobRunMetadata, - JobStepOutput, StatefulJob, WorkerContext, - }, - to_remove_db_fetcher_fn, -}; - -use sd_core_file_path_helper::{ - ensure_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_indexer_rules::IndexerRule; - -use sd_prisma::{ - prisma::{file_path, location}, - prisma_sync, -}; -use sd_sync::*; -use sd_utils::{db::maybe_missing, from_bytes_to_uuid, msgpack}; - -use std::{ - collections::HashMap, - hash::{Hash, Hasher}, - path::{Path, PathBuf}, - sync::Arc, - time::Duration, -}; - -use itertools::Itertools; -use prisma_client_rust::operator::or; -use serde::{Deserialize, Serialize}; -use 
serde_json::json; -use tokio::time::Instant; -use tracing::{debug, info, warn}; - -use super::{ - execute_indexer_save_step, execute_indexer_update_step, iso_file_path_factory, - old_walk::{keep_walking, walk, ToWalkEntry, WalkResult}, - remove_non_existing_file_paths, reverse_update_directories_sizes, IndexerError, - OldIndexerJobSaveStep, OldIndexerJobUpdateStep, -}; - -/// BATCH_SIZE is the number of files to index at each step, writing the chunk of files metadata in the database. -const BATCH_SIZE: usize = 1000; - -/// `IndexerJobInit` receives a `location::Data` object to be indexed -/// and possibly a `sub_path` to be indexed. The `sub_path` is used when -/// we want do index just a part of a location. -#[derive(Serialize, Deserialize, Debug)] -pub struct OldIndexerJobInit { - pub location: location_with_indexer_rules::Data, - pub sub_path: Option, -} - -impl Hash for OldIndexerJobInit { - fn hash(&self, state: &mut H) { - self.location.id.hash(state); - if let Some(ref sub_path) = self.sub_path { - sub_path.hash(state); - } - } -} - -/// `IndexerJobData` contains the state of the indexer job, which includes a `location_path` that -/// is cached and casted on `PathBuf` from `local_path` column in the `location` table. It also -/// contains some metadata for logging purposes. -#[derive(Serialize, Deserialize, Debug)] -pub struct OldIndexerJobData { - location_path: PathBuf, - indexed_path: PathBuf, - indexer_rules: Vec, -} - -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct OldIndexerJobRunMetadata { - db_write_time: Duration, - scan_read_time: Duration, - total_paths: u64, - total_updated_paths: u64, - total_save_steps: u64, - total_update_steps: u64, - indexed_count: u64, - updated_count: u64, - removed_count: u64, - paths_and_sizes: HashMap, -} - -impl JobRunMetadata for OldIndexerJobRunMetadata { - fn update(&mut self, new_data: Self) { - self.db_write_time += new_data.db_write_time; - self.scan_read_time += new_data.scan_read_time; - self.total_paths += new_data.total_paths; - self.total_updated_paths += new_data.total_updated_paths; - self.total_save_steps += new_data.total_save_steps; - self.total_update_steps += new_data.total_update_steps; - self.indexed_count += new_data.indexed_count; - self.removed_count += new_data.removed_count; - - for (path, size) in new_data.paths_and_sizes { - *self.paths_and_sizes.entry(path).or_default() += size; - } - } -} - -#[derive(Clone)] -pub enum ScanProgress { - ChunkCount(usize), - SavedChunks(usize), - UpdatedChunks(usize), - Message(String), -} - -impl OldIndexerJobData { - fn on_scan_progress(ctx: &WorkerContext, progress: Vec) { - ctx.progress( - progress - .into_iter() - .map(|p| match p { - ScanProgress::ChunkCount(c) => JobReportUpdate::TaskCount(c), - ScanProgress::SavedChunks(p) | ScanProgress::UpdatedChunks(p) => { - JobReportUpdate::CompletedTaskCount(p) - } - ScanProgress::Message(m) => JobReportUpdate::Message(m), - }) - .collect(), - ) - } -} - -/// `IndexerJobStepInput` defines the action that should be executed in the current step -#[derive(Serialize, Deserialize, Debug)] -pub enum OldIndexerJobStepInput { - Save(OldIndexerJobSaveStep), - Walk(ToWalkEntry), - Update(OldIndexerJobUpdateStep), -} - -/// A `IndexerJob` is a stateful job that walks a directory and indexes all files. -/// First it walks the directory and generates a list of files to index, chunked into -/// batches of [`BATCH_SIZE`]. Then for each chunk it write the file metadata to the database. 
-#[async_trait::async_trait] -impl StatefulJob for OldIndexerJobInit { - type Data = OldIndexerJobData; - type Step = OldIndexerJobStepInput; - type RunMetadata = OldIndexerJobRunMetadata; - - const NAME: &'static str = "indexer"; - const IS_BATCHED: bool = true; - - fn target_location(&self) -> location::id::Type { - self.location.id - } - - /// Creates a vector of valid path buffers from a directory, chunked into batches of `BATCH_SIZE`. - async fn init( - &self, - ctx: &WorkerContext, - data: &mut Option, - ) -> Result, JobError> { - let init = self; - let location_id = init.location.id; - let location_path = maybe_missing(&init.location.path, "location.path").map(Path::new)?; - - let db = Arc::clone(&ctx.library.db); - let sync = &ctx.library.sync; - - let indexer_rules = init - .location - .indexer_rules - .iter() - .map(|rule| IndexerRule::try_from(&rule.indexer_rule)) - .collect::, _>>() - .map_err(IndexerError::from)?; - - let to_walk_path = match &init.sub_path { - Some(sub_path) if sub_path != Path::new("") => { - let full_path = ensure_sub_path_is_in_location(location_path, sub_path) - .await - .map_err(IndexerError::from)?; - ensure_sub_path_is_directory(location_path, sub_path) - .await - .map_err(IndexerError::from)?; - - ensure_file_path_exists( - sub_path, - &IsolatedFilePathData::new(location_id, location_path, &full_path, true) - .map_err(IndexerError::from)?, - &db, - IndexerError::SubPathNotFound, - ) - .await?; - - full_path - } - _ => location_path.to_path_buf(), - }; - - let scan_start = Instant::now(); - let WalkResult { - walked, - to_update, - to_walk, - to_remove, - errors, - paths_and_sizes, - } = walk( - &location_path, - &to_walk_path, - &indexer_rules, - update_notifier_fn(ctx), - file_paths_db_fetcher_fn!(&db), - to_remove_db_fetcher_fn!(location_id, &db), - iso_file_path_factory(location_id, location_path), - 50_000, - ) - .await?; - let scan_read_time = scan_start.elapsed(); - let to_remove = to_remove.collect::>(); - - debug!( - "Walker at indexer job found {} file_paths to be removed", - to_remove.len() - ); - - ctx.node - .thumbnailer - .remove_indexed_cas_ids( - to_remove - .iter() - .filter_map(|file_path| file_path.cas_id.clone()) - .collect::>(), - ctx.library.id, - ) - .await; - - let db_delete_start = Instant::now(); - // TODO pass these uuids to sync system - let removed_count = remove_non_existing_file_paths(to_remove, &db, sync).await?; - let db_delete_time = db_delete_start.elapsed(); - - let total_new_paths = &mut 0; - let total_updated_paths = &mut 0; - let to_walk_count = to_walk.len(); - let to_save_chunks = &mut 0; - let to_update_chunks = &mut 0; - - let steps = walked - .chunks(BATCH_SIZE) - .into_iter() - .enumerate() - .map(|(i, chunk)| { - let chunk_steps = chunk.collect::>(); - - *total_new_paths += chunk_steps.len() as u64; - *to_save_chunks += 1; - - OldIndexerJobStepInput::Save(OldIndexerJobSaveStep { - chunk_idx: i, - walked: chunk_steps, - }) - }) - .chain( - to_update - .chunks(BATCH_SIZE) - .into_iter() - .enumerate() - .map(|(i, chunk)| { - let chunk_updates = chunk.collect::>(); - - *total_updated_paths += chunk_updates.len() as u64; - *to_update_chunks += 1; - - OldIndexerJobStepInput::Update(OldIndexerJobUpdateStep { - chunk_idx: i, - to_update: chunk_updates, - }) - }), - ) - .chain(to_walk.into_iter().map(OldIndexerJobStepInput::Walk)) - .collect::>(); - - debug!("Walker at indexer job found {total_updated_paths} file_paths to be updated"); - - OldIndexerJobData::on_scan_progress( - ctx, - vec![ - 
ScanProgress::ChunkCount(*to_save_chunks + *to_update_chunks), - ScanProgress::Message(format!( - "Starting saving {total_new_paths} files or directories, \ - {total_updated_paths} files or directories to update, \ - there still {to_walk_count} directories to index", - )), - ], - ); - - *data = Some(OldIndexerJobData { - location_path: location_path.to_path_buf(), - indexed_path: to_walk_path, - indexer_rules, - }); - - Ok(( - OldIndexerJobRunMetadata { - db_write_time: db_delete_time, - scan_read_time, - total_paths: *total_new_paths, - total_updated_paths: *total_updated_paths, - indexed_count: 0, - updated_count: 0, - removed_count, - total_save_steps: *to_save_chunks as u64, - total_update_steps: *to_update_chunks as u64, - paths_and_sizes, - }, - steps, - errors - .into_iter() - .map(|e| format!("{e}")) - .collect::>() - .into(), - ) - .into()) - } - - /// Process each chunk of entries in the indexer job, writing to the `file_path` table - async fn execute_step( - &self, - ctx: &WorkerContext, - CurrentStep { step, .. }: CurrentStep<'_, Self::Step>, - data: &Self::Data, - run_metadata: &Self::RunMetadata, - ) -> Result, JobError> { - let init = self; - let mut new_metadata = Self::RunMetadata::default(); - match step { - OldIndexerJobStepInput::Save(step) => { - let start_time = Instant::now(); - - OldIndexerJobData::on_scan_progress( - ctx, - vec![ - ScanProgress::SavedChunks(step.chunk_idx + 1), - ScanProgress::Message(format!( - "Writing chunk {} of {} to database", - step.chunk_idx, run_metadata.total_save_steps - )), - ], - ); - - let count = execute_indexer_save_step(&init.location, step, &ctx.library).await?; - - new_metadata.indexed_count = count as u64; - new_metadata.db_write_time = start_time.elapsed(); - - Ok(new_metadata.into()) - } - OldIndexerJobStepInput::Update(to_update) => { - let start_time = Instant::now(); - OldIndexerJobData::on_scan_progress( - ctx, - vec![ - ScanProgress::UpdatedChunks(to_update.chunk_idx + 1), - ScanProgress::Message(format!( - "Updating chunk {} of {} to database", - to_update.chunk_idx, run_metadata.total_save_steps - )), - ], - ); - - let count = execute_indexer_update_step(to_update, &ctx.library).await?; - - new_metadata.updated_count = count as u64; - new_metadata.db_write_time = start_time.elapsed(); - - Ok(new_metadata.into()) - } - - OldIndexerJobStepInput::Walk(to_walk_entry) => { - let location_id = init.location.id; - let location_path = - maybe_missing(&init.location.path, "location.path").map(Path::new)?; - - let db = Arc::clone(&ctx.library.db); - let sync = &ctx.library.sync; - - let scan_start = Instant::now(); - - let WalkResult { - walked, - to_update, - to_walk, - to_remove, - errors, - paths_and_sizes, - } = keep_walking( - location_path, - to_walk_entry, - &data.indexer_rules, - update_notifier_fn(ctx), - file_paths_db_fetcher_fn!(&db), - to_remove_db_fetcher_fn!(location_id, &db), - iso_file_path_factory(location_id, location_path), - ) - .await?; - - new_metadata.paths_and_sizes = paths_and_sizes; - - new_metadata.scan_read_time = scan_start.elapsed(); - - let db_delete_time = Instant::now(); - // TODO pass these uuids to sync system - new_metadata.removed_count = - remove_non_existing_file_paths(to_remove, &db, sync).await?; - new_metadata.db_write_time = db_delete_time.elapsed(); - - let to_walk_count = to_walk.len(); - - let more_steps = walked - .chunks(BATCH_SIZE) - .into_iter() - .enumerate() - .map(|(i, chunk)| { - let chunk_steps = chunk.collect::>(); - new_metadata.total_paths += chunk_steps.len() as u64; - 
new_metadata.total_save_steps += 1; - - OldIndexerJobStepInput::Save(OldIndexerJobSaveStep { - chunk_idx: i, - walked: chunk_steps, - }) - }) - .chain(to_update.chunks(BATCH_SIZE).into_iter().enumerate().map( - |(i, chunk)| { - let chunk_updates = chunk.collect::>(); - new_metadata.total_updated_paths += chunk_updates.len() as u64; - new_metadata.total_update_steps += 1; - - OldIndexerJobStepInput::Update(OldIndexerJobUpdateStep { - chunk_idx: i, - to_update: chunk_updates, - }) - }, - )) - .chain(to_walk.into_iter().map(OldIndexerJobStepInput::Walk)) - .collect::>(); - - OldIndexerJobData::on_scan_progress( - ctx, - vec![ - ScanProgress::ChunkCount(more_steps.len() - to_walk_count), - ScanProgress::Message(format!( - "Scanned {} more files or directories; \ - {} more directories to scan and {} more entries to update", - new_metadata.total_paths, - to_walk_count, - new_metadata.total_updated_paths - )), - ], - ); - - Ok(( - more_steps, - new_metadata, - errors - .into_iter() - .map(|e| format!("{e}")) - .collect::>() - .into(), - ) - .into()) - } - } - } - - async fn finalize( - &self, - ctx: &WorkerContext, - data: &Option, - run_metadata: &Self::RunMetadata, - ) -> JobResult { - let init = self; - let indexed_path_str = data - .as_ref() - .map(|data| Ok(data.indexed_path.to_string_lossy().to_string())) - .unwrap_or_else(|| maybe_missing(&init.location.path, "location.path").cloned())?; - - info!( - "Scan of {indexed_path_str} completed in {:?}. {} new files found, \ - indexed {} files in db, updated {} entries. db write completed in {:?}", - run_metadata.scan_read_time, - run_metadata.total_paths, - run_metadata.indexed_count, - run_metadata.total_updated_paths, - run_metadata.db_write_time, - ); - - if run_metadata.indexed_count > 0 || run_metadata.removed_count > 0 { - invalidate_query!(ctx.library, "search.paths"); - } - - if run_metadata.total_updated_paths > 0 { - // Invoking orphan remover here as we probably have some orphans objects due to updates - // ctx.library.orphan_remover.invoke().await; - } - - if run_metadata.indexed_count > 0 - || run_metadata.removed_count > 0 - || run_metadata.updated_count > 0 - { - if let Some(data) = data { - update_directories_sizes( - &run_metadata.paths_and_sizes, - init.location.id, - &data.indexed_path, - &ctx.library, - ) - .await?; - - if data.indexed_path != data.location_path { - reverse_update_directories_sizes( - &data.indexed_path, - init.location.id, - &data.location_path, - &ctx.library, - ) - .await - .map_err(IndexerError::from)?; - } - - update_location_size(init.location.id, &ctx.library) - .await - .map_err(IndexerError::from)?; - - ctx.library - .db - .location() - .update( - location::id::equals(init.location.id), - vec![location::scan_state::set(ScanState::Indexed as i32)], - ) - .exec() - .await - .map_err(IndexerError::from)?; - } - } - - // FIXME(fogodev): This is currently a workaround to don't save paths and sizes in the - // metadata after a job is completed, as it's pretty heavy. A proper fix isn't needed - // right now as I already changed it in the new indexer job. And this old one - // will be removed eventually. 
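Throughout this deleted module, directory sizes travel as 8-byte big-endian vectors in the `size_in_bytes_bytes` column and are rebuilt with `u64::from_be_bytes`. A hedged, standalone sketch of that round-trip follows; `encode_size` and `decode_size` are hypothetical helper names, while the deleted code inlines the same conversions.

```rust
// Directory sizes are persisted as an 8-byte big-endian vector and decoded
// back into a u64; anything that is not exactly 8 bytes is treated as missing,
// mirroring the warnings logged by the deleted job.
fn encode_size(size: u64) -> Vec<u8> {
    size.to_be_bytes().to_vec()
}

fn decode_size(bytes: &[u8]) -> Option<u64> {
    <[u8; 8]>::try_from(bytes).ok().map(u64::from_be_bytes)
}

fn main() {
    let size = 123_456_789_u64;
    let stored = encode_size(size);
    assert_eq!(stored.len(), 8);
    assert_eq!(decode_size(&stored), Some(size));
    assert_eq!(decode_size(&[0, 1, 2]), None);
}
```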
- let run_metadata = Self::RunMetadata { - db_write_time: run_metadata.db_write_time, - scan_read_time: run_metadata.scan_read_time, - total_paths: run_metadata.total_paths, - total_updated_paths: run_metadata.total_updated_paths, - total_save_steps: run_metadata.total_save_steps, - total_update_steps: run_metadata.total_update_steps, - indexed_count: run_metadata.indexed_count, - updated_count: run_metadata.updated_count, - removed_count: run_metadata.removed_count, - paths_and_sizes: HashMap::new(), - }; - - Ok(Some(json!({"init: ": init, "run_metadata": run_metadata}))) - } -} - -fn update_notifier_fn(ctx: &WorkerContext) -> impl FnMut(&Path, usize) + '_ { - move |path, total_entries| { - OldIndexerJobData::on_scan_progress( - ctx, - vec![ScanProgress::Message(format!( - "{total_entries} entries found at {}", - path.display() - ))], - ); - } -} - -async fn update_directories_sizes( - paths_and_sizes: &HashMap, - location_id: location::id::Type, - location_path: impl AsRef, - library: &Library, -) -> Result<(), IndexerError> { - let location_path = location_path.as_ref(); - - let Library { db, sync, .. } = library; - - let chunked_queries = paths_and_sizes - .keys() - .chunks(200) - .into_iter() - .map(|paths_chunk| { - paths_chunk - .into_iter() - .map(|path| { - IsolatedFilePathData::new(location_id, location_path, path, true) - .map(file_path::WhereParam::from) - }) - .collect::, _>>() - .map(|params| { - db.file_path() - .find_many(vec![or(params)]) - .select(file_path::select!({ pub_id materialized_path name })) - }) - }) - .collect::, _>>()?; - - let to_sync_and_update = db - ._batch(chunked_queries) - .await? - .into_iter() - .flatten() - .filter_map( - |file_path| match (file_path.materialized_path, file_path.name) { - (Some(materialized_path), Some(name)) => { - let mut directory_full_path = location_path.join(&materialized_path[1..]); - directory_full_path.push(name); - - if let Some(size) = paths_and_sizes.get(&directory_full_path) { - let size_bytes = size.to_be_bytes().to_vec(); - - Some(( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: file_path.pub_id.clone(), - }, - file_path::size_in_bytes_bytes::NAME, - msgpack!(size_bytes.clone()), - ), - db.file_path().update( - file_path::pub_id::equals(file_path.pub_id), - vec![file_path::size_in_bytes_bytes::set(Some(size_bytes))], - ), - )) - } else { - warn!("Found a file_path without ancestor in the database, possible corruption"); - None - } - } - _ => { - warn!( - "Found a file_path missing its materialized_path or name: ", - from_bytes_to_uuid(&file_path.pub_id) - ); - None - } - }, - ) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - sync.write_ops(db, to_sync_and_update).await?; - - Ok(()) -} diff --git a/core/src/location/indexer/old_shallow.rs b/core/src/location/indexer/old_shallow.rs deleted file mode 100644 index d7857bd10..000000000 --- a/core/src/location/indexer/old_shallow.rs +++ /dev/null @@ -1,197 +0,0 @@ -use crate::{ - file_paths_db_fetcher_fn, invalidate_query, - library::Library, - location::{ - indexer::{ - execute_indexer_update_step, reverse_update_directories_sizes, OldIndexerJobUpdateStep, - }, - scan_location_sub_path, update_location_size, - }, - old_job::JobError, - to_remove_db_fetcher_fn, Node, -}; - -use sd_core_file_path_helper::{ - check_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_indexer_rules::IndexerRule; - -use sd_utils::db::maybe_missing; - -use std::{ - collections::HashSet, - path::{Path, PathBuf}, - 
sync::Arc, -}; - -use futures::future::join_all; -use itertools::Itertools; -use tracing::{debug, error}; - -use super::{ - execute_indexer_save_step, iso_file_path_factory, location_with_indexer_rules, - old_walk::walk_single_dir, remove_non_existing_file_paths, IndexerError, OldIndexerJobSaveStep, -}; - -/// BATCH_SIZE is the number of files to index at each step, writing the chunk of files metadata in the database. -const BATCH_SIZE: usize = 1000; - -pub async fn old_shallow( - location: &location_with_indexer_rules::Data, - sub_path: &PathBuf, - node: &Arc, - library: &Arc, -) -> Result<(), JobError> { - let location_id = location.id; - let location_path = maybe_missing(&location.path, "location.path").map(Path::new)?; - - let db = library.db.clone(); - let sync = &library.sync; - - let indexer_rules = location - .indexer_rules - .iter() - .map(|rule| IndexerRule::try_from(&rule.indexer_rule)) - .collect::, _>>() - .map_err(IndexerError::from)?; - - let (add_root, to_walk_path) = if sub_path != Path::new("") && sub_path != Path::new("/") { - let full_path = ensure_sub_path_is_in_location(&location_path, &sub_path) - .await - .map_err(IndexerError::from)?; - ensure_sub_path_is_directory(&location_path, &sub_path) - .await - .map_err(IndexerError::from)?; - - ( - !check_file_path_exists::( - &IsolatedFilePathData::new(location_id, location_path, &full_path, true) - .map_err(IndexerError::from)?, - &db, - ) - .await?, - full_path, - ) - } else { - (false, location_path.to_path_buf()) - }; - - let (walked, to_update, to_remove, errors, _s) = { - walk_single_dir( - location_path, - &to_walk_path, - &indexer_rules, - file_paths_db_fetcher_fn!(&db), - to_remove_db_fetcher_fn!(location_id, &db), - iso_file_path_factory(location_id, location_path), - add_root, - ) - .await? 
- }; - - let to_remove_count = to_remove.len(); - - node.thumbnailer - .remove_indexed_cas_ids( - to_remove - .iter() - .filter_map(|file_path| file_path.cas_id.clone()) - .collect::>(), - library.id, - ) - .await; - - errors.into_iter().for_each(|e| error!("{e}")); - - remove_non_existing_file_paths(to_remove, &db, sync).await?; - - let mut new_directories_to_scan = HashSet::new(); - - let mut to_create_count = 0; - - let save_steps = walked - .chunks(BATCH_SIZE) - .into_iter() - .enumerate() - .map(|(i, chunk)| { - let walked = chunk.collect::>(); - to_create_count += walked.len(); - - walked - .iter() - .filter_map(|walked_entry| { - walked_entry.iso_file_path.materialized_path_for_children() - }) - .for_each(|new_dir| { - new_directories_to_scan.insert(new_dir); - }); - - OldIndexerJobSaveStep { - chunk_idx: i, - walked, - } - }) - .collect::>(); - - for step in save_steps { - execute_indexer_save_step(location, &step, library).await?; - } - - for scan in join_all( - new_directories_to_scan - .into_iter() - .map(|sub_path| scan_location_sub_path(node, library, location.clone(), sub_path)), - ) - .await - { - if let Err(e) = scan { - error!("{e}"); - } - } - - let mut to_update_count = 0; - - let update_steps = to_update - .chunks(BATCH_SIZE) - .into_iter() - .enumerate() - .map(|(i, chunk)| { - let to_update = chunk.collect::>(); - to_update_count += to_update.len(); - - OldIndexerJobUpdateStep { - chunk_idx: i, - to_update, - } - }) - .collect::>(); - - for step in update_steps { - execute_indexer_update_step(&step, library).await?; - } - - debug!( - "Walker at shallow indexer found: \ - To create: {to_create_count}; To update: {to_update_count}; To remove: {to_remove_count};" - ); - - if to_create_count > 0 || to_update_count > 0 || to_remove_count > 0 { - if to_walk_path != location_path { - reverse_update_directories_sizes(to_walk_path, location_id, location_path, library) - .await - .map_err(IndexerError::from)?; - } - - update_location_size(location.id, library) - .await - .map_err(IndexerError::from)?; - - invalidate_query!(library, "search.paths"); - invalidate_query!(library, "search.objects"); - } - - // library.orphan_remover.invoke().await; - - Ok(()) -} diff --git a/core/src/location/indexer/old_walk.rs b/core/src/location/indexer/old_walk.rs deleted file mode 100644 index 1cc73dfd5..000000000 --- a/core/src/location/indexer/old_walk.rs +++ /dev/null @@ -1,1120 +0,0 @@ -use sd_core_file_path_helper::{FilePathMetadata, IsolatedFilePathData}; -use sd_core_indexer_rules::{ - seed::{GitIgnoreRules, GITIGNORE}, - IndexerRule, RuleKind, -}; -use sd_core_prisma_helpers::{file_path_pub_and_cas_ids, file_path_walker}; - -use sd_prisma::prisma::file_path; -use sd_utils::{db::inode_from_db, error::FileIOError}; - -use std::{ - collections::{HashMap, HashSet, VecDeque}, - future::Future, - hash::{Hash, Hasher}, - ops::Deref, - path::{Path, PathBuf}, -}; - -use chrono::{DateTime, Duration, FixedOffset}; -use serde::{Deserialize, Serialize}; -use tokio::fs; -use tracing::trace; -use uuid::Uuid; - -use super::IndexerError; - -const TO_WALK_QUEUE_INITIAL_CAPACITY: usize = 32; -const WALKER_PATHS_BUFFER_INITIAL_CAPACITY: usize = 256; -const WALK_SINGLE_DIR_PATHS_BUFFER_INITIAL_CAPACITY: usize = 32; - -/// `WalkEntry` represents a single path in the filesystem, for any comparison purposes, we only -/// consider the path itself, not the metadata. 
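The "compare by path only" convention stated in the doc comment above is what lets the walker deduplicate entries in a `HashSet` even when two sightings of the same path carry different metadata. A small illustrative sketch of that behavior, using a stand-in `Entry` type rather than the crate's walking-entry types:

```rust
// Equality and hashing ignore metadata, so a HashSet keeps a single entry per
// path; the first-inserted metadata wins on duplicate insertion.
use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;

struct Entry {
    path: PathBuf,
    size_in_bytes: u64,
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.path == other.path
    }
}

impl Eq for Entry {}

impl Hash for Entry {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.path.hash(state);
    }
}

fn main() {
    let mut seen = HashSet::new();
    seen.insert(Entry { path: "a/b.txt".into(), size_in_bytes: 10 });
    // Same path, different metadata: not inserted again.
    assert!(!seen.insert(Entry { path: "a/b.txt".into(), size_in_bytes: 99 }));
    assert_eq!(seen.len(), 1);
    assert_eq!(seen.iter().next().map(|e| e.size_in_bytes), Some(10));
}
```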
-#[derive(Debug, Serialize, Deserialize)] -pub struct WalkedEntry { - pub pub_id: Uuid, - pub maybe_object_id: file_path::object_id::Type, - pub iso_file_path: IsolatedFilePathData<'static>, - pub metadata: FilePathMetadata, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct ToWalkEntry { - path: PathBuf, - parent_dir_accepted_by_its_children: Option, - maybe_parent: Option, -} - -#[derive(Debug)] -struct WalkingEntry { - iso_file_path: IsolatedFilePathData<'static>, - maybe_metadata: Option, -} - -impl From for WalkedEntry { - fn from(walking_entry: WalkingEntry) -> Self { - let WalkingEntry { - iso_file_path, - maybe_metadata, - } = walking_entry; - - Self { - pub_id: Uuid::new_v4(), - maybe_object_id: None, - iso_file_path, - metadata: maybe_metadata - .expect("we always use Some in `the inner_walk_single_dir` function"), - } - } -} - -impl From<(Uuid, file_path::object_id::Type, WalkingEntry)> for WalkedEntry { - fn from( - (pub_id, maybe_object_id, walking_entry): (Uuid, file_path::object_id::Type, WalkingEntry), - ) -> Self { - let WalkingEntry { - iso_file_path, - maybe_metadata, - } = walking_entry; - - Self { - pub_id, - maybe_object_id, - iso_file_path, - metadata: maybe_metadata - .expect("we always use Some in `the inner_walk_single_dir` function"), - } - } -} - -impl PartialEq for WalkingEntry { - fn eq(&self, other: &Self) -> bool { - self.iso_file_path == other.iso_file_path - } -} - -impl Eq for WalkingEntry {} - -impl Hash for WalkingEntry { - fn hash(&self, state: &mut H) { - self.iso_file_path.hash(state); - } -} - -pub struct WalkResult -where - Walked: Iterator, - ToUpdate: Iterator, - ToRemove: Iterator, -{ - pub walked: Walked, - pub to_update: ToUpdate, - pub to_walk: VecDeque, - pub to_remove: ToRemove, - pub errors: Vec, - pub paths_and_sizes: HashMap, -} - -/// This function walks through the filesystem, applying the rules to each entry and then returning -/// a list of accepted entries. There are some useful comments in the implementation of this function -/// in case of doubts. 
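As a rough mental model of the walk defined below, the strategy is a queue of directories to visit, a rule check per entry, and a per-directory size tally. The sketch that follows is deliberately simplified and synchronous (std `fs` instead of `tokio::fs`, a plain predicate instead of indexer rules, no update/remove tracking), so treat it as an illustration rather than the actual implementation:

```rust
// Breadth-first walk: pop a directory, filter its children through a
// predicate, enqueue accepted subdirectories, and attribute file sizes to
// their containing directory.
use std::collections::{HashMap, VecDeque};
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

fn walk_accepted(
    root: &Path,
    accept: impl Fn(&Path) -> bool,
) -> io::Result<(Vec<PathBuf>, HashMap<PathBuf, u64>)> {
    let mut to_walk = VecDeque::from([root.to_path_buf()]);
    let mut accepted = Vec::new();
    let mut dir_sizes: HashMap<PathBuf, u64> = HashMap::new();

    while let Some(dir) = to_walk.pop_front() {
        for entry in fs::read_dir(&dir)? {
            let entry = entry?;
            let path = entry.path();
            if !accept(&path) {
                continue; // rejected by the "rules"
            }
            let metadata = entry.metadata()?;
            if metadata.is_dir() {
                to_walk.push_back(path.clone()); // visit later, breadth-first
            } else {
                *dir_sizes.entry(dir.clone()).or_default() += metadata.len();
            }
            accepted.push(path);
        }
    }

    Ok((accepted, dir_sizes))
}

fn main() -> io::Result<()> {
    // Example: accept everything except anything under a ".git" directory.
    let (accepted, dir_sizes) = walk_accepted(Path::new("."), |p| {
        !p.components().any(|c| c.as_os_str() == ".git")
    })?;
    println!(
        "{} entries accepted across {} directories",
        accepted.len(),
        dir_sizes.len()
    );
    Ok(())
}
```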
-pub(super) async fn walk( - library_root: impl AsRef, - current_dir: impl AsRef, - indexer_rules: &[IndexerRule], - mut update_notifier: impl FnMut(&Path, usize), - file_paths_db_fetcher: impl Fn(Vec) -> FilePathDBFetcherFut, - to_remove_db_fetcher: impl Fn( - IsolatedFilePathData<'static>, - Vec, - ) -> ToRemoveDbFetcherFut, - iso_file_path_factory: impl Fn(&Path, bool) -> Result, IndexerError>, - limit: u64, -) -> Result< - WalkResult< - impl Iterator, - impl Iterator, - impl Iterator, - >, - IndexerError, -> -where - FilePathDBFetcherFut: Future, IndexerError>>, - ToRemoveDbFetcherFut: - Future, IndexerError>>, -{ - let current_dir = current_dir.as_ref(); - - let mut to_walk = VecDeque::with_capacity(TO_WALK_QUEUE_INITIAL_CAPACITY); - to_walk.push_back(ToWalkEntry { - path: current_dir.to_path_buf(), - parent_dir_accepted_by_its_children: None, - maybe_parent: None, - }); - let mut indexed_paths = HashSet::with_capacity(WALKER_PATHS_BUFFER_INITIAL_CAPACITY); - let mut errors = vec![]; - let mut paths_buffer = HashSet::with_capacity(WALKER_PATHS_BUFFER_INITIAL_CAPACITY); - let mut paths_and_sizes = HashMap::with_capacity(TO_WALK_QUEUE_INITIAL_CAPACITY); - let mut to_remove = vec![]; - - while let Some(entry) = to_walk.pop_front() { - let last_indexed_count = indexed_paths.len(); - - let (entry_size, current_to_remove) = inner_walk_single_dir( - library_root.as_ref(), - current_dir, - &entry, - indexer_rules, - &to_remove_db_fetcher, - &iso_file_path_factory, - WorkingTable { - indexed_paths: &mut indexed_paths, - paths_buffer: &mut paths_buffer, - maybe_to_walk: Some(&mut to_walk), - errors: &mut errors, - }, - ) - .await; - to_remove.push(current_to_remove); - - update_notifier(&entry.path, indexed_paths.len() - last_indexed_count); - - // Saving the size of current entry - paths_and_sizes.insert(entry.path, entry_size); - - // Adding the size of current entry to its parent - if let Some(parent) = entry.maybe_parent { - *paths_and_sizes.entry(parent).or_default() += entry_size; - } - - if indexed_paths.len() >= limit as usize { - break; - } - } - - let (walked, to_update) = filter_existing_paths(indexed_paths, file_paths_db_fetcher).await?; - - Ok(WalkResult { - walked, - to_update, - to_walk, - to_remove: to_remove.into_iter().flatten(), - errors, - paths_and_sizes, - }) -} - -pub(super) async fn keep_walking( - location_path: impl AsRef, - to_walk_entry: &ToWalkEntry, - indexer_rules: &[IndexerRule], - mut update_notifier: impl FnMut(&Path, usize), - file_paths_db_fetcher: impl Fn(Vec) -> FilePathDBFetcherFut, - to_remove_db_fetcher: impl Fn( - IsolatedFilePathData<'static>, - Vec, - ) -> ToRemoveDbFetcherFut, - iso_file_path_factory: impl Fn(&Path, bool) -> Result, IndexerError>, -) -> Result< - WalkResult< - impl Iterator, - impl Iterator, - impl Iterator, - >, - IndexerError, -> -where - FilePathDBFetcherFut: Future, IndexerError>>, - ToRemoveDbFetcherFut: - Future, IndexerError>>, -{ - let mut to_keep_walking = VecDeque::with_capacity(TO_WALK_QUEUE_INITIAL_CAPACITY); - let mut indexed_paths = HashSet::with_capacity(WALK_SINGLE_DIR_PATHS_BUFFER_INITIAL_CAPACITY); - let mut paths_buffer = HashSet::with_capacity(WALK_SINGLE_DIR_PATHS_BUFFER_INITIAL_CAPACITY); - let mut errors = vec![]; - - let (to_walk_entry_size, to_remove) = inner_walk_single_dir( - location_path, - to_walk_entry.path.clone(), - to_walk_entry, - indexer_rules, - &to_remove_db_fetcher, - &iso_file_path_factory, - WorkingTable { - indexed_paths: &mut indexed_paths, - paths_buffer: &mut paths_buffer, - maybe_to_walk: 
Some(&mut to_keep_walking), - errors: &mut errors, - }, - ) - .await; - - update_notifier(&to_walk_entry.path, indexed_paths.len()); - - let (walked, to_update) = filter_existing_paths(indexed_paths, file_paths_db_fetcher).await?; - - Ok(WalkResult { - walked, - to_update, - to_walk: to_keep_walking, - to_remove: to_remove.into_iter(), - errors, - paths_and_sizes: [ - Some((to_walk_entry.path.clone(), to_walk_entry_size)), - to_walk_entry - .maybe_parent - .as_ref() - .map(|parent_path| (parent_path.clone(), to_walk_entry_size)), - ] - .into_iter() - .flatten() - .collect(), - }) -} - -pub(super) async fn walk_single_dir( - location_path: impl AsRef, - current_dir: impl AsRef, - indexer_rules: &[IndexerRule], - file_paths_db_fetcher: impl Fn(Vec) -> FilePathDBFetcherFut, - to_remove_db_fetcher: impl Fn( - IsolatedFilePathData<'static>, - Vec, - ) -> ToRemoveDbFetcherFut, - iso_file_path_factory: impl Fn(&Path, bool) -> Result, IndexerError>, - add_root: bool, -) -> Result< - ( - impl Iterator, - impl Iterator, - Vec, - Vec, - u64, - ), - IndexerError, -> -where - FilePathDBFetcherFut: Future, IndexerError>>, - ToRemoveDbFetcherFut: - Future, IndexerError>>, -{ - let current_directory = current_dir.as_ref(); - - let mut indexed_paths = HashSet::with_capacity(WALK_SINGLE_DIR_PATHS_BUFFER_INITIAL_CAPACITY); - - if add_root { - let metadata = fs::metadata(current_directory) - .await - .map_err(|e| FileIOError::from((current_directory, e)))?; - - indexed_paths.insert(WalkingEntry { - iso_file_path: iso_file_path_factory(current_directory, true)?, - maybe_metadata: Some(FilePathMetadata::from_path(current_directory, &metadata)?), - }); - } - - let mut paths_buffer = HashSet::with_capacity(WALK_SINGLE_DIR_PATHS_BUFFER_INITIAL_CAPACITY); - let mut errors = vec![]; - - let (root_size, to_remove) = inner_walk_single_dir( - location_path, - current_directory, - &ToWalkEntry { - path: current_directory.to_path_buf(), - parent_dir_accepted_by_its_children: None, - maybe_parent: None, - }, - indexer_rules, - &to_remove_db_fetcher, - &iso_file_path_factory, - WorkingTable { - indexed_paths: &mut indexed_paths, - paths_buffer: &mut paths_buffer, - maybe_to_walk: None, - errors: &mut errors, - }, - ) - .await; - - let (walked, to_update) = filter_existing_paths(indexed_paths, file_paths_db_fetcher).await?; - - Ok((walked, to_update, to_remove, errors, root_size)) -} - -async fn filter_existing_paths( - indexed_paths: HashSet, - file_paths_db_fetcher: impl Fn(Vec) -> F, -) -> Result< - ( - impl Iterator, - impl Iterator, - ), - IndexerError, -> -where - F: Future, IndexerError>>, -{ - if !indexed_paths.is_empty() { - file_paths_db_fetcher( - indexed_paths - .iter() - .map(|entry| &entry.iso_file_path) - .map(Into::into) - .collect(), - ) - .await - } else { - Ok(vec![]) - } - .map(move |file_paths| { - let isolated_paths_already_in_db = file_paths - .into_iter() - .flat_map(|file_path| { - IsolatedFilePathData::try_from(file_path.clone()) - .map(|iso_file_path| (iso_file_path, file_path)) - }) - .collect::>(); - - let mut to_update = vec![]; - - let to_create = indexed_paths - .into_iter() - .filter_map(|entry| { - if let Some(file_path) = isolated_paths_already_in_db.get(&entry.iso_file_path) { - if let (Some(metadata), Some(inode), Some(date_modified)) = ( - &entry.maybe_metadata, - &file_path.inode, - &file_path.date_modified, - ) { - if ( - inode_from_db(&inode[0..8]) != metadata.inode - // Datetimes stored in DB loses a bit of precision, so we need to check against a delta - // instead of using != 
operator - || DateTime::::from(metadata.modified_at) - *date_modified - > Duration::milliseconds(1) || file_path.hidden.is_none() || metadata.hidden != file_path.hidden.unwrap_or_default() - ) - // We ignore the size of directories because it is not reliable, we need to - // calculate it ourselves later - && !( - entry.iso_file_path.to_parts().is_dir - && metadata.size_in_bytes - != file_path - .size_in_bytes_bytes - .as_ref() - .map(|size_in_bytes_bytes| { - u64::from_be_bytes([ - size_in_bytes_bytes[0], - size_in_bytes_bytes[1], - size_in_bytes_bytes[2], - size_in_bytes_bytes[3], - size_in_bytes_bytes[4], - size_in_bytes_bytes[5], - size_in_bytes_bytes[6], - size_in_bytes_bytes[7], - ]) - }) - .unwrap_or_default() - ) { - to_update.push( - (sd_utils::from_bytes_to_uuid(&file_path.pub_id), file_path.object_id, entry).into(), - ); - } - } - - None - } else { - Some(entry.into()) - } - }) - .collect::>(); - - (to_create.into_iter(), to_update.into_iter()) - }) -} - -struct WorkingTable<'a> { - indexed_paths: &'a mut HashSet, - paths_buffer: &'a mut HashSet, - maybe_to_walk: Option<&'a mut VecDeque>, - errors: &'a mut Vec, -} - -async fn inner_walk_single_dir( - library_root: impl AsRef, - current_dir: impl AsRef, - ToWalkEntry { - path, - parent_dir_accepted_by_its_children, - .. - }: &ToWalkEntry, - indexer_rules: &[IndexerRule], - to_remove_db_fetcher: impl Fn( - IsolatedFilePathData<'static>, - Vec, - ) -> ToRemoveDbFetcherFut, - iso_file_path_factory: &impl Fn(&Path, bool) -> Result, IndexerError>, - WorkingTable { - indexed_paths, - paths_buffer, - mut maybe_to_walk, - errors, - }: WorkingTable<'_>, -) -> (u64, Vec) -where - ToRemoveDbFetcherFut: - Future, IndexerError>>, -{ - let Ok(iso_file_path_to_walk) = iso_file_path_factory(path, true).map_err(|e| errors.push(e)) - else { - return (0, vec![]); - }; - - let Ok(mut read_dir) = fs::read_dir(path) - .await - .map_err(|e| errors.push(FileIOError::from((path.clone(), e)).into())) - else { - return (0, vec![]); - }; - - let mut rules = indexer_rules.to_owned(); - - if rules.iter().any(|rule| GITIGNORE.deref() == rule) { - if let Some(pat) = - GitIgnoreRules::get_rules_if_in_git_repo(library_root.as_ref(), path).await - { - rules.extend(pat.into_iter().map(Into::into)); - } - } - - let current_dir = current_dir.as_ref(); - - // Just to make sure... 
- paths_buffer.clear(); - - // Marking with a loop label here in case of rejection or errors, to continue with next entry - 'entries: loop { - let entry = match read_dir.next_entry().await { - Ok(Some(entry)) => entry, - Ok(None) => break, - Err(e) => { - errors.push(FileIOError::from((path.clone(), e)).into()); - continue; - } - }; - - // Accept by children has three states, - // None if we don't now yet or if this check doesn't apply - // Some(true) if this check applies and it passes - // Some(false) if this check applies and it was rejected - // and we pass the current parent state to its children - let mut accept_by_children_dir = *parent_dir_accepted_by_its_children; - - let current_path = entry.path(); - - trace!( - "Current filesystem path: {}, accept_by_children_dir: {:#?}", - current_path.display(), - accept_by_children_dir - ); - - let Ok(rules_per_kind) = IndexerRule::apply_all(&rules, ¤t_path) - .await - .map_err(|e| errors.push(e.into())) - else { - continue 'entries; - }; - - if rules_per_kind - .get(&RuleKind::RejectFilesByGlob) - .map_or(false, |reject_results| { - reject_results.iter().any(|reject| !reject) - }) { - trace!( - "Path {} rejected by `RuleKind::RejectFilesByGlob`", - current_path.display() - ); - continue 'entries; - } - - if let Some(f) = rules_per_kind.get(&RuleKind::IgnoredByGit) { - if f.iter().any(|s| !s) { - continue 'entries; - } - } - - let Ok(metadata) = entry - .metadata() - .await - .map_err(|e| errors.push(FileIOError::from((¤t_path, e)).into())) - else { - continue 'entries; - }; - - // TODO: Hard ignoring symlinks for now, but this should be configurable - if metadata.is_symlink() { - continue 'entries; - } - - let is_dir = metadata.is_dir(); - - if is_dir { - // If it is a directory, first we check if we must reject it and its children entirely - if rules_per_kind - .get(&RuleKind::RejectIfChildrenDirectoriesArePresent) - .map_or(false, |reject_results| { - reject_results.iter().any(|reject| !reject) - }) { - trace!( - "Path {} rejected by rule `RuleKind::RejectIfChildrenDirectoriesArePresent`", - current_path.display(), - ); - continue 'entries; - } - - // Then we check if we must accept it and its children - if let Some(accept_by_children_rules) = - rules_per_kind.get(&RuleKind::AcceptIfChildrenDirectoriesArePresent) - { - if accept_by_children_rules.iter().any(|accept| *accept) { - accept_by_children_dir = Some(true); - } - - // If it wasn't accepted then we mark as rejected - if accept_by_children_dir.is_none() { - trace!( - "Path {} rejected because it didn't passed in any AcceptIfChildrenDirectoriesArePresent rule", - current_path.display() - ); - accept_by_children_dir = Some(false); - } - } - - // Then we check if there's a git ignore rule for it - if let Some(accept) = rules_per_kind.get(&RuleKind::IgnoredByGit) { - if !accept.iter().any(|&r| r) { - trace!(dir=?current_path, "ignoring files because of git ignore"); - continue 'entries; - } - } - - // Then we mark this directory the be walked in too - if let Some(ref mut to_walk) = maybe_to_walk { - to_walk.push_back(ToWalkEntry { - path: current_path.clone(), - parent_dir_accepted_by_its_children: accept_by_children_dir, - maybe_parent: Some(path.clone()), - }); - } - } - - if rules_per_kind - .get(&RuleKind::AcceptFilesByGlob) - .map_or(false, |accept_rules| { - accept_rules.iter().all(|accept| !accept) - }) { - trace!( - "Path {} reject because it didn't passed in any AcceptFilesByGlob rules", - current_path.display() - ); - continue 'entries; - } - - if 
accept_by_children_dir.unwrap_or(true) { - let Ok(iso_file_path) = - iso_file_path_factory(¤t_path, is_dir).map_err(|e| errors.push(e)) - else { - continue 'entries; - }; - - let Ok(metadata) = FilePathMetadata::from_path(¤t_path, &metadata) - .map_err(|e| errors.push(e.into())) - else { - continue; - }; - - paths_buffer.insert(WalkingEntry { - iso_file_path, - maybe_metadata: Some(metadata), - }); - - // If the ancestors directories wasn't indexed before, now we do - for ancestor in current_path - .ancestors() - .skip(1) // Skip the current directory as it was already indexed - .take_while(|&ancestor| ancestor != current_dir) - { - let Ok(iso_file_path) = - iso_file_path_factory(ancestor, true).map_err(|e| errors.push(e)) - else { - // Checking the next ancestor, as this one we got an error - continue; - }; - - let mut ancestor_iso_walking_entry = WalkingEntry { - iso_file_path, - maybe_metadata: None, - }; - trace!("Indexing ancestor {}", ancestor.display()); - if !indexed_paths.contains(&ancestor_iso_walking_entry) { - let Ok(metadata) = fs::metadata(ancestor) - .await - .map_err(|e| errors.push(FileIOError::from((&ancestor, e)).into())) - else { - // Checking the next ancestor, as this one we got an error - continue; - }; - - let Ok(metadata) = FilePathMetadata::from_path(ancestor, &metadata) - .map_err(|e| errors.push(e.into())) - else { - continue; - }; - - ancestor_iso_walking_entry.maybe_metadata = Some(metadata); - - paths_buffer.insert(ancestor_iso_walking_entry); - } else { - // If indexed_paths contains the current ancestors, then it will contain - // also all if its ancestors too, so we can stop here - break; - } - } - } - } - - // We continue the function even if we fail to fetch `file_path`s to remove, - // the DB will have old `file_path`s but at least this is better than - // don't adding the newly indexed paths - let to_remove = to_remove_db_fetcher( - iso_file_path_to_walk, - paths_buffer - .iter() - .map(|entry| &entry.iso_file_path) - .map(Into::into) - .collect(), - ) - .await - .unwrap_or_else(|e| { - errors.push(e); - vec![] - }); - - let mut to_walk_entry_size = 0; - - // Just merging the `found_paths` with `indexed_paths` here in the end to avoid possibly - // multiple rehashes during function execution - indexed_paths.extend(paths_buffer.drain().map(|walking_entry| { - if let Some(metadata) = &walking_entry.maybe_metadata { - to_walk_entry_size += metadata.size_in_bytes; - } - walking_entry - })); - - (to_walk_entry_size, to_remove) -} - -#[cfg(test)] -#[allow(clippy::unwrap_used, clippy::panic)] -mod tests { - use super::*; - use chrono::Utc; - use globset::{Glob, GlobSetBuilder}; - use sd_core_indexer_rules::RulePerKind; - use tempfile::{tempdir, TempDir}; - // use tracing_test::traced_test; - - impl PartialEq for WalkedEntry { - fn eq(&self, other: &Self) -> bool { - self.iso_file_path == other.iso_file_path - } - } - - impl Eq for WalkedEntry {} - - impl Hash for WalkedEntry { - fn hash(&self, state: &mut H) { - self.iso_file_path.hash(state); - } - } - - fn new_indexer_rule( - name: impl Into, - default: bool, - rules: Vec, - ) -> IndexerRule { - IndexerRule { - id: None, - name: name.into(), - default, - rules, - date_created: Utc::now(), - date_modified: Utc::now(), - } - } - - async fn prepare_location() -> TempDir { - let root = tempdir().unwrap(); - let root_path = root.path(); - let rust_project = root_path.join("rust_project"); - let inner_project = root_path.join("inner"); - let node_project = inner_project.join("node_project"); - let photos = 
root_path.join("photos"); - - fs::create_dir(&rust_project).await.unwrap(); - fs::create_dir(&inner_project).await.unwrap(); - fs::create_dir(&node_project).await.unwrap(); - fs::create_dir(&photos).await.unwrap(); - - // Making rust and node projects a git repository - fs::create_dir(rust_project.join(".git")).await.unwrap(); - fs::create_dir(node_project.join(".git")).await.unwrap(); - - // Populating rust project - fs::File::create(rust_project.join("Cargo.toml")) - .await - .unwrap(); - let rust_src_dir = rust_project.join("src"); - fs::create_dir(&rust_src_dir).await.unwrap(); - fs::File::create(rust_src_dir.join("main.rs")) - .await - .unwrap(); - let rust_target_dir = rust_project.join("target"); - fs::create_dir(&rust_target_dir).await.unwrap(); - let rust_build_dir = rust_target_dir.join("debug"); - fs::create_dir(&rust_build_dir).await.unwrap(); - fs::File::create(rust_build_dir.join("main")).await.unwrap(); - - // Populating node project - fs::File::create(node_project.join("package.json")) - .await - .unwrap(); - let node_src_dir = node_project.join("src"); - fs::create_dir(&node_src_dir).await.unwrap(); - fs::File::create(node_src_dir.join("App.tsx")) - .await - .unwrap(); - let node_modules = node_project.join("node_modules"); - fs::create_dir(&node_modules).await.unwrap(); - let node_modules_dep = node_modules.join("react"); - fs::create_dir(&node_modules_dep).await.unwrap(); - fs::File::create(node_modules_dep.join("package.json")) - .await - .unwrap(); - - // Photos directory - for photo in ["photo1.png", "photo2.jpg", "photo3.jpeg", "text.txt"].iter() { - fs::File::create(photos.join(photo)).await.unwrap(); - } - - root - } - - #[tokio::test] - async fn test_walk_without_rules() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target/debug"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target/debug/main"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - 
WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo2.jpg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/text.txt"), false), metadata }, - ] - .into_iter() - .collect::>(); - - let walk_result = walk( - root_path.to_path_buf(), - root_path.to_path_buf(), - &mut [], - |_, _| {}, - |_| async { Ok(vec![]) }, - |_, _| async { Ok(vec![]) }, - |path, is_dir| { - IsolatedFilePathData::new(0, root_path, path, is_dir).map_err(Into::into) - }, - 420, - ) - .await - .unwrap(); - - if !walk_result.errors.is_empty() { - panic!("errors: {:#?}", walk_result.errors); - } - - let actual = walk_result.walked.collect::>(); - - if actual != expected { - panic!("difference: {:#?}", expected.difference(&actual)); - } - } - - #[tokio::test] - // #[traced_test] - async fn test_only_photos() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo1.png"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo2.jpg"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("photos/photo3.jpeg"), false), metadata }, - ] - .into_iter() - .collect::>(); - - let mut only_photos_rule = vec![new_indexer_rule( - "only photos".to_string(), - false, - vec![RulePerKind::AcceptFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{*.png,*.jpg,*.jpeg}").unwrap()) - .build() - .unwrap(), - )], - )]; - - let walk_result = walk( - root_path.to_path_buf(), - root_path.to_path_buf(), - &mut only_photos_rule, - |_, _| {}, - |_| async { Ok(vec![]) }, - |_, _| async { Ok(vec![]) }, - |path, is_dir| { - IsolatedFilePathData::new(0, root_path, path, is_dir).map_err(Into::into) - }, - 420, - ) - .await - .unwrap(); - - if !walk_result.errors.is_empty() { - panic!("errors: {:#?}", walk_result.errors); - } - - let actual = walk_result.walked.collect::>(); - - if actual != expected { - panic!("difference: {:#?}", expected.difference(&actual)); - } - } 
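The accept-by-glob rule in the test above ultimately reduces to a compiled `globset::GlobSet`. A standalone check of that matching behavior, reusing the same `{*.png,*.jpg,*.jpeg}` pattern with illustrative file names:

```rust
// Compile the photo glob once and match candidate names against it, as the
// "only photos" rule does.
use globset::{Glob, GlobSetBuilder};

fn main() -> Result<(), globset::Error> {
    let photos = GlobSetBuilder::new()
        .add(Glob::new("{*.png,*.jpg,*.jpeg}")?)
        .build()?;

    assert!(photos.is_match("photo1.png"));
    assert!(photos.is_match("photo2.jpg"));
    assert!(!photos.is_match("text.txt"));
    Ok(())
}
```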
- - #[tokio::test] - // #[traced_test] - async fn test_git_repos() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target/"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target/debug"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/target/debug/main"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/node_modules/react/package.json"), false), metadata }, - ] - .into_iter() - .collect::>(); - - let mut git_repos = vec![new_indexer_rule( - "git repos".to_string(), - false, - vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( - [".git".to_string()].into_iter().collect(), - )], - )]; - - let walk_result = walk( - root_path.to_path_buf(), - root_path.to_path_buf(), - &mut git_repos, - |_, _| {}, - |_| async { Ok(vec![]) }, - |_, _| async { Ok(vec![]) }, - |path, is_dir| { - IsolatedFilePathData::new(0, root_path, path, is_dir).map_err(Into::into) - }, - 420, - ) - .await - .unwrap(); - - if !walk_result.errors.is_empty() { - panic!("errors: {:#?}", walk_result.errors); - } - - let actual = walk_result.walked.collect::>(); - - if actual != expected { - let not_found = expected.difference(&actual); - let not_expected = actual.difference(&expected); - panic!("difference:\nexpected, but not found: {not_found:#?}\nfound, but not expected:{not_expected:#?}"); - } - } - - #[tokio::test] - // #[traced_test] - async fn 
git_repos_without_deps_or_build_dirs() { - let root = prepare_location().await; - let root_path = root.path(); - - let metadata = FilePathMetadata { - inode: 0, - size_in_bytes: 0, - created_at: Utc::now(), - modified_at: Utc::now(), - hidden: false, - }; - - let f = |path, is_dir| IsolatedFilePathData::new(0, root_path, path, is_dir).unwrap(); - let pub_id = Uuid::new_v4(); - let maybe_object_id = None; - - #[rustfmt::skip] - let expected = [ - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/Cargo.toml"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("rust_project/src/main.rs"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/.git"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/package.json"), false), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src"), true), metadata }, - WalkedEntry { pub_id, maybe_object_id, iso_file_path: f(root_path.join("inner/node_project/src/App.tsx"), false), metadata }, - ] - .into_iter() - .collect::>(); - - let mut git_repos_no_deps_no_build_dirs = vec![ - new_indexer_rule( - "git repos".to_string(), - false, - vec![RulePerKind::AcceptIfChildrenDirectoriesArePresent( - [".git".to_string()].into_iter().collect(), - )], - ), - new_indexer_rule( - "reject node_modules".to_string(), - false, - vec![RulePerKind::RejectFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{**/node_modules/*,**/node_modules}").unwrap()) - .build() - .unwrap(), - )], - ), - new_indexer_rule( - "reject rust build dir".to_string(), - false, - vec![RulePerKind::RejectFilesByGlob( - vec![], - GlobSetBuilder::new() - .add(Glob::new("{**/target/*,**/target}").unwrap()) - .build() - .unwrap(), - )], - ), - ]; - - let walk_result = walk( - root_path.to_path_buf(), - root_path.to_path_buf(), - &mut git_repos_no_deps_no_build_dirs, - |_, _| {}, - |_| async { Ok(vec![]) }, - |_, _| async { Ok(vec![]) }, - |path, is_dir| { - IsolatedFilePathData::new(0, root_path, path, is_dir).map_err(Into::into) - }, - 420, - ) - .await - .unwrap(); - - if !walk_result.errors.is_empty() { - panic!("errors: {:#?}", walk_result.errors); - } - - let actual = walk_result.walked.collect::>(); - - if actual != expected { - let not_found = expected.difference(&actual); - let not_expected = actual.difference(&expected); - panic!("difference:\nexpected, but not found: {not_found:#?}\nfound, but not expected:{not_expected:#?}"); - } - } -} diff --git a/core/src/location/manager/helpers.rs b/core/src/location/manager/helpers.rs deleted file mode 100644 index 095d99768..000000000 --- a/core/src/location/manager/helpers.rs +++ /dev/null @@ -1,281 +0,0 @@ -use crate::{ - library::{Library, LibraryId}, - Node, -}; - -use sd_prisma::prisma::location; -use sd_utils::db::maybe_missing; - -use std::{ - 
collections::{HashMap, HashSet}, - path::{Path, PathBuf}, - sync::Arc, - time::Duration, -}; - -use tokio::{fs, io::ErrorKind, sync::oneshot, time::sleep}; -use tracing::{error, warn}; -use uuid::Uuid; - -use super::{watcher::LocationWatcher, LocationManagerError}; - -type LocationAndLibraryKey = (location::id::Type, LibraryId); - -const LOCATION_CHECK_INTERVAL: Duration = Duration::from_secs(5); - -pub(super) async fn check_online( - location: &location::Data, - node: &Node, - library: &Library, -) -> Result { - let pub_id = Uuid::from_slice(&location.pub_id)?; - - let location_path = maybe_missing(&location.path, "location.path").map(Path::new)?; - - // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. - if location.instance_id == Some(library.config().await.instance_id) { - match fs::metadata(&location_path).await { - Ok(_) => { - node.locations.add_online(pub_id).await; - Ok(true) - } - Err(e) if e.kind() == ErrorKind::NotFound => { - node.locations.remove_online(&pub_id).await; - Ok(false) - } - Err(e) => { - error!("Failed to check if location is online: {:#?}", e); - Ok(false) - } - } - } else { - // In this case, we don't have a `local_path`, but this location was marked as online - node.locations.remove_online(&pub_id).await; - Err(LocationManagerError::NonLocalLocation(location.id)) - } -} - -pub(super) async fn location_check_sleep( - location_id: location::id::Type, - library: Arc, -) -> (location::id::Type, Arc) { - sleep(LOCATION_CHECK_INTERVAL).await; - (location_id, library) -} - -pub(super) fn watch_location( - location: location::Data, - library_id: LibraryId, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, -) { - let location_id = location.id; - let location_path = location.path.as_ref(); - let Some(location_path) = location_path.map(Path::new) else { - return; - }; - - if let Some(mut watcher) = locations_unwatched.remove(&(location_id, library_id)) { - if watcher.check_path(location_path) { - watcher.watch(); - } - - locations_watched.insert((location_id, library_id), watcher); - } -} - -pub(super) fn unwatch_location( - location: location::Data, - library_id: LibraryId, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, -) { - let location_id = location.id; - let location_path = location.path.as_ref(); - let Some(location_path) = location_path.map(Path::new) else { - return; - }; - - if let Some(mut watcher) = locations_watched.remove(&(location_id, library_id)) { - if watcher.check_path(location_path) { - watcher.unwatch(); - } - - locations_unwatched.insert((location_id, library_id), watcher); - } -} - -pub(super) fn drop_location( - location_id: location::id::Type, - library_id: LibraryId, - message: &str, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, -) { - warn!("{message}: ",); - if let Some(mut watcher) = locations_watched.remove(&(location_id, library_id)) { - watcher.unwatch(); - } else { - locations_unwatched.remove(&(location_id, library_id)); - } -} - -pub(super) async fn get_location( - location_id: location::id::Type, - library: &Library, -) -> Option { - library - .db - .location() - .find_unique(location::id::equals(location_id)) - .exec() - .await - .unwrap_or_else(|err| { - error!("Failed to get location data from location_id: {:#?}", err); - None - }) -} - -pub(super) async fn handle_remove_location_request( - location_id: location::id::Type, - library: Arc, - response_tx: oneshot::Sender>, - 
forced_unwatch: &mut HashSet, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, - to_remove: &mut HashSet, -) { - let key = (location_id, library.id); - if let Some(location) = get_location(location_id, &library).await { - // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. - if location.instance_id == Some(library.config().await.instance_id) { - unwatch_location(location, library.id, locations_watched, locations_unwatched); - locations_unwatched.remove(&key); - forced_unwatch.remove(&key); - } else { - drop_location( - location_id, - library.id, - "Dropping location from location manager, because we don't have a `local_path` anymore", - locations_watched, - locations_unwatched - ); - } - } else { - drop_location( - location_id, - library.id, - "Removing location from manager, as we failed to fetch from db", - locations_watched, - locations_unwatched, - ); - } - - // Marking location as removed, so we don't try to check it when the time comes - to_remove.insert(key); - - let _ = response_tx.send(Ok(())); // ignore errors, we handle errors on receiver -} - -pub(super) async fn handle_stop_watcher_request( - location_id: location::id::Type, - library: Arc, - response_tx: oneshot::Sender>, - forced_unwatch: &mut HashSet, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, -) { - async fn inner( - location_id: location::id::Type, - library: Arc, - forced_unwatch: &mut HashSet, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, - ) -> Result<(), LocationManagerError> { - let key = (location_id, library.id); - if !forced_unwatch.contains(&key) && locations_watched.contains_key(&key) { - get_location(location_id, &library) - .await - .ok_or_else(|| LocationManagerError::FailedToStopOrReinitWatcher { - reason: String::from("failed to fetch location from db"), - }) - .map(|location| { - unwatch_location(location, library.id, locations_watched, locations_unwatched); - forced_unwatch.insert(key); - }) - } else { - Ok(()) - } - } - - let _ = response_tx.send( - inner( - location_id, - library, - forced_unwatch, - locations_watched, - locations_unwatched, - ) - .await, - ); // ignore errors, we handle errors on receiver -} - -pub(super) async fn handle_reinit_watcher_request( - location_id: location::id::Type, - library: Arc, - response_tx: oneshot::Sender>, - forced_unwatch: &mut HashSet, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, -) { - async fn inner( - location_id: location::id::Type, - library: Arc, - forced_unwatch: &mut HashSet, - locations_watched: &mut HashMap, - locations_unwatched: &mut HashMap, - ) -> Result<(), LocationManagerError> { - let key = (location_id, library.id); - if forced_unwatch.contains(&key) && locations_unwatched.contains_key(&key) { - get_location(location_id, &library) - .await - .ok_or_else(|| LocationManagerError::FailedToStopOrReinitWatcher { - reason: String::from("failed to fetch location from db"), - }) - .map(|location| { - watch_location(location, library.id, locations_watched, locations_unwatched); - forced_unwatch.remove(&key); - }) - } else { - Ok(()) - } - } - - let _ = response_tx.send( - inner( - location_id, - library, - forced_unwatch, - locations_watched, - locations_unwatched, - ) - .await, - ); // ignore errors, we handle errors on receiver -} - -pub(super) fn handle_ignore_path_request( - location_id: location::id::Type, - library: Arc, - path: PathBuf, - ignore: bool, - response_tx: 
oneshot::Sender>, - locations_watched: &HashMap, -) { - let _ = response_tx.send( - if let Some(watcher) = locations_watched.get(&(location_id, library.id)) { - watcher.ignore_path(path, ignore) - } else { - Ok(()) - }, - ); // ignore errors, we handle errors on receiver -} diff --git a/core/src/location/manager/mod.rs b/core/src/location/manager/mod.rs index 813589363..ebd5fde8c 100644 --- a/core/src/location/manager/mod.rs +++ b/core/src/location/manager/mod.rs @@ -1,6 +1,5 @@ use crate::{ library::{Library, LibraryManagerEvent}, - old_job::JobManagerError, Node, }; @@ -15,19 +14,22 @@ use std::{ sync::Arc, }; +use async_channel as chan; use futures::executor::block_on; use thiserror::Error; -use tokio::sync::{ - broadcast::{self, Receiver}, - mpsc, oneshot, RwLock, +use tokio::{ + spawn, + sync::{ + broadcast::{self, Receiver}, + oneshot, RwLock, + }, }; -use tracing::{debug, error}; +use tracing::{debug, error, instrument, trace}; use uuid::Uuid; +mod runner; mod watcher; -mod helpers; - #[derive(Clone, Copy, Debug)] enum ManagementMessageAction { Add, @@ -39,13 +41,13 @@ pub struct LocationManagementMessage { location_id: location::id::Type, library: Arc, action: ManagementMessageAction, - response_tx: oneshot::Sender>, + ack: oneshot::Sender>, } #[derive(Debug)] enum WatcherManagementMessageAction { - Stop, - Reinit, + Pause, + Resume, IgnoreEventsForPath { path: PathBuf, ignore: bool }, } @@ -54,51 +56,42 @@ pub struct WatcherManagementMessage { location_id: location::id::Type, library: Arc, action: WatcherManagementMessageAction, - response_tx: oneshot::Sender>, + ack: oneshot::Sender>, } #[derive(Error, Debug)] pub enum LocationManagerError { - #[error("Unable to send location management message to location manager actor: (error: {0})")] - ActorSendLocationError(#[from] mpsc::error::SendError), + #[error("location not found in database: ")] + LocationNotFound(location::id::Type), - #[error("Unable to send path to be ignored by watcher actor: (error: {0})")] - ActorIgnorePathError(#[from] mpsc::error::SendError), + #[error("watcher error: {0}")] + Watcher(#[from] notify::Error), - #[error("Unable to watcher management message to watcher manager actor: (error: {0})")] - ActorIgnorePathMessageError(#[from] mpsc::error::SendError), - - #[error("Unable to receive actor response: (error: {0})")] - ActorResponseError(#[from] oneshot::error::RecvError), - - #[error("Watcher error: (error: {0})")] - WatcherError(#[from] notify::Error), - - #[error("Failed to stop or reinit a watcher: {reason}")] - FailedToStopOrReinitWatcher { reason: String }, - - #[error("Missing location from database: ")] - MissingLocation(location::id::Type), - - #[error("Non local location: ")] + #[error("non local location: ")] NonLocalLocation(location::id::Type), - #[error("failed to move file '{}' for reason: {reason}", .path.display())] - MoveError { path: Box, reason: String }, + #[error("file still exists on disk after remove event received: ", .0.display())] + FileStillExistsOnDisk(Box), - #[error("Tried to update a non-existing file: ")] - UpdateNonExistingFile(PathBuf), - #[error("Database error: {0}")] + #[error("failed to move file '{}' for reason: {reason}", .path.display())] + MoveError { + path: Box, + reason: &'static str, + }, + + #[error("database error: {0}")] Database(#[from] prisma_client_rust::QueryError), - #[error("File path related error (error: {0})")] - FilePath(#[from] FilePathError), - #[error("Corrupted location pub_id on database: (error: {0})")] + #[error("corrupted location pub_id on 
database: {0}")] CorruptedLocationPubId(#[from] uuid::Error), - #[error("Job Manager error: (error: {0})")] - JobManager(#[from] JobManagerError), - #[error("missing-field")] + #[error("missing field: {0}")] MissingField(#[from] MissingFieldError), + #[error(transparent)] + FilePath(#[from] FilePathError), + #[error(transparent)] + IndexerRuler(#[from] sd_core_indexer_rules::Error), + #[error(transparent)] + JobSystem(#[from] sd_core_heavy_lifting::Error), #[error(transparent)] FileIO(#[from] FileIOError), } @@ -107,20 +100,18 @@ type OnlineLocations = BTreeSet>; #[must_use = "'LocationManagerActor::start' must be used to start the actor"] pub struct LocationManagerActor { - location_management_rx: mpsc::Receiver, - - watcher_management_rx: mpsc::Receiver, - - stop_rx: oneshot::Receiver<()>, + location_management_rx: chan::Receiver, + watcher_management_rx: chan::Receiver, + stop_rx: chan::Receiver<()>, } impl LocationManagerActor { pub fn start(self, node: Arc) { - tokio::spawn({ + spawn({ let node = node.clone(); let rx = node.libraries.rx.clone(); async move { - if let Err(err) = rx + if let Err(e) = rx .subscribe(|event| { let node = node.clone(); async move { @@ -134,17 +125,18 @@ impl LocationManagerActor { .await .unwrap_or_else(|e| { error!( - "Failed to get locations from database for location manager: {:#?}", - e - ); + ?e, + "Failed to get locations from database for location manager;", + ); + vec![] }) { if let Err(e) = node.locations.add(location.id, library.clone()).await { error!( - "Failed to add location to location manager: {:#?}", - e + ?e, + "Failed to add location to location manager;", ); } } @@ -160,17 +152,46 @@ impl LocationManagerActor { }) .await { - error!("Core may become unstable! LocationManager's library manager subscription aborted with error: {err:?}"); + error!( + ?e, + "Core may become unstable! 
LocationManager's \ + library manager subscription aborted with error;", + ); } } }); - tokio::spawn(Locations::run_locations_checker( - self.location_management_rx, - self.watcher_management_rx, - self.stop_rx, - node, - )); + spawn({ + let node = Arc::clone(&node); + let Self { + location_management_rx, + watcher_management_rx, + stop_rx, + } = self; + + async move { + while let Err(e) = spawn({ + runner::run( + location_management_rx.clone(), + watcher_management_rx.clone(), + stop_rx.clone(), + Arc::clone(&node), + ) + }) + .await + { + if e.is_panic() { + error!(?e, "Location manager panicked;"); + } else { + trace!("Location manager received shutdown signal and will exit..."); + break; + } + trace!("Restarting location manager processing task..."); + } + + debug!("Location manager gracefully shutdown"); + } + }); } } @@ -178,64 +199,62 @@ pub struct Locations { online_locations: RwLock, pub online_tx: broadcast::Sender, - location_management_tx: mpsc::Sender, + location_management_tx: chan::Sender, - watcher_management_tx: mpsc::Sender, - stop_tx: Option>, + watcher_management_tx: chan::Sender, + stop_tx: chan::Sender<()>, } impl Locations { pub fn new() -> (Self, LocationManagerActor) { - let online_tx = broadcast::channel(16).0; + let (location_management_tx, location_management_rx) = chan::bounded(128); + let (watcher_management_tx, watcher_management_rx) = chan::bounded(128); + let (stop_tx, stop_rx) = chan::bounded(1); - { - let (location_management_tx, location_management_rx) = mpsc::channel(128); - let (watcher_management_tx, watcher_management_rx) = mpsc::channel(128); - let (stop_tx, stop_rx) = oneshot::channel(); - debug!("Starting location manager actor"); + debug!("Starting location manager actor"); - ( - Self { - online_locations: Default::default(), - online_tx, - location_management_tx, - watcher_management_tx, - stop_tx: Some(stop_tx), - }, - LocationManagerActor { - location_management_rx, - watcher_management_rx, - stop_rx, - }, - ) - } + ( + Self { + online_locations: Default::default(), + online_tx: broadcast::channel(16).0, + location_management_tx, + watcher_management_tx, + stop_tx, + }, + LocationManagerActor { + location_management_rx, + watcher_management_rx, + stop_rx, + }, + ) } + #[instrument(skip(self, library), fields(library_id = %library.id), err)] #[inline] - #[allow(unused_variables)] async fn location_management_message( &self, location_id: location::id::Type, library: Arc, action: ManagementMessageAction, ) -> Result<(), LocationManagerError> { - { - let (tx, rx) = oneshot::channel(); - debug!("Sending location management message to location manager actor: {action:?}"); + let (tx, rx) = oneshot::channel(); + trace!("Sending location management message to location manager actor"); - self.location_management_tx - .send(LocationManagementMessage { - location_id, - library, - action, - response_tx: tx, - }) - .await?; + self.location_management_tx + .send(LocationManagementMessage { + location_id, + library, + action, + ack: tx, + }) + .await + .expect("Location manager actor channel closed sending new location message"); - rx.await? 
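// --- Illustrative sketch (not part of the patch) ---
// The new `LocationManagerActor::start` above respawns the runner task whenever it
// panics and exits cleanly otherwise. This is a minimal standalone version of that
// supervision loop. Assumes only the `tokio` crate (rt, macros, time features);
// `run_once` is a hypothetical stand-in for `runner::run(...)`.
use std::time::Duration;

async fn run_once(attempt: u32) {
    // Stand-in for the real runner: panic on the first attempt to show the restart path.
    if attempt == 0 {
        panic!("simulated runner panic");
    }
    tokio::time::sleep(Duration::from_millis(50)).await;
}

#[tokio::main]
async fn main() {
    let mut attempt = 0;
    // `JoinHandle` resolves to `Err(JoinError)` when the inner task panics or is cancelled.
    while let Err(e) = tokio::spawn(run_once(attempt)).await {
        if e.is_panic() {
            eprintln!("worker panicked, restarting: {e}");
            attempt += 1;
        } else {
            // Cancelled: treat it like a shutdown signal and stop restarting.
            break;
        }
    }
    println!("worker finished after {attempt} restart(s)");
}
// --- end sketch ---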
- } + rx.await + .expect("Ack channel closed for location management message response") } + #[instrument(skip(self, library), fields(library_id = %library.id), err)] #[inline] #[allow(unused_variables)] async fn watcher_management_message( @@ -244,22 +263,21 @@ impl Locations { library: Arc, action: WatcherManagementMessageAction, ) -> Result<(), LocationManagerError> { - { - let (tx, rx) = oneshot::channel(); + let (tx, rx) = oneshot::channel(); + trace!("Sending watcher management message to location manager actor"); - debug!("Sending watcher management message to location manager actor: {action:?}"); + self.watcher_management_tx + .send(WatcherManagementMessage { + location_id, + library, + action, + ack: tx, + }) + .await + .expect("Location manager actor channel closed sending new watcher message"); - self.watcher_management_tx - .send(WatcherManagementMessage { - location_id, - library, - action, - response_tx: tx, - }) - .await?; - - rx.await? - } + rx.await + .expect("Ack channel closed for watcher management message response") } pub async fn add( @@ -280,16 +298,16 @@ impl Locations { .await } - pub async fn stop_watcher( + pub async fn pause_watcher( &self, location_id: location::id::Type, library: Arc, ) -> Result<(), LocationManagerError> { - self.watcher_management_message(location_id, library, WatcherManagementMessageAction::Stop) + self.watcher_management_message(location_id, library, WatcherManagementMessageAction::Pause) .await } - pub async fn reinit_watcher( + pub async fn resume_watcher( &self, location_id: location::id::Type, library: Arc, @@ -297,19 +315,19 @@ impl Locations { self.watcher_management_message( location_id, library, - WatcherManagementMessageAction::Reinit, + WatcherManagementMessageAction::Resume, ) .await } - pub async fn temporary_stop( + pub async fn temporary_watcher_pause( &self, location_id: location::id::Type, library: Arc, - ) -> Result { - self.stop_watcher(location_id, library.clone()).await?; + ) -> Result, LocationManagerError> { + self.pause_watcher(location_id, library.clone()).await?; - Ok(StopWatcherGuard { + Ok(PauseWatcherGuard { location_id, library: Some(library), manager: self, @@ -320,8 +338,8 @@ impl Locations { &self, location_id: location::id::Type, library: Arc, - path: impl AsRef, - ) -> Result { + path: impl AsRef + Send, + ) -> Result, LocationManagerError> { let path = path.as_ref().to_path_buf(); self.watcher_management_message( @@ -342,217 +360,6 @@ impl Locations { }) } - async fn run_locations_checker( - mut location_management_rx: mpsc::Receiver, - mut watcher_management_rx: mpsc::Receiver, - mut stop_rx: oneshot::Receiver<()>, - node: Arc, - ) -> Result<(), LocationManagerError> { - use std::collections::{HashMap, HashSet}; - - use futures::stream::{FuturesUnordered, StreamExt}; - use tokio::select; - use tracing::warn; - - use helpers::{ - check_online, drop_location, get_location, handle_ignore_path_request, - handle_reinit_watcher_request, handle_remove_location_request, - handle_stop_watcher_request, location_check_sleep, unwatch_location, watch_location, - }; - use watcher::LocationWatcher; - - let mut to_check_futures = FuturesUnordered::new(); - let mut to_remove = HashSet::new(); - let mut locations_watched = HashMap::new(); - let mut locations_unwatched = HashMap::new(); - let mut forced_unwatch = HashSet::new(); - - loop { - select! 
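// --- Illustrative sketch (not part of the patch) ---
// `location_management_message` / `watcher_management_message` above follow a
// request/acknowledge shape: a bounded `async_channel` carries a message that embeds
// a tokio oneshot `ack` sender, and the caller awaits the reply on the oneshot receiver.
// Crate assumptions: `async-channel` and `tokio` (rt, macros); all names are hypothetical.
use async_channel as chan;
use tokio::sync::oneshot;

struct Request {
    payload: u32,
    ack: oneshot::Sender<Result<u32, String>>,
}

#[tokio::main]
async fn main() {
    let (tx, rx) = chan::bounded::<Request>(16);

    // Actor side: receive requests, do the work, answer through the embedded ack channel.
    let actor = tokio::spawn(async move {
        while let Ok(Request { payload, ack }) = rx.recv().await {
            // Ignore a dropped ack receiver, mirroring the "errors handled on receiver" style.
            let _ = ack.send(Ok(payload * 2));
        }
    });

    // Caller side: send the message, then await the acknowledgement.
    let (ack_tx, ack_rx) = oneshot::channel();
    tx.send(Request { payload: 21, ack: ack_tx })
        .await
        .expect("actor channel closed");
    let answer = ack_rx.await.expect("ack channel closed");
    assert_eq!(answer, Ok(42));

    drop(tx); // closing every sender lets the actor loop end
    actor.await.expect("actor task failed");
}
// --- end sketch ---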
{ - // Location management messages - Some(LocationManagementMessage{ - location_id, - library, - action, - response_tx - }) = location_management_rx.recv() => { - match action { - - // To add a new location - ManagementMessageAction::Add => { - response_tx.send( - if let Some(location) = get_location(location_id, &library).await { - match check_online(&location, &node, &library).await { - Ok(is_online) => { - - LocationWatcher::new(location, library.clone(), node.clone()) - .await - .map(|mut watcher| { - if is_online { - watcher.watch(); - locations_watched.insert( - (location_id, library.id), - watcher - ); - debug!("Location {location_id} is online, watching it"); - // info!("Locations watched: {:#?}", locations_watched); - } else { - locations_unwatched.insert( - (location_id, library.id), - watcher - ); - } - - to_check_futures.push( - location_check_sleep(location_id, library) - ); - } - ) - }, - Err(e) => { - error!("Error while checking online status of location {location_id}: {e}"); - Ok(()) // TODO: Probs should be error but that will break startup when location is offline - } - } - } else { - warn!( - "Location not found in database to be watched: {}", - location_id - ); - Ok(()) // TODO: Probs should be error but that will break startup when location is offline - }).ok(); // ignore errors, we handle errors on receiver - }, - - // To remove an location - ManagementMessageAction::Remove => { - handle_remove_location_request( - location_id, - library, - response_tx, - &mut forced_unwatch, - &mut locations_watched, - &mut locations_unwatched, - &mut to_remove, - ).await; - }, - } - } - - // Watcher management messages - Some(WatcherManagementMessage{ - location_id, - library, - action, - response_tx, - }) = watcher_management_rx.recv() => { - match action { - // To stop a watcher - WatcherManagementMessageAction::Stop => { - handle_stop_watcher_request( - location_id, - library, - response_tx, - &mut forced_unwatch, - &mut locations_watched, - &mut locations_unwatched, - ).await; - }, - - // To reinit a stopped watcher - WatcherManagementMessageAction::Reinit => { - handle_reinit_watcher_request( - location_id, - library, - response_tx, - &mut forced_unwatch, - &mut locations_watched, - &mut locations_unwatched, - ).await; - }, - - // To ignore or not events for a path - WatcherManagementMessageAction::IgnoreEventsForPath { path, ignore } => { - handle_ignore_path_request( - location_id, - library, - path, - ignore, - response_tx, - &locations_watched, - ); - }, - } - } - - // Periodically checking locations - Some((location_id, library)) = to_check_futures.next() => { - let key = (location_id, library.id); - - if to_remove.contains(&key) { - // The time to check came for an already removed library, so we just ignore it - to_remove.remove(&key); - } else if let Some(location) = get_location(location_id, &library).await { - // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. 
- if location.instance_id == Some(library.config().await.instance_id) { - let is_online = match check_online(&location, &node, &library).await { - Ok(is_online) => is_online, - Err(e) => { - error!("Error while checking online status of location {location_id}: {e}"); - continue; - } - }; - - if is_online - && !forced_unwatch.contains(&key) - { - watch_location( - location, - library.id, - &mut locations_watched, - &mut locations_unwatched, - ); - } else { - unwatch_location( - location, - library.id, - &mut locations_watched, - &mut locations_unwatched, - ); - } - to_check_futures.push(location_check_sleep(location_id, library)); - } else { - drop_location( - location_id, - library.id, - "Dropping location from location manager, because \ - it isn't a location in the current node", - &mut locations_watched, - &mut locations_unwatched - ); - forced_unwatch.remove(&key); - } - } else { - drop_location( - location_id, - library.id, - "Removing location from manager, as we failed to fetch from db", - &mut locations_watched, - &mut locations_unwatched, - ); - forced_unwatch.remove(&key); - } - } - - _ = &mut stop_rx => { - debug!("Stopping location manager"); - break; - } - } - } - - Ok(()) - } - pub async fn is_online(&self, id: &Uuid) -> bool { let online_locations = self.online_locations.read().await; online_locations.iter().any(|v| v == id.as_bytes()) @@ -591,29 +398,28 @@ impl Locations { impl Drop for Locations { fn drop(&mut self) { - if let Some(stop_tx) = self.stop_tx.take() { - if stop_tx.send(()).is_err() { - error!("Failed to send stop signal to location manager"); - } + // SAFETY: This will never block as we only have 1 sender and this channel has 1 slot + if self.stop_tx.send_blocking(()).is_err() { + error!("Failed to send stop signal to location manager"); } } } #[must_use = "this `StopWatcherGuard` must be held for some time, so the watcher is stopped"] -pub struct StopWatcherGuard<'m> { +pub struct PauseWatcherGuard<'m> { manager: &'m Locations, location_id: location::id::Type, library: Option>, } -impl Drop for StopWatcherGuard<'_> { +impl Drop for PauseWatcherGuard<'_> { fn drop(&mut self) { // FIXME: change this Drop to async drop in the future - if let Err(e) = block_on(self.manager.reinit_watcher( + if let Err(e) = block_on(self.manager.resume_watcher( self.location_id, self.library.take().expect("library should be set"), )) { - error!("Failed to reinit watcher on stop watcher guard drop: {e}"); + error!(?e, "Failed to resume watcher on stop watcher guard drop;"); } } } @@ -637,7 +443,7 @@ impl Drop for IgnoreEventsForPathGuard<'_> { ignore: false, }, )) { - error!("Failed to un-ignore path on watcher guard drop: {e}"); + error!(?e, "Failed to un-ignore path on watcher guard drop;"); } } } diff --git a/core/src/location/manager/runner.rs b/core/src/location/manager/runner.rs new file mode 100644 index 000000000..1daa383ce --- /dev/null +++ b/core/src/location/manager/runner.rs @@ -0,0 +1,449 @@ +use crate::{ + library::{Library, LibraryId}, + Node, +}; + +use sd_core_prisma_helpers::location_ids_and_path; + +use sd_prisma::prisma::location; +use sd_utils::db::maybe_missing; + +use std::{ + collections::{HashMap, HashSet}, + io::ErrorKind, + path::PathBuf, + pin::pin, + sync::Arc, + time::Duration, +}; + +use async_channel as chan; +use futures::stream::StreamExt; +use futures_concurrency::stream::Merge; +use tokio::{ + fs, + sync::oneshot, + time::{interval, MissedTickBehavior}, +}; +use tokio_stream::wrappers::IntervalStream; +use tracing::{debug, error, instrument, 
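// --- Illustrative sketch (not part of the patch) ---
// `PauseWatcherGuard` and `IgnoreEventsForPathGuard` above are RAII guards: the state
// change holds only while the guard is alive and is undone in `Drop`. A reduced,
// synchronous version of the same pattern, std-only, with a hypothetical `Manager`.
use std::cell::Cell;

struct Manager {
    paused: Cell<bool>,
}

impl Manager {
    fn pause(&self) -> PauseGuard<'_> {
        self.paused.set(true);
        PauseGuard { manager: self }
    }
}

#[must_use = "the watcher stays paused only while this guard is alive"]
struct PauseGuard<'m> {
    manager: &'m Manager,
}

impl Drop for PauseGuard<'_> {
    fn drop(&mut self) {
        // The real code calls `block_on(resume_watcher(..))` here; a plain flag is
        // enough to show the shape of the pattern.
        self.manager.paused.set(false);
    }
}

fn main() {
    let manager = Manager { paused: Cell::new(false) };
    {
        let _guard = manager.pause();
        assert!(manager.paused.get());
    } // guard dropped: watcher resumed
    assert!(!manager.paused.get());
}
// --- end sketch ---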
trace, warn}; +use uuid::Uuid; + +use super::{ + watcher::LocationWatcher, LocationManagementMessage, LocationManagerError, + ManagementMessageAction, WatcherManagementMessage, WatcherManagementMessageAction, +}; + +type LocationIdAndLibraryId = (location::id::Type, LibraryId); + +struct Runner { + node: Arc, + locations_to_check: HashMap>, + locations_watched: HashMap, + locations_unwatched: HashMap, + forced_unwatch: HashSet, +} +impl Runner { + fn new(node: Arc) -> Self { + Self { + node, + locations_to_check: HashMap::new(), + locations_watched: HashMap::new(), + locations_unwatched: HashMap::new(), + forced_unwatch: HashSet::new(), + } + } + + async fn add_location( + &mut self, + location_id: i32, + library: Arc, + ) -> Result<(), LocationManagerError> { + if let Some(location) = get_location(location_id, &library).await? { + check_online(&location, &self.node, &library) + .await + .and_then(|is_online| { + LocationWatcher::new(location, Arc::clone(&library), Arc::clone(&self.node)) + .map(|mut watcher| { + if is_online { + trace!(%location_id, "Location is online, watching it!;"); + watcher.watch(); + self.locations_watched + .insert((location_id, library.id), watcher); + } else { + self.locations_unwatched + .insert((location_id, library.id), watcher); + } + + self.locations_to_check + .insert(location_id, Arc::clone(&library)); + }) + }) + } else { + Err(LocationManagerError::LocationNotFound(location_id)) + } + } + + async fn remove_location( + &mut self, + location_id: i32, + library: Arc, + ) -> Result<(), LocationManagerError> { + let key = (location_id, library.id); + + if let Some(location) = get_location(location_id, &library).await? { + // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. + if location.instance_id == Some(library.config().await.instance_id) { + self.unwatch_location(location, library.id); + self.locations_unwatched.remove(&key); + self.forced_unwatch.remove(&key); + } else { + self.drop_location( + location_id, + library.id, + "Dropping location from location manager, because we don't have a `local_path` anymore", + ); + } + } else { + self.drop_location( + location_id, + library.id, + "Removing location from location manager, as we failed to fetch from db", + ); + } + + // Removing location from checker + self.locations_to_check.remove(&location_id); + + Ok(()) + } + + #[instrument(skip(self, reason))] + fn drop_location( + &mut self, + location_id: location::id::Type, + library_id: LibraryId, + reason: &'static str, + ) { + warn!(%reason); + if let Some(mut watcher) = self.locations_watched.remove(&(location_id, library_id)) { + watcher.unwatch(); + } else { + self.locations_unwatched.remove(&(location_id, library_id)); + } + } + + fn watch_location( + &mut self, + location_ids_and_path::Data { + id: location_id, + path: maybe_location_path, + .. + }: location_ids_and_path::Data, + library_id: LibraryId, + ) { + if let Some(location_path) = maybe_location_path { + if let Some(mut watcher) = self.locations_unwatched.remove(&(location_id, library_id)) { + if watcher.check_path(location_path) { + watcher.watch(); + } + + self.locations_watched + .insert((location_id, library_id), watcher); + } + } + } + + fn unwatch_location( + &mut self, + location_ids_and_path::Data { + id: location_id, + path: maybe_location_path, + .. 
+ }: location_ids_and_path::Data, + library_id: LibraryId, + ) { + if let Some(location_path) = maybe_location_path { + if let Some(mut watcher) = self.locations_watched.remove(&(location_id, library_id)) { + if watcher.check_path(location_path) { + watcher.unwatch(); + } + + self.locations_unwatched + .insert((location_id, library_id), watcher); + } + } + } + + #[instrument(skip(self, library), fields(library_id = %library.id), err)] + async fn pause_watcher( + &mut self, + location_id: location::id::Type, + library: Arc, + ) -> Result<(), LocationManagerError> { + let key = (location_id, library.id); + + if !self.forced_unwatch.contains(&key) && self.locations_watched.contains_key(&key) { + get_location(location_id, &library) + .await? + .ok_or(LocationManagerError::LocationNotFound(location_id)) + .map(|location| { + self.unwatch_location(location, library.id); + self.forced_unwatch.insert(key); + }) + } else { + Ok(()) + } + } + + #[instrument(skip(self, library), fields(library_id = %library.id), err)] + async fn resume_watcher( + &mut self, + location_id: location::id::Type, + library: Arc, + ) -> Result<(), LocationManagerError> { + let key = (location_id, library.id); + + if self.forced_unwatch.contains(&key) && self.locations_unwatched.contains_key(&key) { + get_location(location_id, &library) + .await? + .ok_or(LocationManagerError::LocationNotFound(location_id)) + .map(|location| { + self.watch_location(location, library.id); + self.forced_unwatch.remove(&key); + }) + } else { + Ok(()) + } + } + + async fn ignore_events_for_path( + &self, + location_id: location::id::Type, + library: Arc, + path: PathBuf, + ignore: bool, + ) { + if let Some(watcher) = self.locations_watched.get(&(location_id, library.id)) { + watcher.ignore_path(path, ignore).await + } + } + + async fn handle_location_management_message( + &mut self, + location_id: location::id::Type, + library: Arc, + action: ManagementMessageAction, + ack: oneshot::Sender>, + ) { + ack.send(match action { + ManagementMessageAction::Add => self.add_location(location_id, library).await, + ManagementMessageAction::Remove => self.remove_location(location_id, library).await, + }) + .expect("Ack channel closed") + } + + async fn handle_watcher_management_message( + &mut self, + location_id: location::id::Type, + library: Arc, + action: WatcherManagementMessageAction, + ack: oneshot::Sender>, + ) { + ack.send(match action { + WatcherManagementMessageAction::Pause => self.pause_watcher(location_id, library).await, + WatcherManagementMessageAction::Resume => { + self.resume_watcher(location_id, library).await + } + WatcherManagementMessageAction::IgnoreEventsForPath { path, ignore } => { + self.ignore_events_for_path(location_id, library, path, ignore) + .await; + Ok(()) + } + }) + .expect("Ack channel closed") + } + + async fn check_locations( + &mut self, + locations_to_check_buffer: &mut Vec<(location::id::Type, Arc)>, + ) -> Result<(), Vec> { + let mut errors = vec![]; + locations_to_check_buffer.clear(); + locations_to_check_buffer.extend(self.locations_to_check.drain()); + + for (location_id, library) in locations_to_check_buffer.drain(..) 
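// --- Illustrative sketch (not part of the patch) ---
// The `Runner`'s pause/resume bookkeeping above, reduced to plain std types: watchers
// move between a `watched` and an `unwatched` map keyed by (location_id, library_id),
// and `forced_unwatch` records locations paused on purpose so the periodic check does
// not silently re-watch them. The String watcher and `Book` type are stand-ins.
use std::collections::{HashMap, HashSet};

type Key = (i32, u64); // (location_id, library_id)

#[derive(Default)]
struct Book {
    watched: HashMap<Key, String>,
    unwatched: HashMap<Key, String>,
    forced_unwatch: HashSet<Key>,
}

impl Book {
    fn pause(&mut self, key: Key) {
        if let Some(watcher) = self.watched.remove(&key) {
            self.unwatched.insert(key, watcher);
            self.forced_unwatch.insert(key);
        }
    }

    fn resume(&mut self, key: Key) {
        if self.forced_unwatch.remove(&key) {
            if let Some(watcher) = self.unwatched.remove(&key) {
                self.watched.insert(key, watcher);
            }
        }
    }
}

fn main() {
    let mut book = Book::default();
    let key = (1, 42);
    book.watched.insert(key, "watcher for location 1".into());

    book.pause(key);
    assert!(book.unwatched.contains_key(&key) && book.forced_unwatch.contains(&key));

    book.resume(key);
    assert!(book.watched.contains_key(&key) && !book.forced_unwatch.contains(&key));
}
// --- end sketch ---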
{ + if let Err(e) = self + .check_single_location(location_id, Arc::clone(&library)) + .await + { + self.drop_location( + location_id, + library.id, + "Removing location from manager, as we failed to check if it was online", + ); + self.forced_unwatch.remove(&(location_id, library.id)); + errors.push(e); + } + } + + Ok(()) + } + + async fn check_single_location( + &mut self, + location_id: i32, + library: Arc, + ) -> Result<(), LocationManagerError> { + let key = (location_id, library.id); + + if let Some(location) = get_location(location_id, &library).await? { + // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. + if location.instance_id == Some(library.config().await.instance_id) { + if check_online(&location, &self.node, &library).await? + && !self.forced_unwatch.contains(&key) + { + self.watch_location(location, library.id); + } else { + self.unwatch_location(location, library.id); + } + + self.locations_to_check.insert(location_id, library); + } else { + self.drop_location( + location_id, + library.id, + "Dropping location from location manager, because \ + it isn't a location in the current node", + ); + self.forced_unwatch.remove(&key); + } + + Ok(()) + } else { + Err(LocationManagerError::LocationNotFound(location_id)) + } + } +} + +pub(super) async fn run( + location_management_rx: chan::Receiver, + watcher_management_rx: chan::Receiver, + stop_rx: chan::Receiver<()>, + node: Arc, +) { + enum StreamMessage { + LocationManagementMessage(LocationManagementMessage), + WatcherManagementMessage(WatcherManagementMessage), + CheckLocations, + Stop, + } + + let mut locations_to_check_buffer = vec![]; + + let mut check_locations_interval = interval(Duration::from_secs(2)); + check_locations_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let mut runner = Runner::new(node); + + let mut msg_stream = pin!(( + location_management_rx.map(StreamMessage::LocationManagementMessage), + watcher_management_rx.map(StreamMessage::WatcherManagementMessage), + IntervalStream::new(check_locations_interval).map(|_| StreamMessage::CheckLocations), + stop_rx.map(|()| StreamMessage::Stop), + ) + .merge()); + + while let Some(msg) = msg_stream.next().await { + match msg { + StreamMessage::LocationManagementMessage(LocationManagementMessage { + location_id, + library, + action, + ack, + }) => { + runner + .handle_location_management_message(location_id, library, action, ack) + .await + } + // Watcher management messages + StreamMessage::WatcherManagementMessage(WatcherManagementMessage { + location_id, + library, + action, + ack, + }) => { + runner + .handle_watcher_management_message(location_id, library, action, ack) + .await + } + StreamMessage::CheckLocations => { + if let Err(errors) = runner.check_locations(&mut locations_to_check_buffer).await { + warn!(?errors, "Errors while checking locations;"); + } + } + StreamMessage::Stop => { + debug!("Stopping location manager"); + break; + } + } + } +} + +#[instrument(skip(library), fields(library_id = %library.id), err)] +async fn get_location( + location_id: location::id::Type, + library: &Library, +) -> Result, LocationManagerError> { + library + .db + .location() + .find_unique(location::id::equals(location_id)) + .select(location_ids_and_path::select()) + .exec() + .await + .map_err(Into::into) +} + +#[instrument( + skip_all, + fields(%location_id, library_id = %library.id), + err, +)] +pub(super) async fn check_online( + location_ids_and_path::Data { + id: 
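// --- Illustrative sketch (not part of the patch) ---
// The event-loop shape of `runner::run` above: instead of a `select!` loop, the
// message channels and a tick interval are mapped into one enum and merged into a
// single stream. Crate assumptions: `async-channel`, `futures`, `futures-concurrency`,
// `tokio` (rt, time, macros) and `tokio-stream`.
use std::{pin::pin, time::Duration};

use async_channel as chan;
use futures::StreamExt;
use futures_concurrency::stream::Merge;
use tokio::time::{interval, MissedTickBehavior};
use tokio_stream::wrappers::IntervalStream;

enum StreamMessage {
    Work(u32),
    Tick,
    Stop,
}

#[tokio::main]
async fn main() {
    let (work_tx, work_rx) = chan::bounded::<u32>(8);
    let (stop_tx, stop_rx) = chan::bounded::<()>(1);

    let mut tick = interval(Duration::from_millis(10));
    tick.set_missed_tick_behavior(MissedTickBehavior::Skip);

    tokio::spawn(async move {
        work_tx.send(1).await.ok();
        work_tx.send(2).await.ok();
        stop_tx.send(()).await.ok();
    });

    // Every branch is mapped into the same enum so the streams can be merged.
    let mut msg_stream = pin!((
        work_rx.map(StreamMessage::Work),
        IntervalStream::new(tick).map(|_| StreamMessage::Tick),
        stop_rx.map(|()| StreamMessage::Stop),
    )
        .merge());

    while let Some(msg) = msg_stream.next().await {
        match msg {
            StreamMessage::Work(n) => println!("handling work item {n}"),
            StreamMessage::Tick => println!("periodic check"),
            StreamMessage::Stop => break,
        }
    }
}
// --- end sketch ---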
location_id, + pub_id, + instance_id, + path, + }: &location_ids_and_path::Data, + node: &Node, + library: &Library, +) -> Result { + let pub_id = Uuid::from_slice(pub_id)?; + + // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. + if *instance_id == Some(library.config().await.instance_id) { + match fs::metadata(maybe_missing(path, "location.path")?).await { + Ok(_) => { + node.locations.add_online(pub_id).await; + Ok(true) + } + Err(e) if e.kind() == ErrorKind::NotFound => { + node.locations.remove_online(&pub_id).await; + Ok(false) + } + Err(e) => { + error!( + ?e, + "Failed to check if location is online, will consider as offline;" + ); + Ok(false) + } + } + } else { + // In this case, we don't have a `local_path`, but this location was marked as online + node.locations.remove_online(&pub_id).await; + Err(LocationManagerError::NonLocalLocation(*location_id)) + } +} diff --git a/core/src/location/manager/watcher/android.rs b/core/src/location/manager/watcher/android.rs index e60d6515c..01bd8a2a1 100644 --- a/core/src/location/manager/watcher/android.rs +++ b/core/src/location/manager/watcher/android.rs @@ -12,40 +12,35 @@ use std::{ sync::Arc, }; -use async_trait::async_trait; use notify::{ event::{CreateKind, DataChange, ModifyKind, RenameMode}, Event, EventKind, }; use tokio::{fs, time::Instant}; -use tracing::{debug, error, trace}; +use tracing::{error, instrument, trace}; use super::{ utils::{create_dir, recalculate_directories_size, remove, rename, update_file}, - EventHandler, HUNDRED_MILLIS, ONE_SECOND, + HUNDRED_MILLIS, ONE_SECOND, }; #[derive(Debug)] -pub(super) struct AndroidEventHandler<'lib> { +pub(super) struct EventHandler { location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, + library: Arc, + node: Arc, last_events_eviction_check: Instant, rename_from: HashMap, recently_renamed_from: BTreeMap, files_to_update: HashMap, reincident_to_update_files: HashMap, to_recalculate_size: HashMap, + path_and_instant_buffer: Vec<(PathBuf, Instant)>, } -#[async_trait] -impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self { +impl super::EventHandler for EventHandler { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self { Self { location_id, library, @@ -60,8 +55,19 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { } } + #[instrument( + skip_all, + fields( + location_id = %self.location_id, + library_id = %self.library.id, + waiting_rename_count = %self.recently_renamed_from.len(), + waiting_update_count = %self.files_to_update.len(), + reincident_to_update_files_count = %self.reincident_to_update_files.len(), + waiting_size_count = %self.to_recalculate_size.len(), + ), + )] async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError> { - debug!("Received Android event: {:#?}", event); + trace!("Received Android event"); let Event { kind, mut paths, .. 
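// --- Illustrative sketch (not part of the patch) ---
// The core of `check_online` above is a metadata probe on the location path:
// `NotFound` means offline, any other I/O error is logged and also treated as offline.
// Crate assumption: `tokio` with the `fs` feature; `is_online` is a hypothetical helper.
use std::{io::ErrorKind, path::Path};

async fn is_online(path: &Path) -> bool {
    match tokio::fs::metadata(path).await {
        Ok(_) => true,
        Err(e) if e.kind() == ErrorKind::NotFound => false,
        Err(e) => {
            eprintln!("failed to stat {}, considering it offline: {e}", path.display());
            false
        }
    }
}

#[tokio::main]
async fn main() {
    println!("online: {}", is_online(Path::new("/")).await);
}
// --- end sketch ---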
@@ -70,7 +76,7 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { match kind { EventKind::Create(CreateKind::File) | EventKind::Modify(ModifyKind::Data(DataChange::Any)) => { - // When we receive a create, modify data or metadata events of the abore kinds + // When we receive a create, modify data or metadata events of the above kinds // we just mark the file to be updated in a near future // each consecutive event of these kinds that we receive for the same file // we just store the path again in the map below, with a new instant @@ -101,13 +107,14 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { &fs::metadata(path) .await .map_err(|e| FileIOError::from((path, e)))?, - self.node, - self.library, + &self.node, + &self.library, ) .await?; } + EventKind::Modify(ModifyKind::Name(RenameMode::From)) => { - // Just in case we can't garantee that we receive the Rename From event before the + // Just in case we can't guarantee that we receive the Rename From event before the // Rename Both event. Just a safeguard if self.recently_renamed_from.remove(&paths[0]).is_none() { self.rename_from.insert(paths.remove(0), Instant::now()); @@ -115,23 +122,25 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { } EventKind::Modify(ModifyKind::Name(RenameMode::Both)) => { - let from_path = &paths[0]; - let to_path = &paths[1]; + let to_path = paths.remove(1); + let from_path = paths.remove(0); + + self.rename_from.remove(&from_path); - self.rename_from.remove(from_path); rename( self.location_id, - to_path, - from_path, - fs::metadata(to_path) + &to_path, + &from_path, + fs::metadata(&to_path) .await - .map_err(|e| FileIOError::from((to_path, e)))?, - self.library, + .map_err(|e| FileIOError::from((&to_path, e)))?, + &self.library, ) .await?; - self.recently_renamed_from - .insert(paths.swap_remove(0), Instant::now()); + + self.recently_renamed_from.insert(from_path, Instant::now()); } + EventKind::Remove(_) => { let path = paths.remove(0); if let Some(parent) = path.parent() { @@ -141,10 +150,11 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { } } - remove(self.location_id, &path, self.library).await?; + remove(self.location_id, &path, &self.library).await?; } - other_event_kind => { - trace!("Other Linux event that we don't handle for now: {other_event_kind:#?}"); + + _ => { + trace!("Other Android event that we don't handle for now"); } } @@ -154,11 +164,14 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { async fn tick(&mut self) { if self.last_events_eviction_check.elapsed() > HUNDRED_MILLIS { if let Err(e) = self.handle_to_update_eviction().await { - error!("Error while handling recently created or update files eviction: {e:#?}"); + error!( + ?e, + "Error while handling recently created or update files eviction;" + ); } if let Err(e) = self.handle_rename_from_eviction().await { - error!("Failed to remove file_path: {e:#?}"); + error!(?e, "Failed to remove file_path;"); } self.recently_renamed_from @@ -169,11 +182,11 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { &mut self.to_recalculate_size, &mut self.path_and_instant_buffer, self.location_id, - self.library, + &self.library, ) .await { - error!("Failed to recalculate directories size: {e:#?}"); + error!(?e, "Failed to recalculate directories size;"); } } @@ -182,9 +195,10 @@ impl<'lib> EventHandler<'lib> for AndroidEventHandler<'lib> { } } -impl AndroidEventHandler<'_> { +impl EventHandler { async fn handle_to_update_eviction(&mut self) -> Result<(), 
LocationManagerError> { self.path_and_instant_buffer.clear(); + let mut should_invalidate = false; for (path, created_at) in self.files_to_update.drain() { @@ -197,8 +211,11 @@ impl AndroidEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.reincident_to_update_files.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } @@ -221,14 +238,17 @@ impl AndroidEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.files_to_update.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } if should_invalidate { - invalidate_query!(self.library, "search.paths"); + invalidate_query!(&self.library, "search.paths"); } self.reincident_to_update_files @@ -249,21 +269,23 @@ impl AndroidEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; + + remove(self.location_id, &path, &self.library).await?; + should_invalidate = true; - trace!("Removed file_path due timeout: {}", path.display()); + + trace!(path = %path.display(), "Removed file_path due timeout;"); } else { self.path_and_instant_buffer.push((path, instant)); } } if should_invalidate { - invalidate_query!(self.library, "search.paths"); + invalidate_query!(&self.library, "search.paths"); } - for (path, instant) in self.path_and_instant_buffer.drain(..) { - self.rename_from.insert(path, instant); - } + self.rename_from + .extend(self.path_and_instant_buffer.drain(..)); Ok(()) } diff --git a/core/src/location/manager/watcher/ios.rs b/core/src/location/manager/watcher/ios.rs index 63f512e52..3a9c91500 100644 --- a/core/src/location/manager/watcher/ios.rs +++ b/core/src/location/manager/watcher/ios.rs @@ -15,45 +15,40 @@ use std::{ sync::Arc, }; -use async_trait::async_trait; use notify::{ event::{CreateKind, DataChange, MetadataKind, ModifyKind, RenameMode}, Event, EventKind, }; use tokio::{fs, io, time::Instant}; -use tracing::{debug, error, trace, warn}; +use tracing::{error, instrument, trace, warn}; use super::{ utils::{ create_dir, create_file, extract_inode_from_path, extract_location_path, recalculate_directories_size, remove, rename, update_file, }, - EventHandler, INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, + INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, }; #[derive(Debug)] -pub(super) struct IosEventHandler<'lib> { +pub(super) struct EventHandler { location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - files_to_update: HashMap, - reincident_to_update_files: HashMap, + library: Arc, + node: Arc, last_events_eviction_check: Instant, latest_created_dir: Option, old_paths_map: HashMap, new_paths_map: HashMap, - paths_map_buffer: Vec<(INode, InstantAndPath)>, + files_to_update: HashMap, + reincident_to_update_files: HashMap, to_recalculate_size: HashMap, + path_and_instant_buffer: Vec<(PathBuf, Instant)>, + paths_map_buffer: Vec<(INode, InstantAndPath)>, } -#[async_trait] -impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self +impl super::EventHandler for EventHandler { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self where Self: Sized, { @@ -61,38 +56,54 @@ impl<'lib> EventHandler<'lib> for 
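// --- Illustrative sketch (not part of the patch) ---
// The debounce used by the watcher event handlers above: repeated events for the same
// path only refresh its `Instant`, and the periodic tick flushes paths whose last event
// is older than one second. Std-only; `Debouncer` and its method names are hypothetical.
use std::{
    collections::HashMap,
    path::{Path, PathBuf},
    time::{Duration, Instant},
};

const ONE_SECOND: Duration = Duration::from_secs(1);

#[derive(Default)]
struct Debouncer {
    files_to_update: HashMap<PathBuf, Instant>,
}

impl Debouncer {
    // Called on every create/modify event: inserting again resets the timer.
    fn on_event(&mut self, path: &Path) {
        self.files_to_update.insert(path.to_path_buf(), Instant::now());
    }

    // Called from the periodic tick: returns the paths that are quiet long enough to flush.
    fn evict_ready(&mut self) -> Vec<PathBuf> {
        let mut ready = Vec::new();
        self.files_to_update.retain(|path, last_event| {
            if last_event.elapsed() > ONE_SECOND {
                ready.push(path.clone());
                false // drop from the pending map
            } else {
                true // keep waiting, more events may still arrive
            }
        });
        ready
    }
}

fn main() {
    let mut debouncer = Debouncer::default();
    debouncer.on_event(Path::new("/tmp/a.txt"));
    // Immediately after the event nothing is ready yet...
    assert!(debouncer.evict_ready().is_empty());
    // ...and once the quiet period has passed, `evict_ready` would return the path.
}
// --- end sketch ---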
IosEventHandler<'lib> { location_id, library, node, - files_to_update: HashMap::new(), - reincident_to_update_files: HashMap::new(), last_events_eviction_check: Instant::now(), latest_created_dir: None, old_paths_map: HashMap::new(), new_paths_map: HashMap::new(), - paths_map_buffer: Vec::new(), + files_to_update: HashMap::new(), + reincident_to_update_files: HashMap::new(), to_recalculate_size: HashMap::new(), path_and_instant_buffer: Vec::new(), + paths_map_buffer: Vec::new(), } } + #[instrument( + skip_all, + fields( + location_id = %self.location_id, + library_id = %self.library.id, + latest_created_dir = ?self.latest_created_dir, + old_paths_map_count = %self.old_paths_map.len(), + new_paths_map = %self.new_paths_map.len(), + waiting_update_count = %self.files_to_update.len(), + reincident_to_update_files_count = %self.reincident_to_update_files.len(), + waiting_size_count = %self.to_recalculate_size.len(), + ), + )] async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError> { + trace!("Received iOS event"); + let Event { kind, mut paths, .. } = event; match kind { EventKind::Create(CreateKind::Folder) => { - let path = &paths[0]; + let path = paths.remove(0); create_dir( self.location_id, - path, - &fs::metadata(path) + &path, + &fs::metadata(&path) .await - .map_err(|e| FileIOError::from((path, e)))?, - self.node, - self.library, + .map_err(|e| FileIOError::from((&path, e)))?, + &self.node, + &self.library, ) .await?; - self.latest_created_dir = Some(paths.remove(0)); + + self.latest_created_dir = Some(path); } EventKind::Create(CreateKind::File) @@ -100,12 +111,13 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { | EventKind::Modify(ModifyKind::Metadata( MetadataKind::WriteTime | MetadataKind::Extended, )) => { - // When we receive a create, modify data or metadata events of the abore kinds + // When we receive a create, modify data or metadata events of the above kinds // we just mark the file to be updated in a near future // each consecutive event of these kinds that we receive for the same file // we just store the path again in the map below, with a new instant // that effectively resets the timer for the file to be updated <- Copied from macos.rs let path = paths.remove(0); + if self.files_to_update.contains_key(&path) { if let Some(old_instant) = self.files_to_update.insert(path.clone(), Instant::now()) @@ -118,6 +130,7 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { self.files_to_update.insert(path, Instant::now()); } } + EventKind::Modify(ModifyKind::Name(RenameMode::Any)) => { self.handle_single_rename_event(paths.remove(0)).await?; } @@ -125,18 +138,22 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { // For some reason, iOS doesn't have a Delete Event, so the vent type comes up as this. 
// Delete Event EventKind::Modify(ModifyKind::Metadata(MetadataKind::Any)) => { - debug!("File has been deleted: {:#?}", paths); let path = paths.remove(0); + + trace!(path = %path.display(), "File has been deleted;"); + if let Some(parent) = path.parent() { if parent != Path::new("") { self.to_recalculate_size .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; //FIXME: Find out why this freezes the watcher + + remove(self.location_id, &path, &self.library).await?; //FIXME: Find out why this freezes the watcher } - other_event_kind => { - trace!("Other iOS event that we don't handle for now: {other_event_kind:#?}"); + + _ => { + trace!("Other iOS event that we don't handle for now"); } } @@ -146,16 +163,19 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { async fn tick(&mut self) { if self.last_events_eviction_check.elapsed() > HUNDRED_MILLIS { if let Err(e) = self.handle_to_update_eviction().await { - error!("Error while handling recently created or update files eviction: {e:#?}"); + error!( + ?e, + "Error while handling recently created or update files eviction;" + ); } // Cleaning out recently renamed files that are older than 100 milliseconds if let Err(e) = self.handle_rename_create_eviction().await { - error!("Failed to create file_path on iOS : {e:#?}"); + error!(?e, "Failed to create file_path on iOS;"); } if let Err(e) = self.handle_rename_remove_eviction().await { - error!("Failed to remove file_path: {e:#?}"); + error!(?e, "Failed to remove file_path;"); } if !self.to_recalculate_size.is_empty() { @@ -163,11 +183,11 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { &mut self.to_recalculate_size, &mut self.path_and_instant_buffer, self.location_id, - self.library, + &self.library, ) .await { - error!("Failed to recalculate directories size: {e:#?}"); + error!(?e, "Failed to recalculate directories size;"); } } @@ -176,7 +196,7 @@ impl<'lib> EventHandler<'lib> for IosEventHandler<'lib> { } } -impl IosEventHandler<'_> { +impl EventHandler { async fn handle_to_update_eviction(&mut self) -> Result<(), LocationManagerError> { self.path_and_instant_buffer.clear(); let mut should_invalidate = false; @@ -191,8 +211,11 @@ impl IosEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.reincident_to_update_files.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } @@ -215,8 +238,11 @@ impl IosEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.files_to_update.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } @@ -246,8 +272,14 @@ impl IosEventHandler<'_> { if metadata.is_dir() { // Don't need to dispatch a recalculate directory event as `create_dir` dispatches // a `scan_location_sub_path` function, which recalculates the size already - create_dir(self.location_id, &path, &metadata, self.node, self.library) - .await?; + create_dir( + self.location_id, + &path, + &metadata, + &self.node, + &self.library, + ) + .await?; } else { if let Some(parent) = path.parent() { if parent != Path::new("") { @@ -255,11 +287,19 @@ impl IosEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - create_file(self.location_id, &path, &metadata, self.node, self.library) - .await?; + + 
create_file( + self.location_id, + &path, + &metadata, + &self.node, + &self.library, + ) + .await?; } - trace!("Created file_path due timeout: {}", path.display()); + trace!(path = %path.display(), "Created file_path due timeout;"); + should_invalidate = true; } } else { @@ -289,8 +329,11 @@ impl IosEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; - trace!("Removed file_path due timeout: {}", path.display()); + + remove(self.location_id, &path, &self.library).await?; + + trace!(path = %path.display(), "Removed file_path due timeout;"); + should_invalidate = true; } else { self.paths_map_buffer.push((inode, (instant, path))); @@ -313,10 +356,10 @@ impl IosEventHandler<'_> { match fs::metadata(&path).await { Ok(meta) => { // File or directory exists, so this can be a "new path" to an actual rename/move or a creation - trace!("Path exists: {}", path.display()); + trace!(path = %path.display(), "Path exists;"); let inode = get_inode(&meta); - let location_path = extract_location_path(self.location_id, self.library).await?; + let location_path = extract_location_path(self.location_id, &self.library).await?; if !check_file_path_exists::( &IsolatedFilePathData::new( @@ -331,21 +374,22 @@ impl IosEventHandler<'_> { { if let Some((_, old_path)) = self.old_paths_map.remove(&inode) { trace!( - "Got a match new -> old: {} -> {}", - path.display(), - old_path.display() + old_path = %old_path.display(), + new_path = %path.display(), + "Got a match new -> old;", ); // We found a new path for this old path, so we can rename it - rename(self.location_id, &path, &old_path, meta, self.library).await?; + rename(self.location_id, &path, &old_path, meta, &self.library).await?; } else { - trace!("No match for new path yet: {}", path.display()); + trace!(path = %path.display(), "No match for new path yet;"); + self.new_paths_map.insert(inode, (Instant::now(), path)); } } else { warn!( - "Received rename event for a file that already exists in the database: {}", - path.display() + path = %path.display(), + "Received rename event for a file that already exists in the database;", ); } } @@ -353,23 +397,25 @@ impl IosEventHandler<'_> { // File or directory does not exist in the filesystem, if it exists in the database, // then we try pairing it with the old path from our map - trace!("Path doesn't exists: {}", path.display()); + trace!(path = %path.display(), "Path doesn't exists;"); let inode = - match extract_inode_from_path(self.location_id, &path, self.library).await { + match extract_inode_from_path(self.location_id, &path, &self.library).await { Ok(inode) => inode, + Err(LocationManagerError::FilePath(FilePathError::NotFound(_))) => { // temporary file, we can ignore it return Ok(()); } + Err(e) => return Err(e), }; if let Some((_, new_path)) = self.new_paths_map.remove(&inode) { trace!( - "Got a match old -> new: {} -> {}", - path.display(), - new_path.display() + old_path = %path.display(), + new_path = %new_path.display(), + "Got a match old -> new;", ); // We found a new path for this old path, so we can rename it @@ -380,11 +426,12 @@ impl IosEventHandler<'_> { fs::metadata(&new_path) .await .map_err(|e| FileIOError::from((&new_path, e)))?, - self.library, + &self.library, ) .await?; } else { - trace!("No match for old path yet: {}", path.display()); + trace!(path = %path.display(), "No match for old path yet;"); + // We didn't find a new path for this old path, so we store ir for later self.old_paths_map.insert(inode, 
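// --- Illustrative sketch (not part of the patch) ---
// How the iOS/macOS handlers above pair the two halves of a rename using the file's
// inode: a "path exists" event looks for a pending old path with the same inode, a
// "path gone" event looks for a pending new path, and unmatched halves wait in a map
// until the eviction tick gives up on them. Std-only; the inode is just a u64 here and
// `Rename`/`RenameMatcher` are hypothetical names.
use std::{collections::HashMap, path::PathBuf, time::Instant};

type INode = u64;

#[derive(Debug, PartialEq)]
struct Rename {
    from: PathBuf,
    to: PathBuf,
}

#[derive(Default)]
struct RenameMatcher {
    old_paths: HashMap<INode, (Instant, PathBuf)>,
    new_paths: HashMap<INode, (Instant, PathBuf)>,
}

impl RenameMatcher {
    // The path currently exists on disk: either the target of a rename or a brand-new file.
    fn on_path_exists(&mut self, inode: INode, path: PathBuf) -> Option<Rename> {
        if let Some((_, old_path)) = self.old_paths.remove(&inode) {
            Some(Rename { from: old_path, to: path })
        } else {
            self.new_paths.insert(inode, (Instant::now(), path));
            None
        }
    }

    // The path no longer exists on disk: either the source of a rename or a removal.
    fn on_path_gone(&mut self, inode: INode, path: PathBuf) -> Option<Rename> {
        if let Some((_, new_path)) = self.new_paths.remove(&inode) {
            Some(Rename { from: path, to: new_path })
        } else {
            self.old_paths.insert(inode, (Instant::now(), path));
            None
        }
    }
}

fn main() {
    let mut matcher = RenameMatcher::default();
    assert_eq!(matcher.on_path_gone(7, "/tmp/old_name".into()), None);
    let paired = matcher.on_path_exists(7, "/tmp/new_name".into());
    assert_eq!(
        paired,
        Some(Rename { from: "/tmp/old_name".into(), to: "/tmp/new_name".into() })
    );
}
// --- end sketch ---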
(Instant::now(), path)); } diff --git a/core/src/location/manager/watcher/linux.rs b/core/src/location/manager/watcher/linux.rs index 43bf0edf3..0ec459a3c 100644 --- a/core/src/location/manager/watcher/linux.rs +++ b/core/src/location/manager/watcher/linux.rs @@ -17,40 +17,35 @@ use std::{ sync::Arc, }; -use async_trait::async_trait; use notify::{ event::{CreateKind, DataChange, ModifyKind, RenameMode}, Event, EventKind, }; use tokio::{fs, time::Instant}; -use tracing::{error, trace}; +use tracing::{error, instrument, trace}; use super::{ utils::{create_dir, recalculate_directories_size, remove, rename, update_file}, - EventHandler, HUNDRED_MILLIS, ONE_SECOND, + HUNDRED_MILLIS, ONE_SECOND, }; #[derive(Debug)] -pub(super) struct LinuxEventHandler<'lib> { +pub(super) struct EventHandler { location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, + library: Arc, + node: Arc, last_events_eviction_check: Instant, rename_from: HashMap, recently_renamed_from: BTreeMap, files_to_update: HashMap, reincident_to_update_files: HashMap, to_recalculate_size: HashMap, + path_and_instant_buffer: Vec<(PathBuf, Instant)>, } -#[async_trait] -impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self { +impl super::EventHandler for EventHandler { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self { Self { location_id, library, @@ -65,8 +60,19 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { } } + #[instrument( + skip_all, + fields( + location_id = %self.location_id, + library_id = %self.library.id, + waiting_rename_count = %self.recently_renamed_from.len(), + waiting_update_count = %self.files_to_update.len(), + reincident_to_update_files_count = %self.reincident_to_update_files.len(), + waiting_size_count = %self.to_recalculate_size.len(), + ), + )] async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError> { - trace!("Received Linux event: {:#?}", event); + trace!("Received Linux event"); let Event { kind, mut paths, .. @@ -81,6 +87,7 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { // we just store the path again in the map below, with a new instant // that effectively resets the timer for the file to be updated let path = paths.remove(0); + if self.files_to_update.contains_key(&path) { if let Some(old_instant) = self.files_to_update.insert(path.clone(), Instant::now()) @@ -95,22 +102,23 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { } EventKind::Create(CreateKind::Folder) => { - let path = &paths[0]; + let path = paths.remove(0); // Don't need to dispatch a recalculate directory event as `create_dir` dispatches // a `scan_location_sub_path` function, which recalculates the size already create_dir( self.location_id, - path, - &fs::metadata(path) + &path, + &fs::metadata(&path) .await - .map_err(|e| FileIOError::from((path, e)))?, - self.node, - self.library, + .map_err(|e| FileIOError::from((&path, e)))?, + &self.node, + &self.library, ) .await?; } + EventKind::Modify(ModifyKind::Name(RenameMode::From)) => { // Just in case we can't guarantee that we receive the Rename From event before the // Rename Both event. 
Just a safeguard @@ -120,23 +128,24 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { } EventKind::Modify(ModifyKind::Name(RenameMode::Both)) => { - let from_path = &paths[0]; - let to_path = &paths[1]; + let to_path = paths.remove(1); + let from_path = paths.remove(0); - self.rename_from.remove(from_path); + self.rename_from.remove(&from_path); rename( self.location_id, - to_path, - from_path, - fs::metadata(to_path) + &to_path, + &from_path, + fs::metadata(&to_path) .await - .map_err(|e| FileIOError::from((to_path, e)))?, - self.library, + .map_err(|e| FileIOError::from((&to_path, e)))?, + &self.library, ) .await?; - self.recently_renamed_from - .insert(paths.swap_remove(0), Instant::now()); + + self.recently_renamed_from.insert(from_path, Instant::now()); } + EventKind::Remove(_) => { let path = paths.remove(0); if let Some(parent) = path.parent() { @@ -146,10 +155,11 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { } } - remove(self.location_id, &path, self.library).await?; + remove(self.location_id, &path, &self.library).await?; } - other_event_kind => { - trace!("Other Linux event that we don't handle for now: {other_event_kind:#?}"); + + _ => { + trace!("Other Linux event that we don't handle for now"); } } @@ -159,11 +169,14 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { async fn tick(&mut self) { if self.last_events_eviction_check.elapsed() > HUNDRED_MILLIS { if let Err(e) = self.handle_to_update_eviction().await { - error!("Error while handling recently created or update files eviction: {e:#?}"); + error!( + ?e, + "Error while handling recently created or update files eviction;" + ); } if let Err(e) = self.handle_rename_from_eviction().await { - error!("Failed to remove file_path: {e:#?}"); + error!(?e, "Failed to remove file_path;"); } self.recently_renamed_from @@ -174,11 +187,11 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { &mut self.to_recalculate_size, &mut self.path_and_instant_buffer, self.location_id, - self.library, + &self.library, ) .await { - error!("Failed to recalculate directories size: {e:#?}"); + error!(?e, "Failed to recalculate directories size;"); } } @@ -187,9 +200,10 @@ impl<'lib> EventHandler<'lib> for LinuxEventHandler<'lib> { } } -impl LinuxEventHandler<'_> { +impl EventHandler { async fn handle_to_update_eviction(&mut self) -> Result<(), LocationManagerError> { self.path_and_instant_buffer.clear(); + let mut should_invalidate = false; for (path, created_at) in self.files_to_update.drain() { @@ -202,8 +216,11 @@ impl LinuxEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.reincident_to_update_files.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } @@ -226,8 +243,11 @@ impl LinuxEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } + self.files_to_update.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + + update_file(self.location_id, &path, &self.node, &self.library).await?; + should_invalidate = true; } } @@ -244,6 +264,7 @@ impl LinuxEventHandler<'_> { async fn handle_rename_from_eviction(&mut self) -> Result<(), LocationManagerError> { self.path_and_instant_buffer.clear(); + let mut should_invalidate = false; for (path, instant) in self.rename_from.drain() { @@ -254,9 +275,12 @@ impl LinuxEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - 
remove(self.location_id, &path, self.library).await?; + + remove(self.location_id, &path, &self.library).await?; + should_invalidate = true; - trace!("Removed file_path due timeout: {}", path.display()); + + trace!(path = %path.display(), "Removed file_path due timeout;"); } else { self.path_and_instant_buffer.push((path, instant)); } @@ -266,9 +290,8 @@ impl LinuxEventHandler<'_> { invalidate_query!(self.library, "search.paths"); } - for (path, instant) in self.path_and_instant_buffer.drain(..) { - self.rename_from.insert(path, instant); - } + self.rename_from + .extend(self.path_and_instant_buffer.drain(..)); Ok(()) } diff --git a/core/src/location/manager/watcher/macos.rs b/core/src/location/manager/watcher/macos.rs index 99107e375..11486cd20 100644 --- a/core/src/location/manager/watcher/macos.rs +++ b/core/src/location/manager/watcher/macos.rs @@ -24,45 +24,40 @@ use std::{ sync::Arc, }; -use async_trait::async_trait; use notify::{ event::{CreateKind, DataChange, MetadataKind, ModifyKind, RenameMode}, Event, EventKind, }; use tokio::{fs, io, time::Instant}; -use tracing::{error, trace, warn}; +use tracing::{error, instrument, trace, warn}; use super::{ utils::{ create_dir, create_file, extract_inode_from_path, extract_location_path, recalculate_directories_size, remove, rename, update_file, }, - EventHandler, INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, + INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, }; #[derive(Debug)] -pub(super) struct MacOsEventHandler<'lib> { +pub(super) struct EventHandler { location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - files_to_update: HashMap, - reincident_to_update_files: HashMap, + library: Arc, + node: Arc, last_events_eviction_check: Instant, latest_created_dir: Option, old_paths_map: HashMap, new_paths_map: HashMap, - paths_map_buffer: Vec<(INode, InstantAndPath)>, + files_to_update: HashMap, + reincident_to_update_files: HashMap, to_recalculate_size: HashMap, + path_and_instant_buffer: Vec<(PathBuf, Instant)>, + paths_map_buffer: Vec<(INode, InstantAndPath)>, } -#[async_trait] -impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self +impl super::EventHandler for EventHandler { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self where Self: Sized, { @@ -70,20 +65,33 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { location_id, library, node, - files_to_update: HashMap::new(), - reincident_to_update_files: HashMap::new(), last_events_eviction_check: Instant::now(), latest_created_dir: None, old_paths_map: HashMap::new(), new_paths_map: HashMap::new(), - paths_map_buffer: Vec::new(), + files_to_update: HashMap::new(), + reincident_to_update_files: HashMap::new(), to_recalculate_size: HashMap::new(), path_and_instant_buffer: Vec::new(), + paths_map_buffer: Vec::new(), } } + #[instrument( + skip_all, + fields( + location_id = %self.location_id, + library_id = %self.library.id, + latest_created_dir = ?self.latest_created_dir, + old_paths_map_count = %self.old_paths_map.len(), + new_paths_map = %self.new_paths_map.len(), + waiting_update_count = %self.files_to_update.len(), + reincident_to_update_files_count = %self.reincident_to_update_files.len(), + waiting_size_count = %self.to_recalculate_size.len(), + ), + )] async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError> { - trace!("Received MacOS event: {:#?}", event); + trace!("Received MacOS event"); let 
Event { kind, mut paths, .. @@ -91,8 +99,9 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { match kind { EventKind::Create(CreateKind::Folder) => { - let path = &paths[0]; - if let Some(ref latest_created_dir) = self.latest_created_dir.take() { + let path = paths.remove(0); + + if let Some(latest_created_dir) = self.latest_created_dir.take() { if path == latest_created_dir { // NOTE: This is a MacOS specific event that happens when a folder is created // trough Finder. It creates a folder but 2 events are triggered in @@ -105,18 +114,27 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { // Don't need to dispatch a recalculate directory event as `create_dir` dispatches // a `scan_location_sub_path` function, which recalculates the size already + let metadata = match fs::metadata(&path).await { + Ok(metadata) => metadata, + Err(e) if e.kind() == io::ErrorKind::NotFound => { + // temporary file, bailing out + return Ok(()); + } + Err(e) => return Err(FileIOError::from((&path, e)).into()), + }; + create_dir( self.location_id, - path, - &fs::metadata(path) - .await - .map_err(|e| FileIOError::from((path, e)))?, - self.node, - self.library, + &path, + &metadata, + &self.node, + &self.library, ) .await?; - self.latest_created_dir = Some(paths.remove(0)); + + self.latest_created_dir = Some(path); } + EventKind::Create(CreateKind::File) | EventKind::Modify(ModifyKind::Data(DataChange::Content)) | EventKind::Modify(ModifyKind::Metadata( @@ -128,6 +146,7 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { // we just store the path again in the map below, with a new instant // that effectively resets the timer for the file to be updated let path = paths.remove(0); + if self.files_to_update.contains_key(&path) { if let Some(old_instant) = self.files_to_update.insert(path.clone(), Instant::now()) @@ -140,22 +159,24 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { self.files_to_update.insert(path, Instant::now()); } } + EventKind::Modify(ModifyKind::Name(RenameMode::Any)) => { self.handle_single_rename_event(paths.remove(0)).await?; } EventKind::Remove(_) => { let path = paths.remove(0); + if let Some(parent) = path.parent() { if parent != Path::new("") { self.to_recalculate_size .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; + remove(self.location_id, &path, &self.library).await?; } - other_event_kind => { - trace!("Other MacOS event that we don't handle for now: {other_event_kind:#?}"); + _ => { + trace!("Other MacOS event that we don't handle for now"); } } @@ -165,16 +186,19 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { async fn tick(&mut self) { if self.last_events_eviction_check.elapsed() > HUNDRED_MILLIS { if let Err(e) = self.handle_to_update_eviction().await { - error!("Error while handling recently created or update files eviction: {e:#?}"); + error!( + ?e, + "Error while handling recently created or update files eviction;" + ); } // Cleaning out recently renamed files that are older than 100 milliseconds if let Err(e) = self.handle_rename_create_eviction().await { - error!("Failed to create file_path on MacOS : {e:#?}"); + error!(?e, "Failed to create file_path on MacOS;"); } if let Err(e) = self.handle_rename_remove_eviction().await { - error!("Failed to remove file_path: {e:#?}"); + error!(?e, "Failed to remove file_path;"); } if !self.to_recalculate_size.is_empty() { @@ -182,11 +206,11 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { &mut 
self.to_recalculate_size, &mut self.path_and_instant_buffer, self.location_id, - self.library, + &self.library, ) .await { - error!("Failed to recalculate directories size: {e:#?}"); + error!(?e, "Failed to recalculate directories size;"); } } @@ -195,9 +219,10 @@ impl<'lib> EventHandler<'lib> for MacOsEventHandler<'lib> { } } -impl MacOsEventHandler<'_> { +impl EventHandler { async fn handle_to_update_eviction(&mut self) -> Result<(), LocationManagerError> { self.path_and_instant_buffer.clear(); + let mut should_invalidate = false; for (path, created_at) in self.files_to_update.drain() { @@ -211,7 +236,7 @@ impl MacOsEventHandler<'_> { } } self.reincident_to_update_files.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + update_file(self.location_id, &path, &self.node, &self.library).await?; should_invalidate = true; } } @@ -235,7 +260,7 @@ impl MacOsEventHandler<'_> { } } self.files_to_update.remove(&path); - update_file(self.location_id, &path, self.node, self.library).await?; + update_file(self.location_id, &path, &self.node, &self.library).await?; should_invalidate = true; } } @@ -253,20 +278,32 @@ impl MacOsEventHandler<'_> { async fn handle_rename_create_eviction(&mut self) -> Result<(), LocationManagerError> { // Just to make sure that our buffer is clean self.paths_map_buffer.clear(); + let mut should_invalidate = false; for (inode, (instant, path)) in self.new_paths_map.drain() { if instant.elapsed() > HUNDRED_MILLIS { if !self.files_to_update.contains_key(&path) { - let metadata = fs::metadata(&path) - .await - .map_err(|e| FileIOError::from((&path, e)))?; + let metadata = match fs::metadata(&path).await { + Ok(metadata) => metadata, + Err(e) if e.kind() == io::ErrorKind::NotFound => { + // temporary file, bailing out + return Ok(()); + } + Err(e) => return Err(FileIOError::from((&path, e)).into()), + }; if metadata.is_dir() { // Don't need to dispatch a recalculate directory event as `create_dir` dispatches // a `scan_location_sub_path` function, which recalculates the size already - create_dir(self.location_id, &path, &metadata, self.node, self.library) - .await?; + create_dir( + self.location_id, + &path, + &metadata, + &self.node, + &self.library, + ) + .await?; } else { if let Some(parent) = path.parent() { if parent != Path::new("") { @@ -274,11 +311,18 @@ impl MacOsEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - create_file(self.location_id, &path, &metadata, self.node, self.library) - .await?; + create_file( + self.location_id, + &path, + &metadata, + &self.node, + &self.library, + ) + .await?; } - trace!("Created file_path due timeout: {}", path.display()); + trace!(path = %path.display(), "Created file_path due timeout;"); + should_invalidate = true; } } else { @@ -298,6 +342,7 @@ impl MacOsEventHandler<'_> { async fn handle_rename_remove_eviction(&mut self) -> Result<(), LocationManagerError> { // Just to make sure that our buffer is clean self.paths_map_buffer.clear(); + let mut should_invalidate = false; for (inode, (instant, path)) in self.old_paths_map.drain() { @@ -308,8 +353,11 @@ impl MacOsEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; - trace!("Removed file_path due timeout: {}", path.display()); + + remove(self.location_id, &path, &self.library).await?; + + trace!(path = %path.display(), "Removed file_path due timeout;"); + should_invalidate = true; } else { self.paths_map_buffer.push((inode, (instant, path))); @@ 
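
Both the folder-creation branch and `handle_rename_create_eviction` above replace a hard `fs::metadata(...)?` with a match that treats `NotFound` as "this was a short-lived temporary file, bail out quietly" while still propagating every other I/O error. A small self-contained sketch of that shape (function and file names here are illustrative, not the crate's own):

```rust
use std::{io, path::Path};

use tokio::fs;

/// Stat a path that may have already vanished (e.g. an editor's temp file).
/// `Ok(None)` means "gone, nothing to do"; any other I/O error still bubbles up.
async fn metadata_or_bail(path: &Path) -> io::Result<Option<std::fs::Metadata>> {
    match fs::metadata(path).await {
        Ok(metadata) => Ok(Some(metadata)),
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(None),
        Err(e) => Err(e),
    }
}

#[tokio::main]
async fn main() -> io::Result<()> {
    if let Some(metadata) = metadata_or_bail(Path::new("some-file.txt")).await? {
        println!("still there, is_dir = {}", metadata.is_dir());
    } else {
        println!("already gone, skipping");
    }
    Ok(())
}
```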
-332,10 +380,10 @@ impl MacOsEventHandler<'_> { match fs::metadata(&path).await { Ok(meta) => { // File or directory exists, so this can be a "new path" to an actual rename/move or a creation - trace!("Path exists: {}", path.display()); + trace!(path = %path.display(), "Path exists;"); let inode = get_inode(&meta); - let location_path = extract_location_path(self.location_id, self.library).await?; + let location_path = extract_location_path(self.location_id, &self.library).await?; if !check_file_path_exists::( &IsolatedFilePathData::new( @@ -350,45 +398,49 @@ impl MacOsEventHandler<'_> { { if let Some((_, old_path)) = self.old_paths_map.remove(&inode) { trace!( - "Got a match new -> old: {} -> {}", - path.display(), - old_path.display() + new_path = %path.display(), + old_path = %old_path.display(), + "Got a match new -> old;", ); // We found a new path for this old path, so we can rename it - rename(self.location_id, &path, &old_path, meta, self.library).await?; + rename(self.location_id, &path, &old_path, meta, &self.library).await?; } else { - trace!("No match for new path yet: {}", path.display()); + trace!(path = %path.display(), "No match for new path yet;"); + self.new_paths_map.insert(inode, (Instant::now(), path)); } } else { warn!( - "Received rename event for a file that already exists in the database: {}", - path.display() + path = %path.display(), + "Received rename event for a file that already exists in the database;", ); } } + Err(e) if e.kind() == io::ErrorKind::NotFound => { // File or directory does not exist in the filesystem, if it exists in the database, // then we try pairing it with the old path from our map - trace!("Path doesn't exists: {}", path.display()); + trace!(path = %path.display(), "Path doesn't exists;"); let inode = - match extract_inode_from_path(self.location_id, &path, self.library).await { + match extract_inode_from_path(self.location_id, &path, &self.library).await { Ok(inode) => inode, + Err(LocationManagerError::FilePath(FilePathError::NotFound(_))) => { // temporary file, we can ignore it return Ok(()); } + Err(e) => return Err(e), }; if let Some((_, new_path)) = self.new_paths_map.remove(&inode) { trace!( - "Got a match old -> new: {} -> {}", - path.display(), - new_path.display() + old_path = %path.display(), + new_path = %new_path.display(), + "Got a match old -> new;", ); // We found a new path for this old path, so we can rename it @@ -399,15 +451,17 @@ impl MacOsEventHandler<'_> { fs::metadata(&new_path) .await .map_err(|e| FileIOError::from((&new_path, e)))?, - self.library, + &self.library, ) .await?; } else { - trace!("No match for old path yet: {}", path.display()); + trace!(path = %path.display(), "No match for old path yet;"); + // We didn't find a new path for this old path, so we store ir for later self.old_paths_map.insert(inode, (Instant::now(), path)); } } + Err(e) => return Err(FileIOError::from((path, e)).into()), } diff --git a/core/src/location/manager/watcher/mod.rs b/core/src/location/manager/watcher/mod.rs index d6d70b77f..a7950d4fd 100644 --- a/core/src/location/manager/watcher/mod.rs +++ b/core/src/location/manager/watcher/mod.rs @@ -1,25 +1,31 @@ use crate::{library::Library, Node}; -use sd_prisma::prisma::location; +use sd_core_indexer_rules::{IndexerRule, IndexerRuler}; +use sd_core_prisma_helpers::{location_ids_and_path, location_with_indexer_rules}; + +use sd_prisma::prisma::{location, PrismaClient}; use sd_utils::db::maybe_missing; use std::{ collections::HashSet, + future::Future, path::{Path, PathBuf}, + 
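
The macOS single-rename handling above pairs the two halves of a rename by inode: an event whose path still exists on disk is held as the "new" side, one whose path is gone is held as the "old" side, and whenever both sides share an inode they are joined into one rename. A stripped-down sketch of that bookkeeping with a plain `u64` inode and no database lookups (all names here are illustrative):

```rust
use std::{collections::HashMap, path::PathBuf, time::Instant};

type INode = u64;

#[derive(Default)]
struct RenameMatcher {
    old_paths: HashMap<INode, (Instant, PathBuf)>,
    new_paths: HashMap<INode, (Instant, PathBuf)>,
}

impl RenameMatcher {
    /// The path still exists: pair it with a previously seen "old" path for the
    /// same inode, or remember it as the "new" side for later.
    fn path_exists(&mut self, inode: INode, path: PathBuf) -> Option<(PathBuf, PathBuf)> {
        if let Some((_, old_path)) = self.old_paths.remove(&inode) {
            Some((old_path, path)) // rename old_path -> path
        } else {
            self.new_paths.insert(inode, (Instant::now(), path));
            None
        }
    }

    /// The path is gone from disk: pair it with a previously seen "new" path for
    /// the same inode, or remember it as the "old" side for later.
    fn path_gone(&mut self, inode: INode, path: PathBuf) -> Option<(PathBuf, PathBuf)> {
        if let Some((_, new_path)) = self.new_paths.remove(&inode) {
            Some((path, new_path)) // rename path -> new_path
        } else {
            self.old_paths.insert(inode, (Instant::now(), path));
            None
        }
    }
}

fn main() {
    let mut matcher = RenameMatcher::default();

    assert!(matcher.path_gone(42, PathBuf::from("old.txt")).is_none());

    let pair = matcher.path_exists(42, PathBuf::from("new.txt"));
    assert_eq!(
        pair,
        Some((PathBuf::from("old.txt"), PathBuf::from("new.txt")))
    );
}
```

Unpaired entries are what the 100 ms eviction passes later turn into plain creates or removes.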
pin::pin, sync::Arc, time::Duration, }; -use async_trait::async_trait; +use async_channel as chan; +use futures::StreamExt; +use futures_concurrency::stream::Merge; use notify::{Config, Event, RecommendedWatcher, RecursiveMode, Watcher}; use tokio::{ - runtime::Handle, - select, - sync::{mpsc, oneshot}, - task::{block_in_place, JoinHandle}, + spawn, + task::JoinHandle, time::{interval_at, Instant, MissedTickBehavior}, }; -use tracing::{debug, error, warn}; +use tokio_stream::wrappers::IntervalStream; +use tracing::{debug, error, info, instrument, trace, warn, Instrument}; use uuid::Uuid; use super::LocationManagerError; @@ -32,22 +38,22 @@ mod windows; mod utils; -use utils::check_event; +use utils::reject_event; #[cfg(target_os = "linux")] -type Handler<'lib> = linux::LinuxEventHandler<'lib>; +type Handler = linux::EventHandler; #[cfg(target_os = "macos")] -type Handler<'lib> = macos::MacOsEventHandler<'lib>; +type Handler = macos::EventHandler; #[cfg(target_os = "windows")] -type Handler<'lib> = windows::WindowsEventHandler<'lib>; +type Handler = windows::EventHandler; #[cfg(target_os = "android")] -type Handler<'lib> = android::AndroidEventHandler<'lib>; +type Handler = android::EventHandler; #[cfg(target_os = "ios")] -type Handler<'lib> = ios::IosEventHandler<'lib>; +type Handler = ios::EventHandler; pub(super) type IgnorePath = (PathBuf, bool); @@ -55,82 +61,115 @@ type INode = u64; type InstantAndPath = (Instant, PathBuf); const ONE_SECOND: Duration = Duration::from_secs(1); +const THIRTY_SECONDS: Duration = Duration::from_secs(30); const HUNDRED_MILLIS: Duration = Duration::from_millis(100); -#[async_trait] -trait EventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self +trait EventHandler: 'static { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self where Self: Sized; /// Handle a file system event. - async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError>; + fn handle_event( + &mut self, + event: Event, + ) -> impl Future> + Send; /// As Event Handlers have some inner state, from time to time we need to call this tick method /// so the event handler can update its state. - async fn tick(&mut self); + fn tick(&mut self) -> impl Future + Send; } #[derive(Debug)] pub(super) struct LocationWatcher { - id: i32, - path: String, + location_id: location::id::Type, + location_path: PathBuf, watcher: RecommendedWatcher, - ignore_path_tx: mpsc::UnboundedSender, + ignore_path_tx: chan::Sender, handle: Option>, - stop_tx: Option>, + stop_tx: chan::Sender<()>, } impl LocationWatcher { - pub(super) async fn new( - location: location::Data, + #[instrument( + name = "location_watcher", + skip(pub_id, maybe_location_path, library, node), + fields( + library_id = %library.id, + location_path = ?maybe_location_path, + ), + )] + pub(super) fn new( + location_ids_and_path::Data { + id: location_id, + pub_id, + path: maybe_location_path, + .. 
+ }: location_ids_and_path::Data, library: Arc, node: Arc, ) -> Result { - let (events_tx, events_rx) = mpsc::unbounded_channel(); - let (ignore_path_tx, ignore_path_rx) = mpsc::unbounded_channel(); - let (stop_tx, stop_rx) = oneshot::channel(); + let location_pub_id = Uuid::from_slice(&pub_id)?; + let location_path = maybe_missing(maybe_location_path, "location.path")?.into(); + + let (events_tx, events_rx) = chan::unbounded(); + let (ignore_path_tx, ignore_path_rx) = chan::bounded(8); + let (stop_tx, stop_rx) = chan::bounded(1); let watcher = RecommendedWatcher::new( move |result| { if !events_tx.is_closed() { - if events_tx.send(result).is_err() { - error!( - "Unable to send watcher event to location manager for location: ", - location.id - ); + // SAFETY: we are not blocking the thread as this is an unbounded channel + if events_tx.send_blocking(result).is_err() { + error!(%location_id, "Unable to send watcher event to location manager;"); } } else { - error!( - "Tried to send location file system events to a closed channel: , library: Arc, - mut events_rx: mpsc::UnboundedReceiver>, - mut ignore_path_rx: mpsc::UnboundedReceiver, - mut stop_rx: oneshot::Receiver<()>, + events_rx: chan::Receiver>, + ignore_path_rx: chan::Receiver, + stop_rx: chan::Receiver<()>, ) { - let mut event_handler = Handler::new(location_id, &library, &node); + enum StreamMessage { + NewEvent(notify::Result), + NewIgnorePath(IgnorePath), + Tick, + Stop, + } + + let mut event_handler = Handler::new(location_id, Arc::clone(&library), Arc::clone(&node)); + + let mut last_event_at = Instant::now(); + + let mut cached_indexer_ruler = None; + let mut cached_location_path = None; let mut paths_to_ignore = HashSet::new(); - let mut handler_interval = interval_at(Instant::now() + HUNDRED_MILLIS, HUNDRED_MILLIS); + let mut handler_tick_interval = + interval_at(Instant::now() + HUNDRED_MILLIS, HUNDRED_MILLIS); // In case of doubt check: https://docs.rs/tokio/latest/tokio/time/enum.MissedTickBehavior.html - handler_interval.set_missed_tick_behavior(MissedTickBehavior::Delay); - loop { - select! 
{ - Some(event) = events_rx.recv() => { - match event { - Ok(event) => { - debug!("[Debug - handle_watch_events] Received event: {:#?}", event); - if let Err(e) = Self::handle_single_event( - location_id, - location_pub_id, - event, - &mut event_handler, - &node, - &library, - &paths_to_ignore, - ).await { - error!("Failed to handle location file system event: \ - ", - ); - } - } - Err(e) => { - error!("watch error: {:#?}", e); - } + handler_tick_interval.set_missed_tick_behavior(MissedTickBehavior::Delay); + + let mut msg_stream = pin!(( + events_rx.map(StreamMessage::NewEvent), + ignore_path_rx.map(StreamMessage::NewIgnorePath), + IntervalStream::new(handler_tick_interval).map(|_| StreamMessage::Tick), + stop_rx.map(|()| StreamMessage::Stop), + ) + .merge()); + + while let Some(msg) = msg_stream.next().await { + match msg { + StreamMessage::NewEvent(Ok(event)) => { + if let Err(e) = get_cached_indexer_ruler_and_location_path( + location_id, + &mut cached_indexer_ruler, + &mut cached_location_path, + &last_event_at, + &library.db, + ) + .await + { + error!(?e, "Failed to get indexer ruler;"); + } + + last_event_at = Instant::now(); + + if let Err(e) = Self::handle_single_event( + location_pub_id, + cached_location_path.as_deref(), + event, + &mut event_handler, + &node, + &paths_to_ignore, + cached_indexer_ruler.as_ref(), + ) + .await + { + error!(?e, "Failed to handle location file system event;"); } } - Some((path, ignore)) = ignore_path_rx.recv() => { - if ignore { + StreamMessage::NewEvent(Err(e)) => error!(?e, "Watcher error;"), + + StreamMessage::NewIgnorePath((path, should_ignore)) => { + if should_ignore { paths_to_ignore.insert(path); } else { paths_to_ignore.remove(&path); } } - _ = handler_interval.tick() => { - event_handler.tick().await; - } + StreamMessage::Tick => event_handler.tick().await, - _ = &mut stop_rx => { - debug!("Stop Location Manager event handler for location: ", location_id); - break + StreamMessage::Stop => { + debug!("Stopping Location Manager event handler for location"); + break; } } } } - async fn handle_single_event<'lib>( - location_id: location::id::Type, + #[instrument(skip_all, fields(?event, ?ignore_paths, ?location_path))] + async fn handle_single_event( location_pub_id: Uuid, + location_path: Option<&Path>, event: Event, - event_handler: &mut impl EventHandler<'lib>, - node: &'lib Node, - _library: &'lib Library, + event_handler: &mut impl EventHandler, + node: &Node, ignore_paths: &HashSet, + indexer_ruler: Option<&IndexerRuler>, ) -> Result<(), LocationManagerError> { - debug!("Event: {:#?}", event); - if !check_event(&event, ignore_paths) { + if reject_event(&event, ignore_paths, location_path, indexer_ruler).await { return Ok(()); } - // let Some(location) = find_location(library, location_id) - // .include(location_with_indexer_rules::include()) - // .exec() - // .await? 
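
The rewritten `handle_watch_events` loop above drops `tokio::select!` in favour of one merged stream: the notify events channel, the ignore-path channel, a 100 ms tick, and the stop signal are each mapped into a `StreamMessage` variant and combined with `futures-concurrency`. A compact, self-contained version of that wiring, using the same crates the watcher pulls in but with simplified message payloads:

```rust
use std::{pin::pin, time::Duration};

use async_channel as chan;
use futures::StreamExt;
use futures_concurrency::stream::Merge;
use tokio::time::{interval, MissedTickBehavior};
use tokio_stream::wrappers::IntervalStream;

enum StreamMessage {
    NewEvent(String),
    Tick,
    Stop,
}

#[tokio::main]
async fn main() {
    let (events_tx, events_rx) = chan::unbounded::<String>();
    let (stop_tx, stop_rx) = chan::bounded::<()>(1);

    let mut tick = interval(Duration::from_millis(100));
    tick.set_missed_tick_behavior(MissedTickBehavior::Delay);

    events_tx.send("created /tmp/a.txt".to_string()).await.unwrap();
    stop_tx.send(()).await.unwrap();

    // Merge every source into a single stream so one `while let` drives the loop.
    // Delivery order across merged sources is not guaranteed.
    let mut msg_stream = pin!((
        events_rx.map(StreamMessage::NewEvent),
        IntervalStream::new(tick).map(|_| StreamMessage::Tick),
        stop_rx.map(|()| StreamMessage::Stop),
    )
        .merge());

    while let Some(msg) = msg_stream.next().await {
        match msg {
            StreamMessage::NewEvent(event) => println!("event: {event}"),
            StreamMessage::Tick => println!("tick"),
            StreamMessage::Stop => break,
        }
    }
}
```

Compared with `select!`, every branch becomes an ordinary `match` arm on a named enum, which is easier to extend and keeps cancellation semantics out of the loop body.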
- // else { - // warn!("Tried to handle event for unknown location: "); - // return Ok(()); - // }; - if !node.locations.is_online(&location_pub_id).await { - warn!("Tried to handle event for offline location: "); + warn!("Tried to handle event for offline location"); return Ok(()); } - // debug!("Handling event: {:#?}", event); - event_handler.handle_event(event).await } - pub(super) fn ignore_path( - &self, - path: PathBuf, - ignore: bool, - ) -> Result<(), LocationManagerError> { - self.ignore_path_tx.send((path, ignore)).map_err(Into::into) + #[instrument( + skip(self, path), + fields( + location_id = %self.location_id, + location_path = %self.location_path.display(), + path = %path.display(), + ), + )] + pub(super) async fn ignore_path(&self, path: PathBuf, ignore: bool) { + self.ignore_path_tx + .send((path, ignore)) + .await + .expect("Location watcher ignore path channel closed"); } pub(super) fn check_path(&self, path: impl AsRef) -> bool { - Path::new(&self.path) == path.as_ref() + self.location_path == path.as_ref() } + #[instrument( + skip(self), + fields( + location_id = %self.location_id, + location_path = %self.location_path.display(), + ), + )] pub(super) fn watch(&mut self) { - let path = &self.path; - debug!("Start watching location: (path: {path})"); + trace!("Start watching location"); if let Err(e) = self .watcher - .watch(Path::new(path), RecursiveMode::Recursive) + .watch(self.location_path.as_path(), RecursiveMode::Recursive) { - error!("Unable to watch location: (path: {path}, error: {e:#?})"); + error!(?e, "Unable to watch location;"); } else { - debug!("Now watching location: (path: {path})"); + trace!("Now watching location"); } } + #[instrument( + skip(self), + fields( + location_id = %self.location_id, + location_path = %self.location_path.display(), + ), + )] pub(super) fn unwatch(&mut self) { - let path = &self.path; - if let Err(e) = self.watcher.unwatch(Path::new(path)) { + if let Err(e) = self.watcher.unwatch(self.location_path.as_path()) { /**************************************** TODO: **************************************** * According to an unit test, this error may occur when a subdirectory is removed * * and we try to unwatch the parent directory then we have to check the implications * * of unwatch error for this case. 
* **************************************************************************************/ - error!("Unable to unwatch location: (path: {path}, error: {e:#?})",); + error!(?e, "Unable to unwatch location;"); } else { - debug!("Stop watching location: (path: {path})"); + trace!("Stop watching location"); } } } impl Drop for LocationWatcher { fn drop(&mut self) { - if let Some(stop_tx) = self.stop_tx.take() { - if stop_tx.send(()).is_err() { - error!( - "Failed to send stop signal to location watcher: ", - self.id - ); - } + // FIXME: change this Drop to async drop in the future + if let Some(handle) = self.handle.take() { + let stop_tx = self.stop_tx.clone(); + spawn(async move { + stop_tx + .send(()) + .await + .expect("Location watcher stop channel closed"); - // FIXME: change this Drop to async drop in the future - if let Some(handle) = self.handle.take() { - if let Err(e) = block_in_place(move || Handle::current().block_on(handle)) { - error!("Failed to join watcher task: {e:#?}") + if let Err(e) = handle.await { + error!(?e, "Failed to join watcher task;"); } - } + }); } } } +async fn get_cached_indexer_ruler_and_location_path( + location_id: location::id::Type, + cached_indexer_ruler: &mut Option, + location_path: &mut Option, + last_event_at: &Instant, + db: &PrismaClient, +) -> Result<(), LocationManagerError> { + if cached_indexer_ruler.is_none() || last_event_at.elapsed() > THIRTY_SECONDS { + if let Some(location_with_indexer_rules::Data { + path, + indexer_rules, + .. + }) = db + .location() + .find_unique(location::id::equals(location_id)) + .include(location_with_indexer_rules::include()) + .exec() + .await? + { + *cached_indexer_ruler = Some( + indexer_rules + .iter() + .map(|rule| IndexerRule::try_from(&rule.indexer_rule)) + .collect::, _>>() + .map(IndexerRuler::new)?, + ); + + *location_path = path.map(Into::into); + } + } + + Ok(()) +} + /*************************************************************************************************** * Some tests to validate our assumptions of events through different file systems * **************************************************************************************************** @@ -412,26 +520,23 @@ mod tests { expected_event: EventKind, ) { let path = path.as_ref(); - debug!( - "Expecting event: {expected_event:#?} at path: {}", - path.display() - ); + debug!(?expected_event, path = %path.display()); let mut tries = 0; loop { match events_rx.try_recv() { Ok(maybe_event) => { let event = maybe_event.expect("Failed to receive event"); - debug!("Received event: {event:#?}"); + debug!(?event, "Received event;"); // Using `ends_with` and removing root path here due to a weird edge case on CI tests at MacOS if event.paths[0].ends_with(path.iter().skip(1).collect::()) && event.kind == expected_event { - debug!("Received expected event: {expected_event:#?}"); + debug!("Received expected event"); break; } } Err(e) => { - debug!("No event yet: {e:#?}"); + debug!(?e, "No event yet;"); tries += 1; sleep(Duration::from_millis(100)).await; } @@ -451,7 +556,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); let file_path = root_dir.path().join("test.txt"); fs::write(&file_path, "test").await.unwrap(); @@ -475,9 +580,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), 
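
`get_cached_indexer_ruler_and_location_path` above only hits the database when the cache is empty or the last watcher activity is more than thirty seconds old. The same time-gated refresh can be sketched generically; the `load` closure here is a stand-in for the Prisma query, and this simplified version tracks its own refresh instant rather than the watcher's last-event time:

```rust
use std::time::{Duration, Instant};

const THIRTY_SECONDS: Duration = Duration::from_secs(30);

/// Refresh `cached` via `load` only when it is missing or stale.
fn refresh_if_stale<T>(
    cached: &mut Option<T>,
    last_refresh: &mut Instant,
    load: impl FnOnce() -> T,
) {
    if cached.is_none() || last_refresh.elapsed() > THIRTY_SECONDS {
        *cached = Some(load());
        *last_refresh = Instant::now();
    }
}

fn main() {
    let mut cached: Option<Vec<String>> = None;
    let mut last_refresh = Instant::now();

    refresh_if_stale(&mut cached, &mut last_refresh, || {
        vec!["no-hidden".to_string()] // imagine this being the DB round-trip
    });
    assert!(cached.is_some());

    // Within the 30s window nothing is reloaded.
    refresh_if_stale(&mut cached, &mut last_refresh, || unreachable!());
}
```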
"Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } @@ -489,7 +594,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); let dir_path = root_dir.path().join("inner"); fs::create_dir(&dir_path) @@ -505,9 +610,9 @@ mod tests { #[cfg(target_os = "linux")] expect_event(events_rx, &dir_path, EventKind::Create(CreateKind::Folder)).await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } @@ -522,7 +627,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); let mut file = fs::OpenOptions::new() .append(true) @@ -556,9 +661,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } @@ -573,7 +678,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); let new_file_name = root_dir.path().join("test2.txt"); @@ -605,9 +710,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } @@ -624,7 +729,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); let new_dir_name = root_dir.path().join("inner2"); @@ -656,9 +761,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } @@ -673,7 +778,7 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); fs::remove_file(&file_path) .await @@ -696,9 +801,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to 
unwatch root directory;"); } } @@ -723,11 +828,11 @@ mod tests { watcher .watch(root_dir.path(), notify::RecursiveMode::Recursive) .expect("Failed to watch root directory"); - debug!("Now watching {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Now watching;"); debug!("First unwatching the inner directory before removing it"); if let Err(e) = watcher.unwatch(&dir_path) { - error!("Failed to unwatch inner directory: {e:#?}"); + error!(?e, "Failed to unwatch inner directory;"); } fs::remove_dir(&dir_path) @@ -751,9 +856,9 @@ mod tests { ) .await; - debug!("Unwatching root directory: {}", root_dir.path().display()); + debug!(root = %root_dir.path().display(), "Unwatching root directory;"); if let Err(e) = watcher.unwatch(root_dir.path()) { - error!("Failed to unwatch root directory: {e:#?}"); + error!(?e, "Failed to unwatch root directory;"); } } } diff --git a/core/src/location/manager/watcher/utils.rs b/core/src/location/manager/watcher/utils.rs index 98444008c..a597833ba 100644 --- a/core/src/location/manager/watcher/utils.rs +++ b/core/src/location/manager/watcher/utils.rs @@ -6,19 +6,7 @@ use crate::{ indexer::reverse_update_directories_sizes, location_with_indexer_rules, manager::LocationManagerError, scan_location_sub_path, update_location_size, }, - object::{ - media::{ - exif_data_image_to_query_params, - exif_metadata_extractor::{can_extract_exif_data_for_image, extract_exif_data}, - ffmpeg_metadata_extractor::{ - can_extract_ffmpeg_data_for_audio, can_extract_ffmpeg_data_for_video, - extract_ffmpeg_data, save_ffmpeg_data, - }, - old_thumbnail::get_indexed_thumbnail_path, - }, - old_file_identifier::FileMetadata, - validation::hash::file_checksum, - }, + object::validation::hash::file_checksum, Node, }; @@ -28,21 +16,32 @@ use sd_core_file_path_helper::{ loose_find_existing_file_path_params, path_is_hidden, FilePathError, FilePathMetadata, IsolatedFilePathData, MetadataExt, }; -use sd_core_prisma_helpers::file_path_with_object; +use sd_core_heavy_lifting::{ + file_identifier::FileMetadata, + media_processor::{ + exif_media_data, ffmpeg_media_data, generate_single_thumbnail, get_thumbnails_directory, + ThumbnailKind, + }, +}; +use sd_core_indexer_rules::{ + seed::{GitIgnoreRules, GITIGNORE}, + IndexerRuler, RulerDecision, +}; +use sd_core_prisma_helpers::{file_path_with_object, object_ids, CasId, ObjectPubId}; use sd_file_ext::{ extensions::{AudioExtension, ImageExtension, VideoExtension}, kind::ObjectKind, }; use sd_prisma::{ - prisma::{exif_data, file_path, location, object}, + prisma::{file_path, location, object}, prisma_sync, }; use sd_sync::OperationFactory; use sd_utils::{ db::{inode_from_db, inode_to_db, maybe_missing}, error::FileIOError, - msgpack, uuid_to_bytes, + msgpack, }; #[cfg(target_family = "unix")] @@ -61,31 +60,107 @@ use std::{ }; use chrono::{DateTime, FixedOffset, Local, Utc}; +use futures_concurrency::future::Join; use notify::Event; use tokio::{ fs, io::{self, ErrorKind}, spawn, - time::Instant, + time::{sleep, Instant}, }; -use tracing::{debug, error, trace, warn}; -use uuid::Uuid; +use tracing::{error, instrument, trace, warn}; -use super::{INode, HUNDRED_MILLIS}; +use super::{INode, HUNDRED_MILLIS, ONE_SECOND}; -pub(super) fn check_event(event: &Event, ignore_paths: &HashSet) -> bool { +pub(super) async fn reject_event( + event: &Event, + ignore_paths: &HashSet, + location_path: Option<&Path>, + indexer_ruler: Option<&IndexerRuler>, +) -> bool { // if path includes .DS_Store, .spacedrive file creation or is in the 
`ignore_paths` set, we ignore - !event.paths.iter().any(|p| { + if event.paths.iter().any(|p| { p.file_name() .and_then(OsStr::to_str) .map_or(false, |name| name == ".DS_Store" || name == ".spacedrive") || ignore_paths.contains(p) - }) + }) { + trace!("Rejected by ignored paths"); + return true; + } + + if let Some(indexer_ruler) = indexer_ruler { + let ruler_decisions = event + .paths + .iter() + .map(|path| async move { (path, fs::metadata(path).await) }) + .collect::>() + .join() + .await + .into_iter() + .filter_map(|(path, res)| { + res.map(|metadata| (path, metadata)) + .map_err(|e| { + if e.kind() != ErrorKind::NotFound { + error!(?e, path = %path.display(), "Failed to get metadata for path;"); + } + }) + .ok() + }) + .map(|(path, metadata)| { + let mut independent_ruler = indexer_ruler.clone(); + + async move { + let path_to_check_gitignore = if metadata.is_dir() { + Some(path.as_path()) + } else { + path.parent() + }; + + if let (Some(path_to_check_gitignore), Some(location_path)) = + (path_to_check_gitignore, location_path.as_ref()) + { + if independent_ruler.has_system(&GITIGNORE) { + if let Some(rules) = GitIgnoreRules::get_rules_if_in_git_repo( + location_path, + path_to_check_gitignore, + ) + .await + { + trace!("Found gitignore rules to follow"); + independent_ruler.extend(rules.map(Into::into)); + } + } + } + + independent_ruler.evaluate_path(path, &metadata).await + } + }) + .collect::>() + .join() + .await; + + if !ruler_decisions.is_empty() + && ruler_decisions.into_iter().all(|res| { + matches!( + res.map_err(|e| trace!(?e, "Failed to evaluate path;")) + // In case of error, we accept the path as a safe default + .unwrap_or(RulerDecision::Accept), + RulerDecision::Reject + ) + }) { + trace!("Rejected by indexer ruler"); + return true; + } + } + + false } +#[instrument(skip_all, fields(path = %path.as_ref().display()), err)] pub(super) async fn create_dir( location_id: location::id::Type, - path: impl AsRef, + path: impl AsRef + Send, metadata: &Metadata, node: &Arc, library: &Arc, @@ -94,17 +169,13 @@ pub(super) async fn create_dir( .include(location_with_indexer_rules::include()) .exec() .await? - .ok_or(LocationManagerError::MissingLocation(location_id))?; + .ok_or(LocationManagerError::LocationNotFound(location_id))?; let path = path.as_ref(); let location_path = maybe_missing(&location.path, "location.path")?; - trace!( - "Location: creating directory: {}", - location_path, - path.display() - ); + trace!(new_directory = %path.display(), "Creating directory;"); let iso_file_path = IsolatedFilePathData::new(location.id, location_path, path, true)?; @@ -112,10 +183,8 @@ pub(super) async fn create_dir( if !parent_iso_file_path.is_root() && !check_file_path_exists::(&parent_iso_file_path, &library.db).await? 
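
`reject_event` above stats every path in the event concurrently (the `collect::<Vec<_>>().join().await` pattern from `futures-concurrency`) before asking the indexer ruler about each survivor, and only rejects the event when every decision says reject. A reduced sketch of that fan-out/join shape, with a trivial dot-file rule standing in for the real `IndexerRuler`:

```rust
use std::path::{Path, PathBuf};

use futures_concurrency::future::Join;
use tokio::fs;

/// Pretend rule: reject dot-files. Stands in for the indexer ruler evaluation.
fn rejected_by_rules(path: &Path) -> bool {
    path.file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.starts_with('.'))
}

async fn should_reject(paths: &[PathBuf]) -> bool {
    // Fan out one metadata call per path and await them all at once.
    let decisions = paths
        .iter()
        .map(|path| async move { (path, fs::metadata(path).await) })
        .collect::<Vec<_>>()
        .join()
        .await
        .into_iter()
        .filter_map(|(path, res)| res.ok().map(|_| rejected_by_rules(path)))
        .collect::<Vec<_>>();

    // Reject only if we had at least one decision and all of them said "reject".
    !decisions.is_empty() && decisions.into_iter().all(|rejected| rejected)
}

#[tokio::main]
async fn main() {
    let paths = vec![PathBuf::from("Cargo.toml")];
    println!("reject = {}", should_reject(&paths).await);
}
```

Erring toward "accept" when metadata or rule evaluation fails keeps a flaky rule from silently dropping real file-system events.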
{ - warn!( - "Watcher found a directory without parent: {}", - &iso_file_path - ); + warn!(%iso_file_path, "Watcher found a directory without parent;"); + return Ok(()); }; @@ -123,8 +192,6 @@ pub(super) async fn create_dir( .materialized_path_for_children() .expect("We're in the create dir function lol"); - debug!("Creating path: {}", iso_file_path); - create_file_path( library, iso_file_path.to_parts(), @@ -133,8 +200,24 @@ pub(super) async fn create_dir( ) .await?; - // scan the new directory - scan_location_sub_path(node, library, location, &children_materialized_path).await?; + spawn({ + let node = Arc::clone(node); + let library = Arc::clone(library); + + async move { + // Wait a bit for any files being moved into the new directory to be indexed by the watcher + sleep(ONE_SECOND).await; + + trace!(%iso_file_path, "Scanning new directory;"); + + // scan the new directory + if let Err(e) = + scan_location_sub_path(&node, &library, location, &children_materialized_path).await + { + error!(?e, "Failed to scan new directory;"); + } + } + }); invalidate_query!(library, "search.paths"); invalidate_query!(library, "search.objects"); @@ -142,9 +225,10 @@ pub(super) async fn create_dir( Ok(()) } +#[instrument(skip_all, fields(path = %path.as_ref().display()), err)] pub(super) async fn create_file( location_id: location::id::Type, - path: impl AsRef, + path: impl AsRef + Send, metadata: &Metadata, node: &Arc, library: &Arc, @@ -162,8 +246,8 @@ pub(super) async fn create_file( async fn inner_create_file( location_id: location::id::Type, - location_path: impl AsRef, - path: impl AsRef, + location_path: impl AsRef + Send, + path: impl AsRef + Send, metadata: &Metadata, node: &Arc, library @ Library { @@ -176,11 +260,7 @@ async fn inner_create_file( let path = path.as_ref(); let location_path = location_path.as_ref(); - trace!( - "Location: creating file: {}", - location_path.display(), - path.display() - ); + trace!(new_file = %path.display(), "Creating file;"); let iso_file_path = IsolatedFilePathData::new(location_id, location_path, path, false)?; let iso_file_path_parts = iso_file_path.to_parts(); @@ -200,7 +280,8 @@ async fn inner_create_file( .exec() .await? { - trace!("File already exists with that inode: {}", iso_file_path); + trace!(%iso_file_path, "File already exists with that inode;"); + return inner_update_file(location_path, &file_path, path, node, library, None).await; // If we can't find an existing file with the same inode, we check if there is a file with the same path @@ -216,10 +297,8 @@ async fn inner_create_file( .exec() .await? { - trace!( - "File already exists with that iso_file_path: {}", - iso_file_path - ); + trace!(%iso_file_path, "File already exists with that iso_file_path;"); + return inner_update_file( location_path, &file_path, @@ -235,7 +314,8 @@ async fn inner_create_file( if !parent_iso_file_path.is_root() && !check_file_path_exists::(&parent_iso_file_path, db).await? 
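
`create_dir` above no longer awaits the sub-path scan inline; it spawns a detached task that sleeps for a second first, so files being moved into the brand-new directory have a chance to land before the scan runs, and any scan error is logged rather than returned. The shape of that fire-and-forget-after-a-delay step, with a print standing in for `scan_location_sub_path`:

```rust
use std::time::Duration;

use tokio::{spawn, time::sleep};

const ONE_SECOND: Duration = Duration::from_secs(1);

#[tokio::main]
async fn main() {
    let new_dir = String::from("photos/2024");

    // Detached task: the event handler returns immediately, the scan follows later.
    let handle = spawn(async move {
        // Give the file system (and the watcher) a moment to settle.
        sleep(ONE_SECOND).await;

        // In the watcher this would be the sub-path scan, with its error logged
        // instead of propagated, since no caller is waiting on it anymore.
        println!("scanning new directory {new_dir}");
    });

    // Only awaited here so the example does not exit before the task runs.
    handle.await.expect("scan task panicked");
}
```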
{ - warn!("Watcher found a file without parent: {}", &iso_file_path); + warn!(%iso_file_path, "Watcher found a file without parent;"); + return Ok(()); }; @@ -246,17 +326,13 @@ async fn inner_create_file( fs_metadata, } = FileMetadata::new(&location_path, &iso_file_path).await?; - debug!("Creating path: {}", iso_file_path); - let created_file = create_file_path(library, iso_file_path_parts, cas_id.clone(), metadata).await?; - object::select!(object_ids { id pub_id }); - let existing_object = db .object() .find_first(vec![object::file_paths::some(vec![ - file_path::cas_id::equals(cas_id.clone()), + file_path::cas_id::equals(cas_id.clone().map(Into::into)), file_path::pub_id::not(created_file.pub_id.clone()), ])]) .select(object_ids::select()) @@ -269,16 +345,17 @@ async fn inner_create_file( } = if let Some(object) = existing_object { object } else { - let pub_id = uuid_to_bytes(Uuid::new_v4()); + let pub_id: ObjectPubId = ObjectPubId::new(); let date_created: DateTime = DateTime::::from(fs_metadata.created_or_now()).into(); let int_kind = kind as i32; + sync.write_ops( db, ( sync.shared_create( prisma_sync::object::SyncId { - pub_id: pub_id.clone(), + pub_id: pub_id.to_db(), }, [ (object::date_created::NAME, msgpack!(date_created)), @@ -287,7 +364,7 @@ async fn inner_create_file( ), db.object() .create( - pub_id.to_vec(), + pub_id.into(), vec![ object::date_created::set(Some(date_created)), object::kind::set(Some(int_kind)), @@ -330,16 +407,21 @@ async fn inner_create_file( spawn({ let extension = extension.clone(); let path = path.to_path_buf(); - let node = node.clone(); + let thumbnails_directory = + get_thumbnails_directory(node.config.data_directory()); let library_id = *library_id; async move { - if let Err(e) = node - .thumbnailer - .generate_single_indexed_thumbnail(&extension, cas_id, path, library_id) - .await + if let Err(e) = generate_single_thumbnail( + &thumbnails_directory, + extension, + cas_id, + path, + ThumbnailKind::Indexed(library_id), + ) + .await { - error!("Failed to generate thumbnail in the watcher: {e:#?}"); + error!(?e, "Failed to generate thumbnail in the watcher;"); } } }); @@ -349,34 +431,15 @@ async fn inner_create_file( match kind { ObjectKind::Image => { if let Ok(image_extension) = ImageExtension::from_str(&extension) { - if can_extract_exif_data_for_image(&image_extension) { - if let Ok(Some(exif_data)) = extract_exif_data(path) + if exif_media_data::can_extract(image_extension) { + if let Ok(Some(exif_data)) = exif_media_data::extract(path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract image media data;")) { - let (sync_params, db_params) = - exif_data_image_to_query_params(exif_data); - - sync.write_ops( + exif_media_data::save( + [(exif_data, object_id, object_pub_id.into())], db, - ( - sync.shared_create( - prisma_sync::exif_data::SyncId { - object: prisma_sync::object::SyncId { - pub_id: object_pub_id.clone(), - }, - }, - sync_params, - ), - db.exif_data().upsert( - exif_data::object_id::equals(object_id), - exif_data::create( - object::id::equals(object_id), - db_params.clone(), - ), - db_params, - ), - ), + sync, ) .await?; } @@ -386,12 +449,12 @@ async fn inner_create_file( ObjectKind::Audio => { if let Ok(audio_extension) = AudioExtension::from_str(&extension) { - if can_extract_ffmpeg_data_for_audio(&audio_extension) { - if let Ok(ffmpeg_data) = extract_ffmpeg_data(path) + if ffmpeg_media_data::can_extract_for_audio(audio_extension) { + if let Ok(ffmpeg_data) = 
ffmpeg_media_data::extract(path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract audio media data;")) { - save_ffmpeg_data([(ffmpeg_data, object_id)], db).await?; + ffmpeg_media_data::save([(ffmpeg_data, object_id)], db).await?; } } } @@ -399,12 +462,12 @@ async fn inner_create_file( ObjectKind::Video => { if let Ok(video_extension) = VideoExtension::from_str(&extension) { - if can_extract_ffmpeg_data_for_video(&video_extension) { - if let Ok(ffmpeg_data) = extract_ffmpeg_data(path) + if ffmpeg_media_data::can_extract_for_video(video_extension) { + if let Ok(ffmpeg_data) = ffmpeg_media_data::extract(path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract video media data;")) { - save_ffmpeg_data([(ffmpeg_data, object_id)], db).await?; + ffmpeg_media_data::save([(ffmpeg_data, object_id)], db).await?; } } } @@ -422,13 +485,14 @@ async fn inner_create_file( Ok(()) } +#[instrument(skip_all, fields(path = %path.as_ref().display()), err)] pub(super) async fn update_file( location_id: location::id::Type, - full_path: impl AsRef, + path: impl AsRef + Send, node: &Arc, library: &Arc, ) -> Result<(), LocationManagerError> { - let full_path = full_path.as_ref(); + let full_path = path.as_ref(); let metadata = match fs::metadata(full_path).await { Ok(metadata) => metadata, @@ -464,16 +528,16 @@ pub(super) async fn update_file( ) .await } - .map(|_| { + .map(|()| { invalidate_query!(library, "search.paths"); invalidate_query!(library, "search.objects"); }) } async fn inner_update_file( - location_path: impl AsRef, + location_path: impl AsRef + Send, file_path: &file_path_with_object::Data, - full_path: impl AsRef, + full_path: impl AsRef + Send, node: &Arc, library @ Library { db, sync, .. 
}: &Library, maybe_new_inode: Option, @@ -485,9 +549,9 @@ async fn inner_update_file( inode_from_db(&maybe_missing(file_path.inode.as_ref(), "file_path.inode")?[0..8]); trace!( - "Location: updating file: {}", - location_path.display(), - full_path.display() + location_path = %location_path.display(), + path = %full_path.display(), + "Updating file;", ); let iso_file_path = IsolatedFilePathData::try_from(file_path)?; @@ -514,7 +578,7 @@ async fn inner_update_file( }; let is_hidden = path_is_hidden(full_path, &fs_metadata); - if file_path.cas_id != cas_id { + if file_path.cas_id.as_deref() != cas_id.as_ref().map(CasId::as_str) { let (sync_params, db_params): (Vec<_>, Vec<_>) = { use file_path::*; @@ -637,7 +701,7 @@ async fn inner_update_file( .await?; } } else { - let pub_id = uuid_to_bytes(Uuid::new_v4()); + let pub_id = ObjectPubId::new(); let date_created: DateTime = DateTime::::from(fs_metadata.created_or_now()).into(); @@ -646,7 +710,7 @@ async fn inner_update_file( ( sync.shared_create( prisma_sync::object::SyncId { - pub_id: pub_id.clone(), + pub_id: pub_id.to_db(), }, [ (object::date_created::NAME, msgpack!(date_created)), @@ -654,7 +718,7 @@ async fn inner_update_file( ], ), db.object().create( - pub_id.to_vec(), + pub_id.to_db(), vec![ object::date_created::set(Some(date_created)), object::kind::set(Some(int_kind)), @@ -672,49 +736,57 @@ async fn inner_update_file( }, file_path::object::NAME, msgpack!(prisma_sync::object::SyncId { - pub_id: pub_id.clone() + pub_id: pub_id.to_db() }), ), db.file_path().update( file_path::pub_id::equals(file_path.pub_id.clone()), - vec![file_path::object::connect(object::pub_id::equals(pub_id))], + vec![file_path::object::connect(object::pub_id::equals( + pub_id.into(), + ))], ), ) .await?; } - if let Some(old_cas_id) = &file_path.cas_id { + if let Some(old_cas_id) = file_path.cas_id.as_ref().map(CasId::from) { // if this file had a thumbnail previously, we update it to match the new content - if library.thumbnail_exists(node, old_cas_id).await? { + if library.thumbnail_exists(node, &old_cas_id).await? 
{ if let Some(ext) = file_path.extension.clone() { // Running in a detached task as thumbnail generation can take a while and we don't want to block the watcher if let Some(cas_id) = cas_id { let node = Arc::clone(node); let path = full_path.to_path_buf(); let library_id = library.id; - let old_cas_id = old_cas_id.clone(); + let old_cas_id = old_cas_id.to_owned(); + spawn(async move { + let thumbnails_directory = + get_thumbnails_directory(node.config.data_directory()); + let was_overwritten = old_cas_id == cas_id; - if let Err(e) = node - .thumbnailer - .generate_single_indexed_thumbnail( - &ext, cas_id, path, library_id, - ) - .await + if let Err(e) = generate_single_thumbnail( + &thumbnails_directory, + ext.clone(), + cas_id, + path, + ThumbnailKind::Indexed(library_id), + ) + .await { - error!("Failed to generate thumbnail in the watcher: {e:#?}"); + error!(?e, "Failed to generate thumbnail in the watcher;"); } // If only a few bytes changed, cas_id will probably remains intact // so we overwrote our previous thumbnail, so we can't remove it if !was_overwritten { // remove the old thumbnail as we're generating a new one - let thumb_path = - get_indexed_thumbnail_path(&node, &old_cas_id, library_id); + let thumb_path = ThumbnailKind::Indexed(library_id) + .compute_path(node.config.data_directory(), &old_cas_id); if let Err(e) = fs::remove_file(&thumb_path).await { error!( - "Failed to remove old thumbnail: {:#?}", - FileIOError::from((thumb_path, e)) + e = ?FileIOError::from((thumb_path, e)), + "Failed to remove old thumbnail;", ); } } @@ -728,34 +800,15 @@ async fn inner_update_file( match kind { ObjectKind::Image => { if let Ok(image_extension) = ImageExtension::from_str(extension) { - if can_extract_exif_data_for_image(&image_extension) { - if let Ok(Some(exif_data)) = extract_exif_data(full_path) + if exif_media_data::can_extract(image_extension) { + if let Ok(Some(exif_data)) = exif_media_data::extract(full_path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract media data;")) { - let (sync_params, db_params) = - exif_data_image_to_query_params(exif_data); - - sync.write_ops( + exif_media_data::save( + [(exif_data, object.id, object.pub_id.as_slice().into())], db, - ( - sync.shared_create( - prisma_sync::exif_data::SyncId { - object: prisma_sync::object::SyncId { - pub_id: object.pub_id.clone(), - }, - }, - sync_params, - ), - db.exif_data().upsert( - exif_data::object_id::equals(object.id), - exif_data::create( - object::id::equals(object.id), - db_params.clone(), - ), - db_params, - ), - ), + sync, ) .await?; } @@ -765,12 +818,12 @@ async fn inner_update_file( ObjectKind::Audio => { if let Ok(audio_extension) = AudioExtension::from_str(extension) { - if can_extract_ffmpeg_data_for_audio(&audio_extension) { - if let Ok(ffmpeg_data) = extract_ffmpeg_data(full_path) + if ffmpeg_media_data::can_extract_for_audio(audio_extension) { + if let Ok(ffmpeg_data) = ffmpeg_media_data::extract(full_path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract media data;")) { - save_ffmpeg_data([(ffmpeg_data, object.id)], db).await?; + ffmpeg_media_data::save([(ffmpeg_data, object.id)], db).await?; } } } @@ -778,12 +831,12 @@ async fn inner_update_file( ObjectKind::Video => { if let Ok(video_extension) = VideoExtension::from_str(extension) { - if can_extract_ffmpeg_data_for_video(&video_extension) { - if let Ok(ffmpeg_data) = extract_ffmpeg_data(full_path) + if 
ffmpeg_media_data::can_extract_for_video(video_extension) { + if let Ok(ffmpeg_data) = ffmpeg_media_data::extract(full_path) .await - .map_err(|e| error!("Failed to extract media data: {e:#?}")) + .map_err(|e| error!(?e, "Failed to extract media data;")) { - save_ffmpeg_data([(ffmpeg_data, object.id)], db).await?; + ffmpeg_media_data::save([(ffmpeg_data, object.id)], db).await?; } } } @@ -823,10 +876,15 @@ async fn inner_update_file( Ok(()) } +#[instrument( + skip_all, + fields(new_path = %new_path.as_ref().display(), old_path = %old_path.as_ref().display()), + err, +)] pub(super) async fn rename( location_id: location::id::Type, - new_path: impl AsRef, - old_path: impl AsRef, + new_path: impl AsRef + Send, + old_path: impl AsRef + Send, new_path_metadata: Metadata, library: &Library, ) -> Result<(), LocationManagerError> { @@ -841,7 +899,8 @@ pub(super) async fn rename( let new_path_materialized_str = extract_normalized_materialized_path_str(location_id, &location_path, new_path)?; - // Renaming a file could potentially be a move to another directory, so we check if our parent changed + // Renaming a file could potentially be a move to another directory, + // so we check if our parent changed if old_path_materialized_str != new_path_materialized_str && !check_file_path_exists::( &IsolatedFilePathData::new(location_id, &location_path, new_path, true)?.parent(), @@ -851,7 +910,7 @@ pub(super) async fn rename( { return Err(LocationManagerError::MoveError { path: new_path.into(), - reason: "parent directory does not exist".into(), + reason: "parent directory does not exist", }); } @@ -890,7 +949,7 @@ pub(super) async fn rename( .exec() .await?; - let len = paths.len(); + let total_paths_count = paths.len(); let (sync_params, db_params): (Vec<_>, Vec<_>) = paths .into_iter() .filter_map(|path| path.materialized_path.map(|mp| (path.id, path.pub_id, mp))) @@ -916,7 +975,7 @@ pub(super) async fn rename( sync.write_ops(db, (sync_params, db_params)).await?; - trace!("Updated {len} file_paths"); + trace!(%total_paths_count, "Updated file_paths;"); } let is_hidden = path_is_hidden(new_path, &new_path_metadata); @@ -979,12 +1038,13 @@ pub(super) async fn rename( Ok(()) } +#[instrument(skip_all, fields(path = %path.as_ref().display()), err)] pub(super) async fn remove( location_id: location::id::Type, - full_path: impl AsRef, + path: impl AsRef + Send, library: &Library, ) -> Result<(), LocationManagerError> { - let full_path = full_path.as_ref(); + let full_path = path.as_ref(); let location_path = extract_location_path(location_id, library).await?; // if it doesn't exist either way, then we don't care @@ -1005,16 +1065,22 @@ pub(super) async fn remove( remove_by_file_path(location_id, full_path, &file_path, library).await } -pub(super) async fn remove_by_file_path( +async fn remove_by_file_path( location_id: location::id::Type, - path: impl AsRef, + path: impl AsRef + Send, file_path: &file_path::Data, library: &Library, ) -> Result<(), LocationManagerError> { // check file still exists on disk match fs::metadata(path.as_ref()).await { Ok(_) => { - todo!("file has changed in some way, re-identify it") + // It's possible that in the interval of time between the removal file event being + // received and we reaching this point, the file has been created again for some + // external reason, so we just error out and hope to receive this new create event + // later + return Err(LocationManagerError::FileStillExistsOnDisk( + path.as_ref().into(), + )); } Err(e) if e.kind() == ErrorKind::NotFound => { let 
Library { sync, db, .. } = library; @@ -1060,9 +1126,10 @@ pub(super) async fn remove_by_file_path( Ok(()) } +#[instrument(skip_all, fields(path = %path.as_ref().display()), err)] pub(super) async fn extract_inode_from_path( location_id: location::id::Type, - path: impl AsRef, + path: impl AsRef + Send, library: &Library, ) -> Result { let path = path.as_ref(); @@ -1070,7 +1137,7 @@ pub(super) async fn extract_inode_from_path( .select(location::select!({ path })) .exec() .await? - .ok_or(LocationManagerError::MissingLocation(location_id))?; + .ok_or(LocationManagerError::LocationNotFound(location_id))?; let location_path = maybe_missing(&location.path, "location.path")?; @@ -1095,6 +1162,7 @@ pub(super) async fn extract_inode_from_path( ) } +#[instrument(skip_all, err)] pub(super) async fn extract_location_path( location_id: location::id::Type, library: &Library, @@ -1104,12 +1172,12 @@ pub(super) async fn extract_location_path( .exec() .await? .map_or( - Err(LocationManagerError::MissingLocation(location_id)), + Err(LocationManagerError::LocationNotFound(location_id)), // NOTE: The following usage of `PathBuf` doesn't incur a new allocation so it's fine |location| Ok(maybe_missing(location.path, "location.path")?.into()), ) } - +#[instrument(skip_all, err)] pub(super) async fn recalculate_directories_size( candidates: &mut HashMap, buffer: &mut Vec<(PathBuf, Instant)>, @@ -1129,7 +1197,7 @@ pub(super) async fn recalculate_directories_size( .select(location::select!({ path })) .exec() .await? - .ok_or(LocationManagerError::MissingLocation(location_id))? + .ok_or(LocationManagerError::LocationNotFound(location_id))? .path, "location.path", )?)) @@ -1138,12 +1206,29 @@ pub(super) async fn recalculate_directories_size( if let Some(location_path) = &location_path_cache { if path != *location_path { trace!( - "Reverse calculating directory sizes starting at {} until {}", - path.display(), - location_path.display(), + start_directory = %path.display(), + end_directory = %location_path.display(), + "Reverse calculating directory sizes;", ); - reverse_update_directories_sizes(path, location_id, location_path, library) - .await?; + let mut non_critical_errors = vec![]; + reverse_update_directories_sizes( + path, + location_id, + location_path, + &library.db, + &library.sync, + &mut non_critical_errors, + ) + .await + .map_err(sd_core_heavy_lifting::Error::from)?; + + if !non_critical_errors.is_empty() { + error!( + ?non_critical_errors, + "Reverse calculating directory sizes finished errors;", + ); + } + should_invalidate = true; } else { should_update_location_size = true; diff --git a/core/src/location/manager/watcher/windows.rs b/core/src/location/manager/watcher/windows.rs index f926f76b9..a9b24c54c 100644 --- a/core/src/location/manager/watcher/windows.rs +++ b/core/src/location/manager/watcher/windows.rs @@ -20,46 +20,41 @@ use std::{ sync::Arc, }; -use async_trait::async_trait; use notify::{ event::{CreateKind, ModifyKind, RenameMode}, Event, EventKind, }; use tokio::{fs, time::Instant}; -use tracing::{error, trace}; +use tracing::{error, instrument, trace}; use super::{ utils::{ create_dir, extract_inode_from_path, recalculate_directories_size, remove, rename, update_file, }, - EventHandler, INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, + INode, InstantAndPath, HUNDRED_MILLIS, ONE_SECOND, }; /// Windows file system event handler #[derive(Debug)] -pub(super) struct WindowsEventHandler<'lib> { +pub(super) struct EventHandler { location_id: location::id::Type, - library: &'lib Arc, - 
node: &'lib Arc, + library: Arc, + node: Arc, last_events_eviction_check: Instant, rename_from_map: BTreeMap, rename_to_map: BTreeMap, files_to_remove: HashMap, - files_to_remove_buffer: Vec<(INode, InstantAndPath)>, files_to_update: HashMap, reincident_to_update_files: HashMap, to_recalculate_size: HashMap, + path_and_instant_buffer: Vec<(PathBuf, Instant)>, + files_to_remove_buffer: Vec<(INode, InstantAndPath)>, } -#[async_trait] -impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { - fn new( - location_id: location::id::Type, - library: &'lib Arc, - node: &'lib Arc, - ) -> Self +impl super::EventHandler for EventHandler { + fn new(location_id: location::id::Type, library: Arc, node: Arc) -> Self where Self: Sized, { @@ -71,33 +66,51 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { rename_from_map: BTreeMap::new(), rename_to_map: BTreeMap::new(), files_to_remove: HashMap::new(), - files_to_remove_buffer: Vec::new(), files_to_update: HashMap::new(), reincident_to_update_files: HashMap::new(), to_recalculate_size: HashMap::new(), path_and_instant_buffer: Vec::new(), + files_to_remove_buffer: Vec::new(), } } + #[instrument( + skip_all, + fields( + location_id = %self.location_id, + library_id = %self.library.id, + rename_from_map_count = %self.rename_from_map.len(), + rename_to_map_count = %self.rename_to_map.len(), + files_to_remove_map = %self.files_to_remove.len(), + waiting_update_count = %self.files_to_update.len(), + reincident_to_update_files_count = %self.reincident_to_update_files.len(), + waiting_size_count = %self.to_recalculate_size.len(), + ), + )] async fn handle_event(&mut self, event: Event) -> Result<(), LocationManagerError> { - trace!("Received Windows event: {:#?}", event); + trace!("Received Windows event"); + let Event { kind, mut paths, .. } = event; match kind { EventKind::Create(CreateKind::Any) => { - let inode = match get_inode_from_path(&paths[0]).await { + let path = paths.remove(0); + + let inode = match get_inode_from_path(&path).await { Ok(inode) => inode, + Err(FilePathError::FileIO(FileIOError { source, .. 
})) if source.raw_os_error() == Some(32) => { // This is still being manipulated by another process, so we can just ignore it for now // as we will probably receive update events later - self.files_to_update.insert(paths.remove(0), Instant::now()); + self.files_to_update.insert(path, Instant::now()); return Ok(()); } + Err(e) => { return Err(e.into()); } @@ -109,24 +122,23 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { // so we can treat if just as a file rename, like in other OSes trace!( - "Got a rename instead of remove/create: {} -> {}", - old_path.display(), - paths[0].display(), + old_path = %old_path.display(), + new_path = %path.display(), + "Got a rename instead of remove/create;", ); // We found a new path for this old path, so we can rename it instead of removing and creating it rename( self.location_id, - &paths[0], + &path, &old_path, - fs::metadata(&paths[0]) + fs::metadata(&path) .await - .map_err(|e| FileIOError::from((&paths[0], e)))?, - self.library, + .map_err(|e| FileIOError::from((&path, e)))?, + &self.library, ) .await?; } else { - let path = paths.remove(0); let metadata = fs::metadata(&path) .await .map_err(|e| FileIOError::from((&path, e)))?; @@ -134,7 +146,7 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { if metadata.is_dir() { // Don't need to dispatch a recalculate directory event as `create_dir` dispatches // a `scan_location_sub_path` function, which recalculates the size already - create_dir(self.location_id, path, &metadata, self.node, self.library) + create_dir(self.location_id, path, &metadata, &self.node, &self.library) .await?; } else if self.files_to_update.contains_key(&path) { if let Some(old_instant) = @@ -149,8 +161,10 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { } } } + EventKind::Modify(ModifyKind::Any) => { let path = paths.remove(0); + if self.files_to_update.contains_key(&path) { if let Some(old_instant) = self.files_to_update.insert(path.clone(), Instant::now()) @@ -163,10 +177,11 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { self.files_to_update.insert(path, Instant::now()); } } + EventKind::Modify(ModifyKind::Name(RenameMode::From)) => { let path = paths.remove(0); - let inode = extract_inode_from_path(self.location_id, &path, self.library).await?; + let inode = extract_inode_from_path(self.location_id, &path, &self.library).await?; if let Some((_, new_path)) = self.rename_to_map.remove(&inode) { // We found a new path for this old path, so we can rename it @@ -177,13 +192,14 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { fs::metadata(&new_path) .await .map_err(|e| FileIOError::from((&new_path, e)))?, - self.library, + &self.library, ) .await?; } else { self.rename_from_map.insert(inode, (Instant::now(), path)); } } + EventKind::Modify(ModifyKind::Name(RenameMode::To)) => { let path = paths.remove(0); @@ -198,23 +214,25 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { fs::metadata(&path) .await .map_err(|e| FileIOError::from((&path, e)))?, - self.library, + &self.library, ) .await?; } else { self.rename_to_map.insert(inode, (Instant::now(), path)); } } + EventKind::Remove(_) => { let path = paths.remove(0); + self.files_to_remove.insert( - extract_inode_from_path(self.location_id, &path, self.library).await?, + extract_inode_from_path(self.location_id, &path, &self.library).await?, (Instant::now(), path), ); } - other_event_kind => { - trace!("Other Windows event that we don't handle for now: {other_event_kind:#?}"); + _ => { + 
trace!("Other Windows event that we don't handle for now"); } } @@ -224,26 +242,34 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { async fn tick(&mut self) { if self.last_events_eviction_check.elapsed() > HUNDRED_MILLIS { if let Err(e) = self.handle_to_update_eviction().await { - error!("Error while handling recently created or update files eviction: {e:#?}"); + error!( + ?e, + "Error while handling recently created or update files eviction;" + ); } self.rename_from_map.retain(|_, (created_at, path)| { let to_retain = created_at.elapsed() < HUNDRED_MILLIS; + if !to_retain { - trace!("Removing from rename from map: {:#?}", path.display()) + trace!(path = %path.display(), "Removing from rename from map;") } + to_retain }); + self.rename_to_map.retain(|_, (created_at, path)| { let to_retain = created_at.elapsed() < HUNDRED_MILLIS; + if !to_retain { - trace!("Removing from rename to map: {:#?}", path.display()) + trace!(path = %path.display(), "Removing from rename to map;") } + to_retain }); if let Err(e) = self.handle_removes_eviction().await { - error!("Failed to remove file_path: {e:#?}"); + error!(?e, "Failed to remove file_path;"); } if !self.to_recalculate_size.is_empty() { @@ -251,11 +277,11 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { &mut self.to_recalculate_size, &mut self.path_and_instant_buffer, self.location_id, - self.library, + &self.library, ) .await { - error!("Failed to recalculate directories size: {e:#?}"); + error!(?e, "Failed to recalculate directories size;"); } } @@ -264,9 +290,10 @@ impl<'lib> EventHandler<'lib> for WindowsEventHandler<'lib> { } } -impl WindowsEventHandler<'_> { +impl EventHandler { async fn handle_to_update_eviction(&mut self) -> Result<(), LocationManagerError> { self.path_and_instant_buffer.clear(); + let mut should_invalidate = false; for (path, created_at) in self.files_to_update.drain() { @@ -274,14 +301,16 @@ impl WindowsEventHandler<'_> { self.path_and_instant_buffer.push((path, created_at)); } else { self.reincident_to_update_files.remove(&path); + handle_update( self.location_id, &path, - self.node, + &self.node, &mut self.to_recalculate_size, - self.library, + &self.library, ) .await?; + should_invalidate = true; } } @@ -299,14 +328,16 @@ impl WindowsEventHandler<'_> { self.path_and_instant_buffer.push((path, created_at)); } else { self.files_to_update.remove(&path); + handle_update( self.location_id, &path, - self.node, + &self.node, &mut self.to_recalculate_size, - self.library, + &self.library, ) .await?; + should_invalidate = true; } } @@ -323,6 +354,7 @@ impl WindowsEventHandler<'_> { async fn handle_removes_eviction(&mut self) -> Result<(), LocationManagerError> { self.files_to_remove_buffer.clear(); + let mut should_invalidate = false; for (inode, (instant, path)) in self.files_to_remove.drain() { @@ -333,9 +365,12 @@ impl WindowsEventHandler<'_> { .insert(parent.to_path_buf(), Instant::now()); } } - remove(self.location_id, &path, self.library).await?; + + remove(self.location_id, &path, &self.library).await?; + should_invalidate = true; - trace!("Removed file_path due timeout: {}", path.display()); + + trace!(path = %path.display(), "Removed file_path due timeout;"); } else { self.files_to_remove_buffer.push((inode, (instant, path))); } @@ -344,30 +379,31 @@ impl WindowsEventHandler<'_> { invalidate_query!(self.library, "search.paths"); } - for (key, value) in self.files_to_remove_buffer.drain(..) 
{ - self.files_to_remove.insert(key, value); - } + self.files_to_remove + .extend(self.files_to_remove_buffer.drain(..)); Ok(()) } } -async fn handle_update<'lib>( +async fn handle_update( location_id: location::id::Type, path: &PathBuf, - node: &'lib Arc, + node: &Arc, to_recalculate_size: &mut HashMap, - library: &'lib Arc, + library: &Arc, ) -> Result<(), LocationManagerError> { let metadata = fs::metadata(&path) .await .map_err(|e| FileIOError::from((&path, e)))?; + if metadata.is_file() { if let Some(parent) = path.parent() { if parent != Path::new("") { to_recalculate_size.insert(parent.to_path_buf(), Instant::now()); } } + update_file(location_id, path, node, library).await?; } diff --git a/core/src/location/metadata.rs b/core/src/location/metadata.rs index 7a8ffade5..ecbfabe42 100644 --- a/core/src/location/metadata.rs +++ b/core/src/location/metadata.rs @@ -56,10 +56,12 @@ impl SpacedriveLocationMetadataFile { #[cfg(debug_assertions)] { error!( + metadata_file_name = %metadata_file_name.display(), + ?e, "Failed to deserialize corrupted metadata file, \ - we will remove it and create a new one; File: {}; Error: {e}", - metadata_file_name.display() + we will remove it and create a new one;", ); + fs::remove_file(&metadata_file_name).await.map_err(|e| { LocationMetadataError::Delete( e, diff --git a/core/src/location/mod.rs b/core/src/location/mod.rs index b4a61566d..d613aca4e 100644 --- a/core/src/location/mod.rs +++ b/core/src/location/mod.rs @@ -1,18 +1,16 @@ -use crate::{ - invalidate_query, - library::Library, - object::{ - media::{old_media_processor, OldMediaProcessorJobInit}, - old_file_identifier::{self, old_file_identifier_job::OldFileIdentifierJobInit}, - }, - old_job::{JobBuilder, JobError, JobManagerError}, - Node, -}; +use crate::{context::NodeContext, invalidate_query, library::Library, Node}; use sd_core_file_path_helper::{ filter_existing_file_path_params, IsolatedFilePathData, IsolatedFilePathDataParts, }; -use sd_core_prisma_helpers::location_with_indexer_rules; +use sd_core_heavy_lifting::{ + file_identifier::{self, FileIdentifier}, + indexer::{self, job::Indexer}, + job_system::report::ReportInputMetadata, + media_processor::{self, job::MediaProcessor}, + JobEnqueuer, JobId, +}; +use sd_core_prisma_helpers::{location_with_indexer_rules, CasId}; use sd_prisma::{ prisma::{file_path, indexer_rules_in_location, location, PrismaClient}, @@ -22,7 +20,7 @@ use sd_sync::*; use sd_utils::{ db::{maybe_missing, MissingFieldError}, error::{FileIOError, NonUtf8PathError}, - msgpack, + msgpack, uuid_to_bytes, }; use std::{ @@ -36,20 +34,17 @@ use futures::future::TryFutureExt; use normpath::PathExt; use prisma_client_rust::{operator::and, or, QueryError}; use serde::{Deserialize, Serialize}; -use serde_json::json; use specta::Type; use tokio::{fs, io, time::Instant}; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, instrument, warn}; use uuid::Uuid; mod error; -pub mod indexer; mod manager; pub mod metadata; pub mod non_indexed; pub use error::LocationError; -use indexer::OldIndexerJobInit; pub use manager::{LocationManagerError, Locations}; use metadata::SpacedriveLocationMetadataFile; @@ -81,7 +76,7 @@ impl TryFrom for ScanState { /// `LocationCreateArgs` is the argument received from the client using `rspc` to create a new location. /// It has the actual path and a vector of indexer rules ids, to create many-to-many relationships /// between the location and indexer rules. 
-#[derive(Type, Deserialize)] +#[derive(Debug, Type, Deserialize)] pub struct LocationCreateArgs { pub path: PathBuf, pub dry_run: bool, @@ -89,6 +84,7 @@ pub struct LocationCreateArgs { } impl LocationCreateArgs { + #[instrument(skip(node, library), err)] pub async fn create( self, node: &Node, @@ -159,13 +155,12 @@ impl LocationCreateArgs { } debug!( - "{} new location for '{}'", + "{} new location", if self.dry_run { "Dry run: Would create" } else { "Trying to create" - }, - self.path.display() + } ); let uuid = Uuid::new_v4(); @@ -180,8 +175,10 @@ impl LocationCreateArgs { .await?; if let Some(location) = location { + info!(location_name = ?location.name, "Created location;"); + // Write location metadata to a .spacedrive file - if let Err(err) = SpacedriveLocationMetadataFile::create_and_save( + if let Err(e) = SpacedriveLocationMetadataFile::create_and_save( library.id, uuid, &self.path, @@ -197,19 +194,18 @@ impl LocationCreateArgs { .await { // DISABLED TO FAIL SILENTLY - HOTFIX FOR LACK OF WRITE PERMISSION PREVENTING LOCATION CREATION - error!("Failed to write .spacedrive file: {:?}", err); + error!(?e, "Failed to write .spacedrive file;"); // delete_location(node, library, location.data.id).await?; - // Err(err)?; + // Err(e)?; } - info!("Created location: {:?}", &location.data); - Ok(Some(location.data)) } else { Ok(None) } } + #[instrument(skip(node, library), fields(library_id = %library.id), err)] pub async fn add_library( self, node: &Node, @@ -242,14 +238,12 @@ impl LocationCreateArgs { } debug!( - "{} a new Library to an already existing location '{}'", + "{} a new Library to an already existing location", if self.dry_run { "Dry run: Would add" } else { "Trying to add" }, - library.id, - self.path.display() ); let uuid = Uuid::new_v4(); @@ -272,10 +266,7 @@ impl LocationCreateArgs { .add(location.data.id, library.clone()) .await?; - info!( - "Added library (library_id = {}) to location: {:?}", - library.id, &location.data - ); + info!(location_id = %location.data.id, "Added library to location;"); Ok(Some(location.data)) } else { @@ -461,151 +452,192 @@ async fn link_location_and_indexer_rules( Ok(()) } +#[instrument( + skip(node, library, location), + fields(library_id = %library.id, location_id = %location.id), + err, +)] pub async fn scan_location( node: &Arc, library: &Arc, location: location_with_indexer_rules::Data, location_scan_state: ScanState, -) -> Result<(), JobManagerError> { +) -> Result, sd_core_heavy_lifting::Error> { // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. 
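	// The location stores the `instance_id` of the library instance that registered it; when it
	// doesn't match this node's instance, no scan jobs are dispatched and the caller receives `Ok(None)`.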
if location.instance_id != Some(library.config().await.instance_id) { - return Ok(()); + warn!("Tried to scan a location on a different instance"); + return Ok(None); } + let location_id = location.id; + let ctx = NodeContext { + node: Arc::clone(node), + library: Arc::clone(library), + }; + let location_base_data = location::Data::from(&location); - debug!("Scanning location with state: {location_scan_state:?}"); + debug!("Scanning location"); - match location_scan_state { + let job_id = match location_scan_state { ScanState::Pending | ScanState::Completed => { - JobBuilder::new(OldIndexerJobInit { - location, - sub_path: None, - }) - .with_action("scan_location") - .with_metadata(json!({"location": location_base_data.clone()})) - .build() - .queue_next(OldFileIdentifierJobInit { - location: location_base_data.clone(), - sub_path: None, - }) - .queue_next(OldMediaProcessorJobInit { - location: location_base_data, - sub_path: None, - regenerate_thumbnails: false, - regenerate_labels: false, - }) - .spawn(node, library) - .await + node.job_system + .dispatch( + JobEnqueuer::new(Indexer::new(location, None)?) + .with_action("scan_location") + .with_metadata(ReportInputMetadata::Location(location_base_data.clone())) + .enqueue_next(FileIdentifier::new(location_base_data.clone(), None)?) + .enqueue_next(MediaProcessor::new(location_base_data, None, false)?), + location_id, + ctx.clone(), + ) + .await? } ScanState::Indexed => { - JobBuilder::new(OldFileIdentifierJobInit { - location: location_base_data.clone(), - sub_path: None, - }) - .with_action("scan_location_already_indexed") - .with_metadata(json!({"location": location_base_data.clone()})) - .build() - .queue_next(OldMediaProcessorJobInit { - location: location_base_data, - sub_path: None, - regenerate_thumbnails: false, - regenerate_labels: false, - }) - .spawn(node, library) - .await + node.job_system + .dispatch( + JobEnqueuer::new(FileIdentifier::new(location_base_data.clone(), None)?) + .with_action("scan_location_already_indexed") + .with_metadata(ReportInputMetadata::Location(location_base_data.clone())) + .enqueue_next(MediaProcessor::new(location_base_data, None, false)?), + location_id, + ctx.clone(), + ) + .await? } ScanState::FilesIdentified => { - JobBuilder::new(OldMediaProcessorJobInit { - location: location_base_data.clone(), - sub_path: None, - regenerate_thumbnails: false, - regenerate_labels: false, - }) - .with_action("scan_location_files_already_identified") - .with_metadata(json!({"location": location_base_data})) - .build() - .spawn(node, library) - .await + node.job_system + .dispatch( + JobEnqueuer::new(MediaProcessor::new( + location_base_data.clone(), + None, + false, + )?) + .with_action("scan_location_files_already_identified") + .with_metadata(ReportInputMetadata::Location(location_base_data)), + location_id, + ctx.clone(), + ) + .await? } - } - .map_err(Into::into) + }; + + Ok(Some(job_id)) } +#[instrument( + skip_all, + fields( + library_id = %library.id, + location_id = %location.id, + sub_path = %sub_path.as_ref().display(), + ), + err, +)] pub async fn scan_location_sub_path( node: &Arc, library: &Arc, location: location_with_indexer_rules::Data, - sub_path: impl AsRef, -) -> Result<(), JobManagerError> { + sub_path: impl AsRef + Send, +) -> Result, sd_core_heavy_lifting::Error> { let sub_path = sub_path.as_ref().to_path_buf(); // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. 
if location.instance_id != Some(library.config().await.instance_id) { - return Ok(()); + warn!("Tried to scan a location on a different instance"); + return Ok(None); } + let location_id = location.id; + let ctx = NodeContext { + node: Arc::clone(node), + library: Arc::clone(library), + }; + let location_base_data = location::Data::from(&location); - JobBuilder::new(OldIndexerJobInit { - location, - sub_path: Some(sub_path.clone()), - }) - .with_action("scan_location_sub_path") - .with_metadata(json!({ - "location": location_base_data.clone(), - "sub_path": sub_path.clone(), - })) - .build() - .queue_next(OldFileIdentifierJobInit { - location: location_base_data.clone(), - sub_path: Some(sub_path.clone()), - }) - .queue_next(OldMediaProcessorJobInit { - location: location_base_data, - sub_path: Some(sub_path), - regenerate_thumbnails: false, - regenerate_labels: false, - }) - .spawn(node, library) - .await - .map_err(Into::into) + debug!("Scanning location on a sub path"); + + node.job_system + .dispatch( + JobEnqueuer::new(Indexer::new(location, Some(sub_path.clone()))?) + .with_action("scan_location") + .with_metadata(ReportInputMetadata::Location(location_base_data.clone())) + .with_metadata(ReportInputMetadata::SubPath(sub_path.clone())) + .enqueue_next(FileIdentifier::new( + location_base_data.clone(), + Some(sub_path.clone()), + )?) + .enqueue_next(MediaProcessor::new( + location_base_data, + Some(sub_path), + false, + )?), + location_id, + ctx.clone(), + ) + .await + .map_err(Into::into) + .map(Some) } +#[instrument( + skip_all, + fields( + library_id = %library.id, + location_id = %location.id, + sub_path = %sub_path.as_ref().display(), + ), + err, +)] pub async fn light_scan_location( node: Arc, library: Arc, location: location_with_indexer_rules::Data, sub_path: impl AsRef, -) -> Result<(), JobError> { +) -> Result<(), sd_core_heavy_lifting::Error> { let sub_path = sub_path.as_ref().to_path_buf(); // TODO(N): This isn't gonna work with removable media and this will likely permanently break if the DB is restored from a backup. if location.instance_id != Some(library.config().await.instance_id) { + warn!("Tried to scan a location on a different instance"); return Ok(()); } let location_base_data = location::Data::from(&location); - indexer::old_shallow(&location, &sub_path, &node, &library).await?; - old_file_identifier::old_shallow(&location_base_data, &sub_path, &library).await?; - old_media_processor::old_shallow( - &location_base_data, - &sub_path, - &library, - #[cfg(feature = "ai")] - false, - &node, - ) - .await?; + let dispatcher = node.task_system.get_dispatcher(); + let ctx = NodeContext { node, library }; + + for e in indexer::shallow(location, &sub_path, &dispatcher, &ctx).await? { + error!(?e, "Shallow indexer errors;"); + } + + for e in + file_identifier::shallow(location_base_data.clone(), &sub_path, &dispatcher, &ctx).await? + { + error!(?e, "Shallow file identifier errors;"); + } + + for e in media_processor::shallow(location_base_data, &sub_path, &dispatcher, &ctx).await? { + error!(?e, "Shallow media processor errors;"); + } Ok(()) } +#[instrument( + skip_all, + fields( + library_id = %id, + location_path = %location_path.as_ref().display(), + ), + err, +)] pub async fn relink_location( Library { db, id, sync, .. }: &Library, location_path: impl AsRef, -) -> Result { +) -> Result { let location_path = location_path.as_ref(); let mut metadata = SpacedriveLocationMetadataFile::try_load(&location_path) .await? 
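For orientation, a minimal caller-side sketch of the new scan entry points above. It assumes this module's imports plus an already obtained `Node`, `Library`, and `location_with_indexer_rules::Data`; the names `rescan_example` and `"some/sub/dir"` are invented for illustration, so treat this as a sketch of the API shape rather than an actual call site from this PR.

// Hypothetical usage of the signatures shown in this file (illustrative only).
async fn rescan_example(
	node: &Arc<Node>,
	library: &Arc<Library>,
	location: location_with_indexer_rules::Data,
) -> Result<(), sd_core_heavy_lifting::Error> {
	// `scan_location` now reports the enqueued job chain, or `None` when the
	// location belongs to a different instance and nothing was dispatched.
	if let Some(job_id) =
		scan_location(node, library, location.clone(), ScanState::Pending).await?
	{
		debug!(%job_id, "Dispatched indexer -> file identifier -> media processor chain;");
	}

	// Shallow re-scan of a single directory, bypassing the job system.
	light_scan_location(
		Arc::clone(node),
		Arc::clone(library),
		location,
		Path::new("some/sub/dir"),
	)
	.await
}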
@@ -613,7 +645,7 @@ pub async fn relink_location( metadata.relink(*id, location_path).await?; - let pub_id = >::as_ref(&metadata.location_pub_id(*id)?).to_vec(); + let pub_id = uuid_to_bytes(&metadata.location_pub_id(*id)?); let path = location_path .to_str() .map(str::to_string) @@ -794,6 +826,7 @@ async fn create_location( })) } +#[instrument(skip(node, library), fields(library_id = %library.id), err)] pub async fn delete_location( node: &Node, library: &Arc, @@ -803,17 +836,11 @@ pub async fn delete_location( let start = Instant::now(); node.locations.remove(location_id, library.clone()).await?; - debug!( - "Elapsed time to remove location from node: {:?}", - start.elapsed() - ); + debug!(elapsed_time = ?start.elapsed(), "Removed location from node;"); let start = Instant::now(); delete_directory(library, location_id, None).await?; - debug!( - "Elapsed time to delete location file paths: {:?}", - start.elapsed() - ); + debug!(elapsed_time = ?start.elapsed(), "Deleted location file paths;"); let location = library .db @@ -846,10 +873,7 @@ pub async fn delete_location( } } } - debug!( - "Elapsed time to remove location metadata: {:?}", - start.elapsed() - ); + debug!(elapsed_time = ?start.elapsed(), "Removed location metadata;"); let start = Instant::now(); @@ -861,10 +885,7 @@ pub async fn delete_location( )]) .exec() .await?; - debug!( - "Elapsed time to delete indexer rules in location: {:?}", - start.elapsed() - ); + debug!(elapsed_time = ?start.elapsed(), "Deleted indexer rules in location;"); let start = Instant::now(); @@ -877,20 +898,18 @@ pub async fn delete_location( ) .await?; - debug!( - "Elapsed time to delete location from db: {:?}", - start.elapsed() - ); + debug!(elapsed_time = ?start.elapsed(), "Deleted location from db;"); invalidate_query!(library, "locations.list"); - info!("Location {location_id} deleted"); + info!("Location deleted"); Ok(()) } /// Will delete a directory recursively with Objects if left as orphans /// this function is used to delete a location and when ingesting directory deletion events +#[instrument(skip_all, err)] pub async fn delete_directory( library: &Library, location_id: location::id::Type, @@ -924,6 +943,7 @@ pub async fn delete_directory( Ok(()) } +#[instrument(skip_all, err)] async fn check_nested_location( location_path: impl AsRef, db: &PrismaClient, @@ -956,8 +976,8 @@ async fn check_nested_location( let is_a_child_location = potential_children.into_iter().any(|v| { let Some(location_path) = v.path else { warn!( - "Missing location path on location at check nested location", - v.id + location_id = %v.id, + "Missing location path on location at check nested location", ); return false; }; @@ -980,6 +1000,7 @@ async fn check_nested_location( Ok(parents_count > 0 || is_a_child_location) } +#[instrument(skip_all, err)] pub async fn update_location_size( location_id: location::id::Type, library: &Library, @@ -1028,6 +1049,7 @@ pub async fn update_location_size( Ok(()) } +#[instrument(skip_all, err)] pub async fn get_location_path_from_location_id( db: &PrismaClient, location_id: file_path::id::Type, @@ -1049,6 +1071,7 @@ pub async fn get_location_path_from_location_id( }) } +#[instrument(skip_all, err)] pub async fn create_file_path( crate::location::Library { db, sync, .. }: &crate::location::Library, IsolatedFilePathDataParts { @@ -1059,7 +1082,7 @@ pub async fn create_file_path( extension, .. 
}: IsolatedFilePathDataParts<'_>, - cas_id: Option, + cas_id: Option>, metadata: sd_core_file_path_helper::FilePathMetadata, ) -> Result { use sd_utils::db::inode_to_db; @@ -1091,7 +1114,10 @@ pub async fn create_file_path( ), location::connect(prisma::location::id::equals(location.id)), ), - ((cas_id::NAME, msgpack!(cas_id)), cas_id::set(cas_id)), + ( + (cas_id::NAME, msgpack!(cas_id)), + cas_id::set(cas_id.map(Into::into)), + ), ( (materialized_path::NAME, msgpack!(materialized_path)), materialized_path::set(Some(materialized_path.into())), @@ -1134,7 +1160,7 @@ pub async fn create_file_path( .unzip() }; - let pub_id = sd_utils::uuid_to_bytes(Uuid::new_v4()); + let pub_id = sd_utils::uuid_to_bytes(&Uuid::new_v4()); let created_path = sync .write_ops( diff --git a/core/src/location/non_indexed.rs b/core/src/location/non_indexed.rs index 9050d8f0c..04d040f1c 100644 --- a/core/src/location/non_indexed.rs +++ b/core/src/location/non_indexed.rs @@ -1,17 +1,16 @@ -use crate::{ - api::locations::ExplorerItem, - library::Library, - object::{ - cas::generate_cas_id, - media::old_thumbnail::{get_ephemeral_thumb_key, BatchToProcess, GenerateThumbnailArgs}, - }, - Node, -}; +use crate::{api::locations::ExplorerItem, context::NodeContext, library::Library, Node}; use sd_core_file_path_helper::{path_is_hidden, MetadataExt}; +use sd_core_heavy_lifting::{ + file_identifier::generate_cas_id, + media_processor::{ + self, get_thumbnails_directory, thumbnailer::NewThumbnailReporter, GenerateThumbnailArgs, + NewThumbnailsReporter, ThumbKey, + }, +}; use sd_core_indexer_rules::{ seed::{NO_HIDDEN, NO_SYSTEM_FILES}, - IndexerRule, RuleKind, + IndexerRule, IndexerRuler, RulerDecision, }; use sd_file_ext::{extensions::Extension, kind::ObjectKind}; @@ -28,14 +27,14 @@ use std::{ use chrono::{DateTime, Utc}; use futures::Stream; -use itertools::Either; +use itertools::{Either, Itertools}; use rspc::ErrorCode; use serde::Serialize; use specta::Type; use thiserror::Error; -use tokio::{io, sync::mpsc, task::JoinError}; +use tokio::{io, spawn, sync::mpsc, task::JoinError}; use tokio_stream::wrappers::ReceiverStream; -use tracing::{error, span, warn, Level}; +use tracing::{debug, error, span, warn, Level}; use super::normalize_path; @@ -64,12 +63,12 @@ impl From> for NonIndexedLocationError { } impl From for rspc::Error { - fn from(err: NonIndexedLocationError) -> Self { - match err { + fn from(e: NonIndexedLocationError) -> Self { + match e { NonIndexedLocationError::NotFound(_) => { - rspc::Error::with_cause(ErrorCode::NotFound, err.to_string(), err) + rspc::Error::with_cause(ErrorCode::NotFound, e.to_string(), e) } - _ => rspc::Error::with_cause(ErrorCode::InternalServerError, err.to_string(), err), + _ => rspc::Error::with_cause(ErrorCode::InternalServerError, e.to_string(), e), } } } @@ -121,12 +120,12 @@ pub async fn walk( let tx2 = tx.clone(); // We wanna process and let the caller use the stream. 
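	// A self-contained illustration (not Spacedrive code) of the streaming pattern used just
	// below: `walk()` spawns a producer task that feeds an mpsc channel while the caller
	// consumes the receiving half as a `Stream` via `ReceiverStream`. All names here are
	// invented for the example; only the shape mirrors the real code.
	fn example_stream_of_numbers() -> ReceiverStream<u32> {
		let (example_tx, example_rx) = mpsc::channel(16);

		spawn(async move {
			for n in 0..10u32 {
				// Stop producing as soon as the consumer drops the stream.
				if example_tx.send(n).await.is_err() {
					break;
				}
			}
		});

		ReceiverStream::new(example_rx)
	}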
- let task = tokio::spawn(async move { + let task = spawn(async move { let path = &path; - let rules = chain_optional_iter( + let indexer_ruler = IndexerRuler::new(chain_optional_iter( [IndexerRule::from(NO_SYSTEM_FILES.deref())], [(!with_hidden_files).then(|| IndexerRule::from(NO_HIDDEN.deref()))], - ); + )); let mut thumbnails_to_generate = vec![]; // Generating thumbnails for PDFs is kinda slow, so we're leaving them for last in the batch @@ -145,21 +144,21 @@ pub async fn walk( } }; - match IndexerRule::apply_all(&rules, &entry_path).await { - Ok(rule_results) => { - // No OS Protected and No Hidden rules, must always be from this kind, should panic otherwise - if rule_results[&RuleKind::RejectFilesByGlob] - .iter() - .any(|reject| !reject) - { - continue; - } + match indexer_ruler + .evaluate_path(&entry_path, &entry.metadata) + .await + { + Ok(RulerDecision::Accept) => { /* Everything is awesome! */ } + + Ok(RulerDecision::Reject) => { + continue; } + Err(e) => { tx.send(Err(Either::Left(e.into()))).await?; continue; } - }; + } if entry.metadata.is_dir() { directories.push((entry_path, name, entry.metadata)); @@ -170,7 +169,7 @@ pub async fn walk( .file_stem() .and_then(|s| s.to_str().map(str::to_string)) else { - warn!("Failed to extract name from path: {}", &entry_path); + warn!(%entry_path, "Failed to extract name from path;"); continue; }; @@ -222,12 +221,12 @@ pub async fn walk( )); } - ( - Some(get_ephemeral_thumb_key(&cas_id)), - node.ephemeral_thumbnail_exists(&cas_id) - .await - .map_err(NonIndexedLocationError::from)?, - ) + let thumb_exists = node + .ephemeral_thumbnail_exists(&cas_id) + .await + .map_err(NonIndexedLocationError::from)?; + + (Some(ThumbKey::new_ephemeral(cas_id)), thumb_exists) } else { (None, false) } @@ -256,13 +255,35 @@ pub async fn walk( thumbnails_to_generate.extend(document_thumbnails_to_generate); - node.thumbnailer - .new_ephemeral_thumbnails_batch(BatchToProcess::new( - thumbnails_to_generate, - false, - false, - )) - .await; + let thumbnails_directory = Arc::new(get_thumbnails_directory(node.config.data_directory())); + let reporter: Arc = Arc::new(NewThumbnailsReporter { + ctx: NodeContext { + node: Arc::clone(&node), + library: Arc::clone(&library), + }, + }); + + if node + .task_system + .dispatch_many( + thumbnails_to_generate + .into_iter() + .chunks(10) + .into_iter() + .map(|chunk| { + media_processor::Thumbnailer::new_ephemeral( + Arc::clone(&thumbnails_directory), + chunk.collect(), + Arc::clone(&reporter), + ) + }) + .collect::>(), + ) + .await + .is_err() + { + debug!("Task system shutting down"); + } let mut locations = library .db @@ -311,13 +332,13 @@ pub async fn walk( Ok::<_, NonIndexedLocationError>(()) }); - tokio::spawn(async move { + spawn(async move { match task.await { Ok(Ok(())) => {} Ok(Err(e)) => { let _ = tx2.send(Err(Either::Left(e.into()))).await; } - Err(e) => error!("error joining tokio task: {}", e), + Err(e) => error!(?e, "error joining tokio task"), } }); diff --git a/core/src/node/config.rs b/core/src/node/config.rs index a908ee502..17098ca7c 100644 --- a/core/src/node/config.rs +++ b/core/src/node/config.rs @@ -1,6 +1,6 @@ use crate::{ api::{notifications::Notification, BackendFeature}, - object::media::old_thumbnail::preferences::ThumbnailerPreferences, + /*object::media::old_thumbnail::preferences::ThumbnailerPreferences,*/ util::version_manager::{Kind, ManagedVersion, VersionManager, VersionManagerError}, }; @@ -169,7 +169,8 @@ mod identity_serde { #[derive(Debug, Clone, Serialize, Deserialize, Default, 
PartialEq, Eq, Type)] pub struct NodePreferences { - pub thumbnailer: ThumbnailerPreferences, + // pub thumbnailer: ThumbnailerPreferences, + // TODO(fogodev): introduce preferences to choose how many workers the task system should have } #[derive( @@ -193,7 +194,11 @@ impl ManagedVersion for NodeConfig { // SAFETY: This is just for display purposes so it doesn't matter if it's lossy Ok(hostname) => hostname.to_string_lossy().into_owned(), Err(e) => { - error!("Falling back to default node name as an error occurred getting your systems hostname: '{e:#?}'"); + error!( + ?e, + "Falling back to default node name as an error occurred getting your system's hostname;", + ); + "my-spacedrive".into() } }; @@ -309,7 +314,7 @@ impl NodeConfig { } _ => { - error!("Node config version is not handled: {:?}", current); + error!(current_version = ?current, "Node config version is not handled;"); return Err(VersionManagerError::UnexpectedMigration { current_version: current.int_value(), next_version: next.int_value(), @@ -378,11 +383,6 @@ impl Manager { self.config.read().await.clone() } - /// get a node config preferences watcher receiver - pub(crate) fn preferences_watcher(&self) -> watch::Receiver<NodePreferences> { - self.preferences_watcher_tx.subscribe() - } - /// data_directory returns the path to the directory storing the configuration data. pub(crate) fn data_directory(&self) -> PathBuf { self.data_directory_path.clone() diff --git a/core/src/object/cas.rs b/core/src/object/cas.rs deleted file mode 100644 index 43a1be0f4..000000000 --- a/core/src/object/cas.rs +++ /dev/null @@ -1,62 +0,0 @@ -use std::path::Path; - -use blake3::Hasher; -use static_assertions::const_assert; -use tokio::{ - fs::{self, File}, - io::{self, AsyncReadExt, AsyncSeekExt, SeekFrom}, -}; - -const SAMPLE_COUNT: u64 = 4; -const SAMPLE_SIZE: u64 = 1024 * 10; -const HEADER_OR_FOOTER_SIZE: u64 = 1024 * 8; - -// minimum file size of 100KiB, to avoid sample hashing for small files as they can be smaller than the total sample size -const MINIMUM_FILE_SIZE: u64 = 1024 * 100; - -// Asserting that nobody messed up our consts -const_assert!((HEADER_OR_FOOTER_SIZE * 2 + SAMPLE_COUNT * SAMPLE_SIZE) < MINIMUM_FILE_SIZE); - -// Asserting that the sample size is larger than header/footer size, as the same buffer is used for both -const_assert!(SAMPLE_SIZE > HEADER_OR_FOOTER_SIZE); - -pub async fn generate_cas_id(path: impl AsRef<Path>, size: u64) -> Result<String, io::Error> { - let mut hasher = Hasher::new(); - hasher.update(&size.to_le_bytes()); - - if size <= MINIMUM_FILE_SIZE { - // For small files, we hash the whole file - hasher.update(&fs::read(path).await?); - } else { - let mut file = File::open(path).await?; - let mut buf = vec![0; SAMPLE_SIZE as usize].into_boxed_slice(); - - // Hashing the header - let mut current_pos = file - .read_exact(&mut buf[..HEADER_OR_FOOTER_SIZE as usize]) - .await? 
as u64; - hasher.update(&buf[..HEADER_OR_FOOTER_SIZE as usize]); - - // Sample hashing the inner content of the file - let seek_jump = (size - HEADER_OR_FOOTER_SIZE * 2) / SAMPLE_COUNT; - loop { - file.read_exact(&mut buf).await?; - hasher.update(&buf); - - if current_pos >= (HEADER_OR_FOOTER_SIZE + seek_jump * (SAMPLE_COUNT - 1)) { - break; - } - - current_pos = file.seek(SeekFrom::Start(current_pos + seek_jump)).await?; - } - - // Hashing the footer - file.seek(SeekFrom::End(-(HEADER_OR_FOOTER_SIZE as i64))) - .await?; - file.read_exact(&mut buf[..HEADER_OR_FOOTER_SIZE as usize]) - .await?; - hasher.update(&buf[..HEADER_OR_FOOTER_SIZE as usize]); - } - - Ok(hasher.finalize().to_hex()[..16].to_string()) -} diff --git a/core/src/object/fs/old_copy.rs b/core/src/object/fs/old_copy.rs index cbe4c4f0a..41b1d1b1a 100644 --- a/core/src/object/fs/old_copy.rs +++ b/core/src/object/fs/old_copy.rs @@ -172,8 +172,8 @@ impl StatefulJob for OldFileCopierJobInit { Err(FileSystemJobsError::FilePathNotFound(path)) => { // FilePath doesn't exist in the database, it possibly wasn't indexed, so we skip it warn!( - "Skipping duplicating {} as it wasn't indexed", - path.display() + path = %path.display(), + "Skipping duplicating as it wasn't indexed;", ); } Err(e) => return Err(e.into()), @@ -208,9 +208,9 @@ impl StatefulJob for OldFileCopierJobInit { } Err(e) if e.kind() == io::ErrorKind::NotFound => { trace!( - "Copying from {} to {}", - source_file_data.full_path.display(), - target_full_path.display() + source = %source_file_data.full_path.display(), + target = %target_full_path.display(), + "Copying source -> target;", ); fs::copy(&source_file_data.full_path, &target_full_path) diff --git a/core/src/object/fs/old_cut.rs b/core/src/object/fs/old_cut.rs index 4135dc631..887d92b02 100644 --- a/core/src/object/fs/old_cut.rs +++ b/core/src/object/fs/old_cut.rs @@ -99,8 +99,8 @@ impl StatefulJob for OldFileCutterJobInit { match fs::metadata(&full_output).await { Ok(_) => { warn!( - "Skipping {} as it would be overwritten", - full_output.display() + output_path = %full_output.display(), + "Skipping as it would be overwritten;", ); Ok(JobRunErrors(vec![FileSystemJobsError::WouldOverwrite( @@ -111,9 +111,9 @@ impl StatefulJob for OldFileCutterJobInit { } Err(e) if e.kind() == io::ErrorKind::NotFound => { trace!( - "Cutting {} to {}", - file_data.full_path.display(), - full_output.display() + source = %file_data.full_path.display(), + target = %full_output.display(), + "Cutting source -> target;", ); fs::rename(&file_data.full_path, &full_output) diff --git a/core/src/object/fs/old_delete.rs b/core/src/object/fs/old_delete.rs index 0e33000d1..0b05a6103 100644 --- a/core/src/object/fs/old_delete.rs +++ b/core/src/object/fs/old_delete.rs @@ -84,9 +84,10 @@ impl StatefulJob for OldFileDeleterJobInit { Ok(()) => { /* Everything is awesome! 
*/ } Err(e) if e.kind() == io::ErrorKind::NotFound => { warn!( - "File not found in the file system, will remove from database: {}", - step.full_path.display() + path = %step.full_path.display(), + "File not found in the file system, will remove from database;", ); + sync.write_op( db, sync.shared_delete(prisma_sync::file_path::SyncId { diff --git a/core/src/object/fs/old_erase.rs b/core/src/object/fs/old_erase.rs index e7fa68516..d04970590 100644 --- a/core/src/object/fs/old_erase.rs +++ b/core/src/object/fs/old_erase.rs @@ -155,9 +155,9 @@ impl StatefulJob for OldFileEraserJobInit { // .len(); trace!( - "Overwriting file: {} with {} passes", - step.full_path.display(), - init.passes + path = %step.full_path.display(), + passes = init.passes, + "Overwriting file;", ); // TODO: File is only being truncated and not actually erased, diff --git a/core/src/object/media/exif_metadata_extractor.rs b/core/src/object/media/exif_metadata_extractor.rs deleted file mode 100644 index 96a815c80..000000000 --- a/core/src/object/media/exif_metadata_extractor.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::old_job::JobRunErrors; - -use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_media_processor; - -use sd_file_ext::extensions::{Extension, ImageExtension, ALL_IMAGE_EXTENSIONS}; -use sd_media_metadata::ExifMetadata; -use sd_prisma::prisma::{exif_data, location, PrismaClient}; - -use std::{collections::HashSet, path::Path}; - -use futures_concurrency::future::Join; -use once_cell::sync::Lazy; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tracing::error; - -use super::exif_data_image_to_query; - -#[derive(Error, Debug)] -pub enum ExifDataError { - // Internal errors - #[error("database error: {0}")] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - MediaData(#[from] sd_media_metadata::Error), -} - -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct OldExifDataExtractorMetadata { - pub extracted: u32, - pub skipped: u32, -} - -pub(super) static FILTERED_IMAGE_EXTENSIONS: Lazy> = Lazy::new(|| { - ALL_IMAGE_EXTENSIONS - .iter() - .cloned() - .filter(can_extract_exif_data_for_image) - .map(Extension::Image) - .collect() -}); - -pub const fn can_extract_exif_data_for_image(image_extension: &ImageExtension) -> bool { - use ImageExtension::*; - matches!( - image_extension, - Tiff | Dng | Jpeg | Jpg | Heif | Heifs | Heic | Avif | Avcs | Avci | Hif | Png | Webp - ) -} - -pub async fn extract_exif_data( - path: impl AsRef + Send, -) -> Result, ExifDataError> { - ExifMetadata::from_path(path).await.map_err(Into::into) -} - -pub async fn process( - files_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: impl AsRef, - db: &PrismaClient, - ctx_update_fn: &impl Fn(usize), -) -> Result<(OldExifDataExtractorMetadata, JobRunErrors), ExifDataError> { - let mut run_metadata = OldExifDataExtractorMetadata::default(); - if files_paths.is_empty() { - return Ok((run_metadata, JobRunErrors::default())); - } - - let location_path = location_path.as_ref(); - - let objects_already_with_exif_data = db - .exif_data() - .find_many(vec![exif_data::object_id::in_vec( - files_paths - .iter() - .filter_map(|file_path| file_path.object_id) - .collect(), - )]) - .select(exif_data::select!({ object_id })) - .exec() - .await?; - - if files_paths.len() == objects_already_with_exif_data.len() { - // All files already have media data, skipping - run_metadata.skipped = files_paths.len() as u32; - 
return Ok((run_metadata, JobRunErrors::default())); - } - - let objects_already_with_exif_data = objects_already_with_exif_data - .into_iter() - .map(|exif_data| exif_data.object_id) - .collect::>(); - - run_metadata.skipped = objects_already_with_exif_data.len() as u32; - - let (exif_datas, errors) = { - let maybe_exif_data = files_paths - .iter() - .enumerate() - .filter_map(|(idx, file_path)| { - file_path.object_id.and_then(|object_id| { - (!objects_already_with_exif_data.contains(&object_id)) - .then_some((idx, file_path, object_id)) - }) - }) - .filter_map(|(idx, file_path, object_id)| { - IsolatedFilePathData::try_from((location_id, file_path)) - .map_err(|e| error!("{e:#?}")) - .ok() - .map(|iso_file_path| (idx, location_path.join(iso_file_path), object_id)) - }) - .map(|(idx, path, object_id)| async move { - let res = extract_exif_data(&path).await; - ctx_update_fn(idx + 1); - (res, path, object_id) - }) - .collect::>() - .join() - .await; - - let total_exif_data = maybe_exif_data.len(); - - maybe_exif_data.into_iter().fold( - // In the good case, all exif data were extracted - (Vec::with_capacity(total_exif_data), Vec::new()), - |(mut exif_datas, mut errors), (maybe_exif_data, path, object_id)| { - match maybe_exif_data { - Ok(Some(exif_data)) => exif_datas.push((exif_data, object_id)), - Ok(None) => { - // No exif data on path, skipping - run_metadata.skipped += 1; - } - Err(e) => errors.push((e, path)), - } - (exif_datas, errors) - }, - ) - }; - - let created = db - .exif_data() - .create_many( - exif_datas - .into_iter() - .map(|(exif_data, object_id)| exif_data_image_to_query(exif_data, object_id)) - .collect(), - ) - .skip_duplicates() - .exec() - .await?; - - run_metadata.extracted = created as u32; - run_metadata.skipped += errors.len() as u32; - - Ok(( - run_metadata, - errors - .into_iter() - .map(|(e, path)| format!("Couldn't process file: \"{}\"; Error: {e}", path.display())) - .collect::>() - .into(), - )) -} diff --git a/core/src/object/media/ffmpeg_metadata_extractor.rs b/core/src/object/media/ffmpeg_metadata_extractor.rs deleted file mode 100644 index 754098ecc..000000000 --- a/core/src/object/media/ffmpeg_metadata_extractor.rs +++ /dev/null @@ -1,660 +0,0 @@ -use crate::old_job::JobRunErrors; - -use prisma_client_rust::QueryError; -use sd_core_file_path_helper::IsolatedFilePathData; -use sd_core_prisma_helpers::file_path_for_media_processor; - -use sd_file_ext::extensions::{ - AudioExtension, Extension, VideoExtension, ALL_AUDIO_EXTENSIONS, ALL_VIDEO_EXTENSIONS, -}; -use sd_media_metadata::{ - ffmpeg::{ - audio_props::AudioProps, - chapter::Chapter, - codec::{Codec, Props}, - metadata::Metadata, - program::Program, - stream::Stream, - video_props::VideoProps, - }, - FFmpegMetadata, -}; -use sd_prisma::prisma::{ - ffmpeg_data, ffmpeg_media_audio_props, ffmpeg_media_chapter, ffmpeg_media_codec, - ffmpeg_media_program, ffmpeg_media_stream, ffmpeg_media_video_props, location, object, - PrismaClient, -}; -use sd_utils::db::ffmpeg_data_field_to_db; - -use std::{ - collections::{HashMap, HashSet}, - path::Path, -}; - -use futures_concurrency::future::{Join, TryJoin}; -use once_cell::sync::Lazy; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tracing::error; - -#[derive(Error, Debug)] -pub enum FFmpegDataError { - // Internal errors - #[error("database error: {0}")] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - MediaData(#[from] sd_media_metadata::Error), -} - -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct 
OldFFmpegDataExtractorMetadata { - pub extracted: u32, - pub skipped: u32, -} - -pub(super) static FILTERED_AUDIO_AND_VIDEO_EXTENSIONS: Lazy> = Lazy::new(|| { - ALL_AUDIO_EXTENSIONS - .iter() - .copied() - .filter(can_extract_ffmpeg_data_for_audio) - .map(Extension::Audio) - .chain( - ALL_VIDEO_EXTENSIONS - .iter() - .copied() - .filter(can_extract_ffmpeg_data_for_video) - .map(Extension::Video), - ) - .collect() -}); - -pub const fn can_extract_ffmpeg_data_for_audio(audio_extension: &AudioExtension) -> bool { - use AudioExtension::*; - // TODO: Remove from here any extension which ffmpeg can't extract metadata from - matches!( - audio_extension, - Mp3 | Mp2 - | M4a | Wav | Aiff - | Aif | Flac | Ogg - | Oga | Opus | Wma - | Amr | Aac | Wv - | Voc | Tta | Loas - | Caf | Aptx | Adts - | Ast | Mid - ) -} - -pub const fn can_extract_ffmpeg_data_for_video(video_extension: &VideoExtension) -> bool { - use VideoExtension::*; - // TODO: Remove from here any extension which ffmpeg can't extract metadata from - matches!( - video_extension, - Avi | Avifs - | Qt | Mov | Swf - | Mjpeg | Ts | Mts - | Mpeg | Mxf | M2v - | Mpg | Mpe | M2ts - | Flv | Wm | _3gp - | M4v | Wmv | Asf - | Mp4 | Webm | Mkv - | Vob | Ogv | Wtv - | Hevc | F4v - ) -} - -pub async fn extract_ffmpeg_data( - path: impl AsRef + Send, -) -> Result { - FFmpegMetadata::from_path(path).await.map_err(Into::into) -} - -pub async fn process( - files_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: impl AsRef + Send, - db: &PrismaClient, - ctx_update_fn: &impl Fn(usize), -) -> Result<(OldFFmpegDataExtractorMetadata, JobRunErrors), FFmpegDataError> { - let mut run_metadata = OldFFmpegDataExtractorMetadata::default(); - if files_paths.is_empty() { - return Ok((run_metadata, JobRunErrors::default())); - } - - let location_path = location_path.as_ref(); - - let objects_already_with_ffmpeg_data = db - .ffmpeg_data() - .find_many(vec![ffmpeg_data::object_id::in_vec( - files_paths - .iter() - .filter_map(|file_path| file_path.object_id) - .collect(), - )]) - .select(ffmpeg_data::select!({ object_id })) - .exec() - .await?; - - if files_paths.len() == objects_already_with_ffmpeg_data.len() { - // All files already have media data, skipping - run_metadata.skipped = files_paths.len() as u32; - return Ok((run_metadata, JobRunErrors::default())); - } - - let objects_already_with_ffmpeg_data = objects_already_with_ffmpeg_data - .into_iter() - .map(|ffmpeg_data| ffmpeg_data.object_id) - .collect::>(); - - run_metadata.skipped = objects_already_with_ffmpeg_data.len() as u32; - - let mut errors = vec![]; - - let ffmpeg_datas = files_paths - .iter() - .enumerate() - .filter_map(|(idx, file_path)| { - file_path.object_id.and_then(|object_id| { - (!objects_already_with_ffmpeg_data.contains(&object_id)) - .then_some((idx, file_path, object_id)) - }) - }) - .filter_map(|(idx, file_path, object_id)| { - IsolatedFilePathData::try_from((location_id, file_path)) - .map_err(|e| error!("{e:#?}")) - .ok() - .map(|iso_file_path| (idx, location_path.join(iso_file_path), object_id)) - }) - .map(|(idx, path, object_id)| async move { - let res = extract_ffmpeg_data(&path).await; - ctx_update_fn(idx + 1); - (res, path, object_id) - }) - .collect::>() - .join() - .await - .into_iter() - .filter_map(|(res, path, object_id)| { - res.map(|ffmpeg_data| (ffmpeg_data, object_id)) - .map_err(|e| errors.push((e, path))) - .ok() - }) - .collect::>(); - - let created = save_ffmpeg_data(ffmpeg_datas, db).await?; - - run_metadata.extracted 
= created as u32; - run_metadata.skipped += errors.len() as u32; - - Ok(( - run_metadata, - errors - .into_iter() - .map(|(e, path)| format!("Couldn't process file: \"{}\"; Error: {e}", path.display())) - .collect::>() - .into(), - )) -} - -pub async fn save_ffmpeg_data( - ffmpeg_datas: impl IntoIterator, - db: &PrismaClient, -) -> Result { - ffmpeg_datas - .into_iter() - .map( - move |( - FFmpegMetadata { - formats, - duration, - start_time, - bit_rate, - chapters, - programs, - metadata, - }, - object_id, - )| { - db._transaction() - .with_timeout(30 * 1000) - .run(move |db| async move { - let data_id = create_ffmpeg_data( - formats, bit_rate, duration, start_time, metadata, object_id, &db, - ) - .await?; - - create_ffmpeg_chapters(data_id, chapters, &db).await?; - - let streams = create_ffmpeg_programs(data_id, programs, &db).await?; - - let codecs = create_ffmpeg_streams(data_id, streams, &db).await?; - - let (audio_props, video_props) = - create_ffmpeg_codecs(data_id, codecs, &db).await?; - - ( - create_ffmpeg_audio_props(audio_props, &db), - create_ffmpeg_video_props(video_props, &db), - ) - .try_join() - .await - .map(|_| ()) - }) - }, - ) - .collect::>() - .try_join() - .await - .map(|created| created.len() as u32) -} - -async fn create_ffmpeg_data( - formats: Vec, - bit_rate: (i32, u32), - duration: Option<(i32, u32)>, - start_time: Option<(i32, u32)>, - metadata: Metadata, - object_id: i32, - db: &PrismaClient, -) -> Result { - db.ffmpeg_data() - .create( - formats.join(","), - ffmpeg_data_field_to_db((bit_rate.0 as i64) << 32 | bit_rate.1 as i64), - object::id::equals(object_id), - vec![ - ffmpeg_data::duration::set( - duration.map(|(a, b)| ffmpeg_data_field_to_db((a as i64) << 32 | b as i64)), - ), - ffmpeg_data::start_time::set( - start_time.map(|(a, b)| ffmpeg_data_field_to_db((a as i64) << 32 | b as i64)), - ), - ffmpeg_data::metadata::set( - serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegData metadata: {err:#?}"); - err - }) - .ok(), - ), - ], - ) - .select(ffmpeg_data::select!({ id })) - .exec() - .await - .map(|data| data.id) -} - -async fn create_ffmpeg_chapters( - ffmpeg_data_id: ffmpeg_data::id::Type, - chapters: Vec, - db: &PrismaClient, -) -> Result<(), QueryError> { - db.ffmpeg_media_chapter() - .create_many( - chapters - .into_iter() - .map( - |Chapter { - id: chapter_id, - start: (start_high, start_low), - end: (end_high, end_low), - time_base_den, - time_base_num, - metadata, - }| ffmpeg_media_chapter::CreateUnchecked { - chapter_id, - start: ffmpeg_data_field_to_db( - (start_high as i64) << 32 | start_low as i64, - ), - end: ffmpeg_data_field_to_db((end_high as i64) << 32 | end_low as i64), - time_base_den, - time_base_num, - ffmpeg_data_id, - _params: vec![ffmpeg_media_chapter::metadata::set( - serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegMediaChapter metadata: {err:#?}"); - err - }) - .ok(), - )], - }, - ) - .collect(), - ) - .exec() - .await - .map(|_| ()) -} - -async fn create_ffmpeg_programs( - data_id: i32, - programs: Vec, - db: &PrismaClient, -) -> Result)>, QueryError> { - let (creates, streams_by_program_id) = - programs - .into_iter() - .map( - |Program { - id: program_id, - name, - metadata, - streams, - }| { - ( - ffmpeg_media_program::CreateUnchecked { - program_id, - ffmpeg_data_id: data_id, - _params: vec![ - ffmpeg_media_program::name::set(name.clone()), - ffmpeg_media_program::metadata::set( - serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading 
FFmpegMediaProgram metadata: {err:#?}"); - err - }) - .ok(), - ), - ], - }, - (program_id, streams), - ) - }, - ) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - db.ffmpeg_media_program() - .create_many(creates) - .exec() - .await - .map(|_| streams_by_program_id) -} - -async fn create_ffmpeg_streams( - ffmpeg_data_id: ffmpeg_data::id::Type, - streams: Vec<(ffmpeg_media_program::program_id::Type, Vec)>, - db: &PrismaClient, -) -> Result< - Vec<( - ffmpeg_media_program::program_id::Type, - ffmpeg_media_stream::stream_id::Type, - Codec, - )>, - QueryError, -> { - let (creates, maybe_codecs) = streams - .into_iter() - .flat_map(|(program_id, streams)| { - streams.into_iter().map( - move |Stream { - id: stream_id, - name, - codec: maybe_codec, - aspect_ratio_num, - aspect_ratio_den, - frames_per_second_num, - frames_per_second_den, - time_base_real_den, - time_base_real_num, - dispositions, - metadata, - }| { - ( - ffmpeg_media_stream::CreateUnchecked { - stream_id, - aspect_ratio_num, - aspect_ratio_den, - frames_per_second_num, - frames_per_second_den, - time_base_real_den, - time_base_real_num, - program_id, - ffmpeg_data_id, - _params: vec![ - ffmpeg_media_stream::name::set(name), - ffmpeg_media_stream::dispositions::set( - (!dispositions.is_empty()).then_some(dispositions.join(",")), - ), - ffmpeg_media_stream::title::set(metadata.title.clone()), - ffmpeg_media_stream::encoder::set(metadata.encoder.clone()), - ffmpeg_media_stream::language::set(metadata.language.clone()), - ffmpeg_media_stream::metadata::set( - serde_json::to_vec(&metadata) - .map_err(|err| { - error!("Error reading FFmpegMediaStream metadata: {err:#?}"); - err - }) - .ok(), - ), - ], - }, - maybe_codec.map(|codec| (program_id, stream_id, codec)), - ) - }, - ) - }) - .unzip::<_, _, Vec<_>, Vec<_>>(); - - db.ffmpeg_media_stream() - .create_many(creates) - .exec() - .await - .map(|_| maybe_codecs.into_iter().flatten().collect()) -} - -async fn create_ffmpeg_codecs( - ffmpeg_data_id: ffmpeg_data::id::Type, - codecs: Vec<( - ffmpeg_media_program::program_id::Type, - ffmpeg_media_stream::stream_id::Type, - Codec, - )>, - db: &PrismaClient, -) -> Result< - ( - Vec<(ffmpeg_media_codec::id::Type, AudioProps)>, - Vec<(ffmpeg_media_codec::id::Type, VideoProps)>, - ), - QueryError, -> { - let expected_creates = codecs.len(); - - let (creates, mut audio_props, mut video_props) = codecs.into_iter().enumerate().fold( - ( - Vec::with_capacity(expected_creates), - HashMap::with_capacity(expected_creates), - HashMap::with_capacity(expected_creates), - ), - |(mut creates, mut audio_props, mut video_props), - ( - idx, - ( - program_id, - stream_id, - Codec { - kind, - sub_kind, - tag, - name, - profile, - bit_rate, - props: maybe_props, - }, - ), - )| { - creates.push(ffmpeg_media_codec::CreateUnchecked { - bit_rate, - stream_id, - program_id, - ffmpeg_data_id, - _params: vec![ - ffmpeg_media_codec::kind::set(kind), - ffmpeg_media_codec::sub_kind::set(sub_kind), - ffmpeg_media_codec::tag::set(tag), - ffmpeg_media_codec::name::set(name), - ffmpeg_media_codec::profile::set(profile), - ], - }); - - if let Some(props) = maybe_props { - match props { - Props::Audio(props) => { - audio_props.insert(idx, props); - } - Props::Video(props) => { - video_props.insert(idx, props); - } - Props::Subtitle(_) => { - // We don't care about subtitles props for now :D - } - } - } - - (creates, audio_props, video_props) - }, - ); - - let created_ids = creates - .into_iter() - .map( - |ffmpeg_media_codec::CreateUnchecked { - bit_rate, - stream_id, - program_id, - 
ffmpeg_data_id, - _params, - }| { - db.ffmpeg_media_codec() - .create_unchecked(bit_rate, stream_id, program_id, ffmpeg_data_id, _params) - .select(ffmpeg_media_codec::select!({ id })) - .exec() - }, - ) - .collect::>() - .try_join() - .await?; - - assert_eq!( - created_ids.len(), - expected_creates, - "Not all codecs were created and our invariant is broken!" - ); - - debug_assert!( - created_ids - .windows(2) - .all(|window| window[0].id < window[1].id), - "Codecs were created in a different order than we expected, our invariant is broken!" - ); - - Ok(created_ids.into_iter().enumerate().fold( - ( - Vec::with_capacity(audio_props.len()), - Vec::with_capacity(video_props.len()), - ), - |(mut a_props, mut v_props), (idx, codec_data)| { - if let Some(audio_props) = audio_props.remove(&idx) { - a_props.push((codec_data.id, audio_props)); - } else if let Some(video_props) = video_props.remove(&idx) { - v_props.push((codec_data.id, video_props)); - } - - (a_props, v_props) - }, - )) -} - -async fn create_ffmpeg_audio_props( - audio_props: Vec<(ffmpeg_media_codec::id::Type, AudioProps)>, - db: &PrismaClient, -) -> Result<(), QueryError> { - db.ffmpeg_media_audio_props() - .create_many( - audio_props - .into_iter() - .map( - |( - codec_id, - AudioProps { - delay, - padding, - sample_rate, - sample_format, - bit_per_sample, - channel_layout, - }, - )| ffmpeg_media_audio_props::CreateUnchecked { - delay, - padding, - codec_id, - _params: vec![ - ffmpeg_media_audio_props::sample_rate::set(sample_rate), - ffmpeg_media_audio_props::sample_format::set(sample_format), - ffmpeg_media_audio_props::bit_per_sample::set(bit_per_sample), - ffmpeg_media_audio_props::channel_layout::set(channel_layout), - ], - }, - ) - .collect(), - ) - .exec() - .await - .map(|_| ()) -} - -async fn create_ffmpeg_video_props( - video_props: Vec<(ffmpeg_media_codec::id::Type, VideoProps)>, - db: &PrismaClient, -) -> Result<(), QueryError> { - db.ffmpeg_media_video_props() - .create_many( - video_props - .into_iter() - .map( - |( - codec_id, - VideoProps { - pixel_format, - color_range, - bits_per_channel, - color_space, - color_primaries, - color_transfer, - field_order, - chroma_location, - width, - height, - aspect_ratio_num, - aspect_ratio_den, - properties, - }, - )| { - ffmpeg_media_video_props::CreateUnchecked { - width, - height, - codec_id, - _params: vec![ - ffmpeg_media_video_props::pixel_format::set(pixel_format), - ffmpeg_media_video_props::color_range::set(color_range), - ffmpeg_media_video_props::bits_per_channel::set(bits_per_channel), - ffmpeg_media_video_props::color_space::set(color_space), - ffmpeg_media_video_props::color_primaries::set(color_primaries), - ffmpeg_media_video_props::color_transfer::set(color_transfer), - ffmpeg_media_video_props::field_order::set(field_order), - ffmpeg_media_video_props::chroma_location::set(chroma_location), - ffmpeg_media_video_props::aspect_ratio_num::set(aspect_ratio_num), - ffmpeg_media_video_props::aspect_ratio_den::set(aspect_ratio_den), - ffmpeg_media_video_props::properties::set(Some( - properties.join(","), - )), - ], - } - }, - ) - .collect(), - ) - .exec() - .await - .map(|_| ()) -} diff --git a/core/src/object/media/mod.rs b/core/src/object/media/mod.rs deleted file mode 100644 index 271abe873..000000000 --- a/core/src/object/media/mod.rs +++ /dev/null @@ -1,278 +0,0 @@ -use sd_core_prisma_helpers::object_with_media_data; -use sd_media_metadata::{ - ffmpeg::{ - audio_props::AudioProps, - chapter::Chapter, - codec::{Codec, Props}, - program::Program, - 
stream::Stream, - video_props::VideoProps, - }, - ExifMetadata, FFmpegMetadata, -}; -use sd_prisma::prisma::{ - exif_data::*, ffmpeg_media_audio_props, ffmpeg_media_chapter, ffmpeg_media_video_props, -}; - -pub mod exif_metadata_extractor; -pub mod ffmpeg_metadata_extractor; -pub mod old_media_processor; -pub mod old_thumbnail; - -pub use old_media_processor::OldMediaProcessorJobInit; -use sd_utils::db::ffmpeg_data_field_from_db; - -pub fn exif_data_image_to_query(mdi: ExifMetadata, object_id: object_id::Type) -> CreateUnchecked { - CreateUnchecked { - object_id, - _params: vec![ - camera_data::set(serde_json::to_vec(&mdi.camera_data).ok()), - media_date::set(serde_json::to_vec(&mdi.date_taken).ok()), - resolution::set(serde_json::to_vec(&mdi.resolution).ok()), - media_location::set(serde_json::to_vec(&mdi.location).ok()), - artist::set(mdi.artist), - description::set(mdi.description), - copyright::set(mdi.copyright), - exif_version::set(mdi.exif_version), - epoch_time::set(mdi.date_taken.map(|x| x.unix_timestamp())), - ], - } -} - -pub fn exif_data_image_to_query_params( - mdi: ExifMetadata, -) -> (Vec<(&'static str, rmpv::Value)>, Vec) { - use sd_sync::option_sync_db_entry; - use sd_utils::chain_optional_iter; - - chain_optional_iter( - [], - [ - option_sync_db_entry!(serde_json::to_vec(&mdi.camera_data).ok(), camera_data), - option_sync_db_entry!(serde_json::to_vec(&mdi.date_taken).ok(), media_date), - option_sync_db_entry!(serde_json::to_vec(&mdi.location).ok(), media_location), - option_sync_db_entry!(mdi.artist, artist), - option_sync_db_entry!(mdi.description, description), - option_sync_db_entry!(mdi.copyright, copyright), - option_sync_db_entry!(mdi.exif_version, exif_version), - ], - ) - .into_iter() - .unzip() -} - -pub fn exif_media_data_from_prisma_data(data: sd_prisma::prisma::exif_data::Data) -> ExifMetadata { - ExifMetadata { - camera_data: from_slice_option_to_option(data.camera_data).unwrap_or_default(), - date_taken: from_slice_option_to_option(data.media_date).unwrap_or_default(), - resolution: from_slice_option_to_option(data.resolution).unwrap_or_default(), - location: from_slice_option_to_option(data.media_location), - artist: data.artist, - description: data.description, - copyright: data.copyright, - exif_version: data.exif_version, - } -} - -pub fn ffmpeg_data_from_prisma_data( - object_with_media_data::ffmpeg_data::Data { - formats, - duration, - start_time, - bit_rate, - metadata, - chapters, - programs, - .. - }: object_with_media_data::ffmpeg_data::Data, -) -> FFmpegMetadata { - FFmpegMetadata { - formats: formats.split(',').map(String::from).collect::>(), - duration: duration.map(|duration| { - let duration = ffmpeg_data_field_from_db(&duration); - ((duration >> 32) as i32, duration as u32) - }), - start_time: start_time.map(|start_time| { - let start_time = ffmpeg_data_field_from_db(&start_time); - ((start_time >> 32) as i32, start_time as u32) - }), - bit_rate: { - let bit_rate = ffmpeg_data_field_from_db(&bit_rate); - ((bit_rate >> 32) as i32, bit_rate as u32) - }, - chapters: chapters - .into_iter() - .map( - |ffmpeg_media_chapter::Data { - chapter_id, - start, - end, - time_base_den, - time_base_num, - metadata, - .. 
- }| Chapter { - id: chapter_id, - start: { - let start = ffmpeg_data_field_from_db(&start); - ((start >> 32) as i32, start as u32) - }, - end: { - let end = ffmpeg_data_field_from_db(&end); - ((end >> 32) as i32, end as u32) - }, - time_base_den, - time_base_num, - metadata: from_slice_option_to_option(metadata).unwrap_or_default(), - }, - ) - .collect(), - programs: programs - .into_iter() - .map( - |object_with_media_data::ffmpeg_data::programs::Data { - program_id, - name, - metadata, - streams, - .. - }| Program { - id: program_id, - name, - streams: streams - .into_iter() - .map( - |object_with_media_data::ffmpeg_data::programs::streams::Data { - stream_id, - name, - aspect_ratio_num, - aspect_ratio_den, - frames_per_second_num, - frames_per_second_den, - time_base_real_den, - time_base_real_num, - dispositions, - metadata, - codec, - .. - }| { - Stream { - id: stream_id, - name, - codec: codec.map( - |object_with_media_data::ffmpeg_data::programs::streams::codec::Data{ - kind, - sub_kind, - tag, - name, - profile, - bit_rate, - audio_props, - video_props, - .. - }| Codec { - kind, - sub_kind, - tag, - name, - profile, - bit_rate, - props: match (audio_props, video_props) { - ( - Some(ffmpeg_media_audio_props::Data { - delay, - padding, - sample_rate, - sample_format, - bit_per_sample, - channel_layout, - .. - }), - None, - ) => Some(Props::Audio(AudioProps { - delay, - padding, - sample_rate, - sample_format, - bit_per_sample, - channel_layout, - })), - ( - None, - Some(ffmpeg_media_video_props::Data { - pixel_format, - color_range, - bits_per_channel, - color_space, - color_primaries, - color_transfer, - field_order, - chroma_location, - width, - height, - aspect_ratio_num, - aspect_ratio_den, - properties, - .. - }), - ) => Some(Props::Video(VideoProps { - pixel_format, - color_range, - bits_per_channel, - color_space, - color_primaries, - color_transfer, - field_order, - chroma_location, - width, - height, - aspect_ratio_num, - aspect_ratio_den, - properties: properties - .map(|dispositions| { - dispositions - .split(',') - .map(String::from) - .collect::>() - }) - .unwrap_or_default(), - })), - _ => None, - }, - } - ), - aspect_ratio_num, - aspect_ratio_den, - frames_per_second_num, - frames_per_second_den, - time_base_real_den, - time_base_real_num, - dispositions: dispositions - .map(|dispositions| { - dispositions - .split(',') - .map(String::from) - .collect::>() - }) - .unwrap_or_default(), - metadata: from_slice_option_to_option(metadata).unwrap_or_default(), - } - }, - ) - .collect(), - metadata: from_slice_option_to_option(metadata).unwrap_or_default(), - }, - ) - .collect(), - metadata: from_slice_option_to_option(metadata).unwrap_or_default(), - } -} - -#[must_use] -fn from_slice_option_to_option( - value: Option>, -) -> Option { - value - .map(|x| serde_json::from_slice(&x).ok()) - .unwrap_or_default() -} diff --git a/core/src/object/media/old_media_processor/job.rs b/core/src/object/media/old_media_processor/job.rs deleted file mode 100644 index 938c21658..000000000 --- a/core/src/object/media/old_media_processor/job.rs +++ /dev/null @@ -1,679 +0,0 @@ -use crate::{ - invalidate_query, - library::Library, - location::ScanState, - object::media::ffmpeg_metadata_extractor, - old_job::{ - CurrentStep, JobError, JobInitOutput, JobReportUpdate, JobResult, JobStepOutput, - StatefulJob, WorkerContext, - }, - Node, -}; - -#[cfg(feature = "ai")] -use crate::old_job::JobRunErrors; - -use sd_core_file_path_helper::{ - ensure_file_path_exists, ensure_sub_path_is_directory, 
ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_prisma_helpers::file_path_for_media_processor; - -use sd_file_ext::extensions::Extension; -use sd_prisma::prisma::{location, PrismaClient}; -use sd_utils::db::maybe_missing; - -#[cfg(feature = "ai")] -use sd_ai::old_image_labeler::{BatchToken as ImageLabelerBatchToken, LabelerOutput}; - -#[cfg(feature = "ai")] -use std::sync::Arc; - -use std::{ - hash::Hash, - path::{Path, PathBuf}, - pin::pin, - time::Duration, -}; - -use async_channel as chan; -use futures::StreamExt; -use itertools::Itertools; -use prisma_client_rust::{raw, PrismaValue}; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use tokio::time::sleep; -use tracing::{debug, error, info, trace, warn}; - -use super::{ - exif_metadata_extractor, - old_thumbnail::{self, GenerateThumbnailArgs}, - process_audio_and_video, process_images, BatchToProcess, MediaProcessorError, - OldMediaProcessorMetadata, -}; - -const BATCH_SIZE: usize = 10; - -#[derive(Serialize, Deserialize, Debug)] -pub struct OldMediaProcessorJobInit { - pub location: location::Data, - pub sub_path: Option, - pub regenerate_thumbnails: bool, - pub regenerate_labels: bool, -} - -impl Hash for OldMediaProcessorJobInit { - fn hash(&self, state: &mut H) { - self.location.id.hash(state); - if let Some(ref sub_path) = self.sub_path { - sub_path.hash(state); - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct OldMediaProcessorJobData { - location_path: PathBuf, - to_process_path: PathBuf, - #[serde(skip, default)] - maybe_thumbnailer_progress_rx: Option>, - #[cfg(feature = "ai")] - labeler_batch_token: ImageLabelerBatchToken, - #[cfg(feature = "ai")] - #[serde(skip, default)] - maybe_labels_rx: Option>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub enum OldMediaProcessorJobStep { - ExtractImageMediaData(Vec), - ExtractAudioAndVideoMediaData(Vec), - WaitThumbnails(usize), - #[cfg(feature = "ai")] - WaitLabels(usize), -} - -#[async_trait::async_trait] -impl StatefulJob for OldMediaProcessorJobInit { - type Data = OldMediaProcessorJobData; - type Step = OldMediaProcessorJobStep; - type RunMetadata = OldMediaProcessorMetadata; - - const NAME: &'static str = "media_processor"; - const IS_BATCHED: bool = true; - - fn target_location(&self) -> location::id::Type { - self.location.id - } - - async fn init( - &self, - ctx: &WorkerContext, - data: &mut Option, - ) -> Result, JobError> { - let Library { - db, - #[cfg(feature = "ai")] - sync, - .. 
- } = ctx.library.as_ref(); - - let location_id = self.location.id; - let location_path = - maybe_missing(&self.location.path, "location.path").map(PathBuf::from)?; - - let (to_process_path, iso_file_path) = match &self.sub_path { - Some(sub_path) if sub_path != Path::new("") => { - let full_path = ensure_sub_path_is_in_location(&location_path, sub_path) - .await - .map_err(MediaProcessorError::from)?; - ensure_sub_path_is_directory(&location_path, sub_path) - .await - .map_err(MediaProcessorError::from)?; - - let sub_iso_file_path = - IsolatedFilePathData::new(location_id, &location_path, &full_path, true) - .map_err(MediaProcessorError::from)?; - - ensure_file_path_exists( - sub_path, - &sub_iso_file_path, - db, - MediaProcessorError::SubPathNotFound, - ) - .await?; - - (full_path, sub_iso_file_path) - } - _ => ( - location_path.to_path_buf(), - IsolatedFilePathData::new(location_id, &location_path, &location_path, true) - .map_err(MediaProcessorError::from)?, - ), - }; - - debug!( - "Searching for media files in location {location_id} at directory \"{iso_file_path}\"" - ); - - let thumbs_to_process_count = dispatch_thumbnails_for_processing( - location_id, - &location_path, - &iso_file_path, - &ctx.library, - &ctx.node, - self.regenerate_thumbnails, - ) - .await?; - - let maybe_thumbnailer_progress_rx = if thumbs_to_process_count > 0 { - let (progress_tx, progress_rx) = chan::unbounded(); - - ctx.node - .thumbnailer - .register_reporter(location_id, progress_tx) - .await; - - Some(progress_rx) - } else { - None - }; - - let file_paths_to_extract_exif_data = - get_files_for_image_media_data_extraction(db, &iso_file_path).await?; - let file_paths_to_extract_ffmpeg_data = - get_files_for_audio_and_video_media_data_extraction(db, &iso_file_path).await?; - - #[cfg(feature = "ai")] - let file_paths_for_labeling = - get_files_for_labeling(db, &iso_file_path, self.regenerate_labels).await?; - - #[cfg(feature = "ai")] - let total_files_for_labeling = file_paths_for_labeling.len(); - - #[cfg(feature = "ai")] - let (labeler_batch_token, labels_rx) = - if let Some(image_labeller) = ctx.node.old_image_labeller.as_ref() { - let (labeler_batch_token, labels_rx) = image_labeller - .new_resumable_batch( - location_id, - location_path.clone(), - file_paths_for_labeling, - Arc::clone(db), - sync.clone(), - ) - .await; - (labeler_batch_token, Some(labels_rx)) - } else { - (uuid::Uuid::new_v4(), None) - }; - - let total_files = - file_paths_to_extract_exif_data.len() + file_paths_to_extract_ffmpeg_data.len(); - - let chunked_files = file_paths_to_extract_exif_data - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| chunk.collect::>()) - .map(OldMediaProcessorJobStep::ExtractImageMediaData) - .chain( - file_paths_to_extract_ffmpeg_data - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(|chunk| chunk.collect::>()) - .map(OldMediaProcessorJobStep::ExtractAudioAndVideoMediaData), - ) - .chain( - [(thumbs_to_process_count > 0).then_some( - OldMediaProcessorJobStep::WaitThumbnails(thumbs_to_process_count as usize), - )] - .into_iter() - .flatten(), - ) - .chain( - [ - #[cfg(feature = "ai")] - { - (total_files_for_labeling > 0).then_some( - OldMediaProcessorJobStep::WaitLabels(total_files_for_labeling), - ) - }, - #[cfg(not(feature = "ai"))] - { - None - }, - ] - .into_iter() - .flatten(), - ) - .collect::>(); - - ctx.progress(vec![ - JobReportUpdate::TaskCount(total_files), - JobReportUpdate::Phase("media_data".to_string()), - JobReportUpdate::Message(format!( - "Preparing to process 
{total_files} files in {} chunks", - chunked_files.len() - )), - ]); - - *data = Some(OldMediaProcessorJobData { - location_path, - to_process_path, - maybe_thumbnailer_progress_rx, - #[cfg(feature = "ai")] - labeler_batch_token, - #[cfg(feature = "ai")] - maybe_labels_rx: labels_rx, - }); - - Ok(( - Self::RunMetadata { - thumbs_processed: thumbs_to_process_count, - ..Default::default() - }, - chunked_files, - ) - .into()) - } - - async fn execute_step( - &self, - ctx: &WorkerContext, - CurrentStep { step, step_number }: CurrentStep<'_, Self::Step>, - data: &Self::Data, - _: &Self::RunMetadata, - ) -> Result, JobError> { - match step { - OldMediaProcessorJobStep::ExtractImageMediaData(file_paths) => process_images( - file_paths, - self.location.id, - &data.location_path, - &ctx.library.db, - &|completed_count| { - ctx.progress(vec![JobReportUpdate::CompletedTaskCount( - step_number * BATCH_SIZE + completed_count, - )]); - }, - ) - .await - .map(Into::into) - .map_err(Into::into), - - OldMediaProcessorJobStep::ExtractAudioAndVideoMediaData(file_paths) => { - process_audio_and_video( - file_paths, - self.location.id, - &data.location_path, - &ctx.library.db, - &|completed_count| { - ctx.progress(vec![JobReportUpdate::CompletedTaskCount( - step_number * BATCH_SIZE + completed_count, - )]); - }, - ) - .await - .map(Into::into) - .map_err(Into::into) - } - - OldMediaProcessorJobStep::WaitThumbnails(total_thumbs) => { - ctx.progress(vec![ - JobReportUpdate::TaskCount(*total_thumbs), - JobReportUpdate::Phase("thumbnails".to_string()), - JobReportUpdate::Message(format!( - "Waiting for processing of {total_thumbs} thumbnails", - )), - ]); - - let mut progress_rx = pin!(if let Some(progress_rx) = - data.maybe_thumbnailer_progress_rx.clone() - { - progress_rx - } else { - let (progress_tx, progress_rx) = chan::unbounded(); - - ctx.node - .thumbnailer - .register_reporter(self.location.id, progress_tx) - .await; - - progress_rx - }); - - let mut total_completed = 0; - - while let Some((completed, total)) = progress_rx.next().await { - trace!("Received progress update from thumbnailer: {completed}/{total}",); - ctx.progress(vec![JobReportUpdate::CompletedTaskCount( - completed as usize, - )]); - total_completed = completed; - } - - if progress_rx.is_closed() && total_completed < *total_thumbs as u32 { - warn!( - "Thumbnailer progress reporter channel closed before all thumbnails were \ - processed, job will wait a bit waiting for a shutdown signal from manager" - ); - sleep(Duration::from_secs(5)).await; - } - - Ok(None.into()) - } - - #[cfg(feature = "ai")] - OldMediaProcessorJobStep::WaitLabels(total_labels) => { - let Some(image_labeller) = ctx.node.old_image_labeller.as_ref() else { - let err = "AI system is disabled due to a previous error, skipping labels job"; - error!(err); - return Ok(JobRunErrors(vec![err.to_string()]).into()); - }; - - ctx.progress(vec![ - JobReportUpdate::TaskCount(*total_labels), - JobReportUpdate::Phase("labels".to_string()), - JobReportUpdate::Message( - format!("Extracting labels for {total_labels} files",), - ), - ]); - - let mut labels_rx = pin!(if let Some(labels_rx) = data.maybe_labels_rx.clone() { - labels_rx - } else { - match image_labeller - .resume_batch( - data.labeler_batch_token, - Arc::clone(&ctx.library.db), - ctx.library.sync.clone(), - ) - .await - { - Ok(labels_rx) => labels_rx, - Err(e) => return Ok(JobRunErrors(vec![e.to_string()]).into()), - } - }); - - let mut total_labeled = 0; - - let mut errors = Vec::new(); - - while let Some(LabelerOutput { - 
file_path_id, - has_new_labels, - result, - }) = labels_rx.next().await - { - total_labeled += 1; - ctx.progress(vec![JobReportUpdate::CompletedTaskCount(total_labeled)]); - - if let Err(e) = result { - error!( - "Failed to generate labels : {e:#?}", - file_path_id - ); - - errors.push(e.to_string()); - } else if has_new_labels { - // invalidate_query!(&ctx.library, "labels.count"); // TODO: This query doesn't exist on main yet - } - } - - invalidate_query!(&ctx.library, "labels.list"); - invalidate_query!(&ctx.library, "labels.getForObject"); - invalidate_query!(&ctx.library, "labels.getWithObjects"); - - if !errors.is_empty() { - Ok(JobRunErrors(errors).into()) - } else { - Ok(None.into()) - } - } - } - } - - async fn finalize( - &self, - ctx: &WorkerContext, - data: &Option, - run_metadata: &Self::RunMetadata, - ) -> JobResult { - info!( - "Finished media processing for location {} at {}", - self.location.id, - data.as_ref() - .expect("critical error: missing data on job state") - .to_process_path - .display() - ); - - if run_metadata.exif_data.extracted > 0 || run_metadata.ffmpeg_data.extracted > 0 { - invalidate_query!(ctx.library, "search.paths"); - } - - ctx.library - .db - .location() - .update( - location::id::equals(self.location.id), - vec![location::scan_state::set(ScanState::Completed as i32)], - ) - .exec() - .await - .map_err(MediaProcessorError::from)?; - - Ok(Some(json!({"init: ": self, "run_metadata": run_metadata}))) - } -} - -async fn dispatch_thumbnails_for_processing( - location_id: location::id::Type, - location_path: impl AsRef, - parent_iso_file_path: &IsolatedFilePathData<'_>, - library: &Library, - node: &Node, - should_regenerate: bool, -) -> Result { - let Library { db, .. } = library; - - let location_path = location_path.as_ref(); - - let mut file_paths = get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &old_thumbnail::ALL_THUMBNAILABLE_EXTENSIONS, - ) - .await?; - - if file_paths.is_empty() { - return Ok(0); - } - - let first_materialized_path = file_paths[0].materialized_path.clone(); - - // Only the first materialized_path should be processed in foreground - let different_materialized_path_idx = file_paths - .iter() - .position(|file_path| file_path.materialized_path != first_materialized_path); - - let background_thumbs_args = different_materialized_path_idx - .map(|idx| { - file_paths - .split_off(idx) - .into_iter() - .filter_map(|file_path| prepare_args(location_id, location_path, file_path)) - .collect::>() - }) - .unwrap_or_default(); - - let foreground_thumbs_args = file_paths - .into_iter() - .filter_map(|file_path| prepare_args(location_id, location_path, file_path)) - .collect::>(); - - let thumbs_count = background_thumbs_args.len() + foreground_thumbs_args.len(); - - debug!( - "Dispatching {thumbs_count} thumbnails to be processed, {} in foreground and {} in background", - foreground_thumbs_args.len(), - background_thumbs_args.len() - ); - - if !foreground_thumbs_args.is_empty() { - node.thumbnailer - .new_indexed_thumbnails_tracked_batch( - BatchToProcess::new(foreground_thumbs_args, should_regenerate, false), - library.id, - location_id, - ) - .await; - } - - if !background_thumbs_args.is_empty() { - node.thumbnailer - .new_indexed_thumbnails_tracked_batch( - BatchToProcess::new(background_thumbs_args, should_regenerate, true), - library.id, - location_id, - ) - .await; - } - - Ok(thumbs_count as u32) -} - -async fn get_files_for_image_media_data_extraction( - db: &PrismaClient, - parent_iso_file_path: 
&IsolatedFilePathData<'_>, -) -> Result, MediaProcessorError> { - get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &exif_metadata_extractor::FILTERED_IMAGE_EXTENSIONS, - ) - .await - .map_err(Into::into) -} - -async fn get_files_for_audio_and_video_media_data_extraction( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, -) -> Result, MediaProcessorError> { - get_all_children_files_by_extensions( - db, - parent_iso_file_path, - &ffmpeg_metadata_extractor::FILTERED_AUDIO_AND_VIDEO_EXTENSIONS, - ) - .await - .map_err(Into::into) -} - -#[cfg(feature = "ai")] -async fn get_files_for_labeling( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - regenerate_labels: bool, -) -> Result, MediaProcessorError> { - // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path f - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path LIKE {{}} - {} - ORDER BY materialized_path ASC", - // Ordering by materialized_path so we can prioritize processing the first files - // in the above part of the directories tree - &exif_metadata_extractor::FILTERED_IMAGE_EXTENSIONS - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(","), - if !regenerate_labels { - "AND NOT EXISTS (SELECT 1 FROM label_on_object WHERE object_id = f.object_id)" - } else { - "" - } - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String(format!( - "{}%", - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - )) - )) - .exec() - .await - .map_err(Into::into) -} - -async fn get_all_children_files_by_extensions( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - extensions: &[Extension], -) -> Result, MediaProcessorError> { - // FIXME: Had to use format! 
macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path LIKE {{}} - ORDER BY materialized_path ASC", - // Ordering by materialized_path so we can prioritize processing the first files - // in the above part of the directories tree - extensions - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(",") - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String(format!( - "{}%", - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - )) - )) - .exec() - .await - .map_err(Into::into) -} - -fn prepare_args( - location_id: location::id::Type, - location_path: &Path, // This function is only used internally once, so we can pass &Path as a parameter - file_path: file_path_for_media_processor::Data, -) -> Option { - let file_path_id = file_path.id; - - let Ok(cas_id) = maybe_missing(&file_path.cas_id, "file_path.cas_id").cloned() else { - error!("Missing cas_id for file_path "); - return None; - }; - - let Ok(iso_file_path) = IsolatedFilePathData::try_from((location_id, file_path)).map_err(|e| { - error!("Failed to extract isolated file path data from file path : {e:#?}"); - }) else { - return None; - }; - - Some(GenerateThumbnailArgs::new( - iso_file_path.extension().to_string(), - cas_id, - location_path.join(&iso_file_path), - )) -} diff --git a/core/src/object/media/old_media_processor/mod.rs b/core/src/object/media/old_media_processor/mod.rs deleted file mode 100644 index b89010ec7..000000000 --- a/core/src/object/media/old_media_processor/mod.rs +++ /dev/null @@ -1,109 +0,0 @@ -use crate::old_job::{JobRunErrors, JobRunMetadata}; - -use sd_core_file_path_helper::FilePathError; -use sd_core_prisma_helpers::file_path_for_media_processor; - -use sd_prisma::prisma::{location, PrismaClient}; - -use std::path::Path; - -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tracing::error; - -use super::{ - exif_metadata_extractor::{self, ExifDataError, OldExifDataExtractorMetadata}, - ffmpeg_metadata_extractor::{self, FFmpegDataError, OldFFmpegDataExtractorMetadata}, - old_thumbnail::{self, BatchToProcess, ThumbnailerError}, -}; - -mod job; -mod shallow; - -pub use job::OldMediaProcessorJobInit; -pub use shallow::old_shallow; - -#[derive(Error, Debug)] -pub enum MediaProcessorError { - #[error("sub path not found: ", .0.display())] - SubPathNotFound(Box), - - #[error("database error: {0}")] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - FilePath(#[from] FilePathError), - - #[error(transparent)] - Thumbnailer(#[from] ThumbnailerError), - #[error(transparent)] - ExifMediaDataExtractor(#[from] ExifDataError), - #[error(transparent)] - FFmpegDataExtractor(#[from] FFmpegDataError), -} - -#[derive(Debug, Serialize, Deserialize, Default)] -pub struct OldMediaProcessorMetadata { - exif_data: OldExifDataExtractorMetadata, - ffmpeg_data: OldFFmpegDataExtractorMetadata, - thumbs_processed: u32, - labels_extracted: u32, -} - -impl From for OldMediaProcessorMetadata { - fn from(exif_data: OldExifDataExtractorMetadata) -> Self { - Self { - exif_data, - ffmpeg_data: Default::default(), - thumbs_processed: 0, - labels_extracted: 0, - } - } -} - -impl From for 
OldMediaProcessorMetadata { - fn from(ffmpeg_data: OldFFmpegDataExtractorMetadata) -> Self { - Self { - exif_data: Default::default(), - ffmpeg_data, - thumbs_processed: 0, - labels_extracted: 0, - } - } -} - -impl JobRunMetadata for OldMediaProcessorMetadata { - fn update(&mut self, new_data: Self) { - self.exif_data.extracted += new_data.exif_data.extracted; - self.exif_data.skipped += new_data.exif_data.skipped; - self.ffmpeg_data.extracted += new_data.ffmpeg_data.extracted; - self.ffmpeg_data.skipped += new_data.ffmpeg_data.skipped; - self.thumbs_processed += new_data.thumbs_processed; - self.labels_extracted += new_data.labels_extracted; - } -} - -pub async fn process_images( - files_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: impl AsRef + Send, - db: &PrismaClient, - ctx_update_fn: &impl Fn(usize), -) -> Result<(OldMediaProcessorMetadata, JobRunErrors), MediaProcessorError> { - exif_metadata_extractor::process(files_paths, location_id, location_path, db, ctx_update_fn) - .await - .map(|(exif_extraction_metadata, errors)| (exif_extraction_metadata.into(), errors)) - .map_err(Into::into) -} - -pub async fn process_audio_and_video( - files_paths: &[file_path_for_media_processor::Data], - location_id: location::id::Type, - location_path: impl AsRef + Send, - db: &PrismaClient, - ctx_update_fn: &impl Fn(usize), -) -> Result<(OldMediaProcessorMetadata, JobRunErrors), MediaProcessorError> { - ffmpeg_metadata_extractor::process(files_paths, location_id, location_path, db, ctx_update_fn) - .await - .map(|(ffmpeg_extraction_metadata, errors)| (ffmpeg_extraction_metadata.into(), errors)) - .map_err(Into::into) -} diff --git a/core/src/object/media/old_media_processor/shallow.rs b/core/src/object/media/old_media_processor/shallow.rs deleted file mode 100644 index 12197ebb2..000000000 --- a/core/src/object/media/old_media_processor/shallow.rs +++ /dev/null @@ -1,367 +0,0 @@ -use crate::{ - invalidate_query, - library::Library, - old_job::{JobError, JobRunMetadata}, - Node, -}; - -use sd_core_file_path_helper::{ - ensure_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_prisma_helpers::file_path_for_media_processor; - -use sd_file_ext::extensions::Extension; -use sd_prisma::prisma::{location, PrismaClient}; -use sd_utils::db::maybe_missing; - -#[cfg(feature = "ai")] -use sd_ai::old_image_labeler::LabelerOutput; - -use std::path::{Path, PathBuf}; - -#[cfg(feature = "ai")] -use std::sync::Arc; - -use itertools::Itertools; -use prisma_client_rust::{raw, PrismaValue}; -use tracing::{debug, error}; - -#[cfg(feature = "ai")] -use futures::StreamExt; - -use super::{ - exif_metadata_extractor, ffmpeg_metadata_extractor, - old_thumbnail::{self, BatchToProcess, GenerateThumbnailArgs}, - MediaProcessorError, OldMediaProcessorMetadata, -}; - -const BATCH_SIZE: usize = 10; - -pub async fn old_shallow( - location: &location::Data, - sub_path: &PathBuf, - library @ Library { - db, - #[cfg(feature = "ai")] - sync, - .. 
- }: &Library, - #[cfg(feature = "ai")] regenerate_labels: bool, - node: &Node, -) -> Result<(), JobError> { - let location_id = location.id; - let location_path = maybe_missing(&location.path, "location.path").map(PathBuf::from)?; - - let iso_file_path = if sub_path != Path::new("") { - let full_path = ensure_sub_path_is_in_location(&location_path, &sub_path) - .await - .map_err(MediaProcessorError::from)?; - ensure_sub_path_is_directory(&location_path, &sub_path) - .await - .map_err(MediaProcessorError::from)?; - - let sub_iso_file_path = - IsolatedFilePathData::new(location_id, &location_path, &full_path, true) - .map_err(MediaProcessorError::from)?; - - ensure_file_path_exists( - &sub_path, - &sub_iso_file_path, - db, - MediaProcessorError::SubPathNotFound, - ) - .await?; - - sub_iso_file_path - } else { - IsolatedFilePathData::new(location_id, &location_path, &location_path, true) - .map_err(MediaProcessorError::from)? - }; - - debug!("Searching for media in location {location_id} at path {iso_file_path}"); - - dispatch_thumbnails_for_processing( - location.id, - &location_path, - &iso_file_path, - library, - node, - false, - ) - .await?; - - let file_paths_to_extract_exif_data = - get_files_for_exif_media_data_extraction(db, &iso_file_path).await?; - let file_paths_to_extract_ffmpeg_data = - get_files_for_ffmpeg_media_data_extraction(db, &iso_file_path).await?; - - #[cfg(feature = "ai")] - let file_paths_for_labelling = - get_files_for_labeling(db, &iso_file_path, regenerate_labels).await?; - - #[cfg(feature = "ai")] - let has_labels = !file_paths_for_labelling.is_empty(); - - let total_files = - file_paths_to_extract_exif_data.len() + file_paths_to_extract_ffmpeg_data.len(); - - let chunked_files_to_extract_exif_data = file_paths_to_extract_exif_data - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(Iterator::collect) - .collect::>>(); - - let chunked_files_to_extract_ffmpeg_data = file_paths_to_extract_ffmpeg_data - .into_iter() - .chunks(BATCH_SIZE) - .into_iter() - .map(Iterator::collect) - .collect::>>(); - - debug!( - "Preparing to process {total_files} files in {} chunks", - chunked_files_to_extract_exif_data.len() + chunked_files_to_extract_ffmpeg_data.len() - ); - - #[cfg(feature = "ai")] - // Check if we have an image labeller and has_labels then enqueue a new batch - let labels_rx = node.old_image_labeller.as_ref().and_then(|image_labeller| { - has_labels.then(|| { - image_labeller.new_batch( - location_id, - location_path.clone(), - file_paths_for_labelling, - Arc::clone(db), - sync.clone(), - ) - }) - }); - - let mut run_metadata = OldMediaProcessorMetadata::default(); - - for files in chunked_files_to_extract_exif_data { - let (more_run_metadata, errors) = - exif_metadata_extractor::process(&files, location.id, &location_path, db, &|_| {}) - .await - .map_err(MediaProcessorError::from)?; - - run_metadata.update(more_run_metadata.into()); - - if !errors.is_empty() { - error!("Errors processing chunk of image media data shallow extraction:\n{errors}"); - } - } - - for files in chunked_files_to_extract_ffmpeg_data { - let (more_run_metadata, errors) = - ffmpeg_metadata_extractor::process(&files, location.id, &location_path, db, &|_| {}) - .await - .map_err(MediaProcessorError::from)?; - - run_metadata.update(more_run_metadata.into()); - - if !errors.is_empty() { - error!("Errors processing chunk of audio or video media data shallow extraction:\n{errors}"); - } - } - - debug!("Media shallow processor run metadata: {run_metadata:?}"); - - if 
run_metadata.exif_data.extracted > 0 || run_metadata.ffmpeg_data.extracted > 0 { - invalidate_query!(library, "search.paths"); - invalidate_query!(library, "search.objects"); - } - - #[cfg(feature = "ai")] - { - if has_labels { - if let Some(labels_rx) = labels_rx { - labels_rx - .await - .for_each( - |LabelerOutput { - file_path_id, - has_new_labels, - result, - }| async move { - if let Err(e) = result { - error!( - "Failed to generate labels : {e:#?}" - ); - } else if has_new_labels { - // invalidate_query!(library, "labels.count"); // TODO: This query doesn't exist on main yet - } - }, - ) - .await; - - invalidate_query!(library, "labels.list"); - invalidate_query!(library, "labels.getForObject"); - invalidate_query!(library, "labels.getWithObjects"); - } - } - } - - Ok(()) -} - -async fn get_files_for_exif_media_data_extraction( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, -) -> Result, MediaProcessorError> { - get_files_by_extensions( - db, - parent_iso_file_path, - &exif_metadata_extractor::FILTERED_IMAGE_EXTENSIONS, - ) - .await - .map_err(Into::into) -} - -async fn get_files_for_ffmpeg_media_data_extraction( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, -) -> Result, MediaProcessorError> { - get_files_by_extensions( - db, - parent_iso_file_path, - &ffmpeg_metadata_extractor::FILTERED_AUDIO_AND_VIDEO_EXTENSIONS, - ) - .await - .map_err(Into::into) -} - -#[cfg(feature = "ai")] -async fn get_files_for_labeling( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - regenerate_labels: bool, -) -> Result, MediaProcessorError> { - // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path f - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path = {{}} - {}", - &exif_metadata_extractor::FILTERED_IMAGE_EXTENSIONS - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(","), - if !regenerate_labels { - "AND NOT EXISTS (SELECT 1 FROM label_on_object WHERE object_id = f.object_id)" - } else { - "" - } - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String( - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - ) - )) - .exec() - .await - .map_err(Into::into) -} - -async fn dispatch_thumbnails_for_processing( - location_id: location::id::Type, - location_path: impl AsRef, - parent_iso_file_path: &IsolatedFilePathData<'_>, - library: &Library, - node: &Node, - should_regenerate: bool, -) -> Result<(), MediaProcessorError> { - let Library { db, .. 
} = library; - - let location_path = location_path.as_ref(); - - let file_paths = get_files_by_extensions( - db, - parent_iso_file_path, - &old_thumbnail::ALL_THUMBNAILABLE_EXTENSIONS, - ) - .await?; - - let current_batch = file_paths - .into_iter() - .filter_map(|file_path| { - if let Some(cas_id) = file_path.cas_id.as_ref() { - Some((cas_id.clone(), file_path)) - } else { - error!("File path has no cas_id, skipping", file_path.id); - None - } - }) - .filter_map(|(cas_id, file_path)| { - let file_path_id = file_path.id; - IsolatedFilePathData::try_from((location_id, file_path)) - .map_err(|e| { - error!("Failed to extract isolated file path data from file path : {e:#?}"); - }) - .ok() - .map(|iso_file_path| (cas_id, iso_file_path)) - }) - .map(|(cas_id, iso_file_path)| { - let full_path = location_path.join(&iso_file_path); - - GenerateThumbnailArgs::new(iso_file_path.extension().to_string(), cas_id, full_path) - }) - .collect::>(); - - // Let's not send an empty batch lol - if !current_batch.is_empty() { - node.thumbnailer - .new_indexed_thumbnails_batch( - BatchToProcess::new(current_batch, should_regenerate, false), - library.id, - ) - .await; - } - - Ok(()) -} - -async fn get_files_by_extensions( - db: &PrismaClient, - parent_iso_file_path: &IsolatedFilePathData<'_>, - extensions: &[Extension], -) -> Result, MediaProcessorError> { - // FIXME: Had to use format! macro because PCR doesn't support IN with Vec for SQLite - // We have no data coming from the user, so this is sql injection safe - db._query_raw(raw!( - &format!( - "SELECT id, materialized_path, is_dir, name, extension, cas_id, object_id - FROM file_path - WHERE - location_id={{}} - AND cas_id IS NOT NULL - AND LOWER(extension) IN ({}) - AND materialized_path = {{}}", - extensions - .iter() - .map(|ext| format!("LOWER('{ext}')")) - .collect::>() - .join(",") - ), - PrismaValue::Int(parent_iso_file_path.location_id()), - PrismaValue::String( - parent_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory") - ) - )) - .exec() - .await - .map_err(Into::into) -} diff --git a/core/src/object/media/old_thumbnail/clean_up.rs b/core/src/object/media/old_thumbnail/clean_up.rs index 70a4c7f4d..0d65409be 100644 --- a/core/src/object/media/old_thumbnail/clean_up.rs +++ b/core/src/object/media/old_thumbnail/clean_up.rs @@ -11,6 +11,8 @@ use tracing::{debug, error}; use super::{ThumbnailerError, EPHEMERAL_DIR, WEBP_EXTENSION}; +// TODO(fogodev) Introduce a task using the new task system to clean up the thumbnails from time to time. 
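// Illustrative sketch, not part of the patch above: one way the TODO could be approached
// with a plain tokio interval until the new task system's API is available. The
// `spawn_periodic_clean_up` helper, `CLEAN_UP_PERIOD`, and the `run_clean_up` closure are
// assumptions standing in for the existing process_ephemeral_clean_up /
// process_indexed_clean_up calls and the HALF_HOUR constant in old_thumbnail/mod.rs.
use std::{future::Future, path::PathBuf, sync::Arc, time::Duration};

const CLEAN_UP_PERIOD: Duration = Duration::from_secs(30 * 60);

pub fn spawn_periodic_clean_up<F, Fut>(
	thumbnails_directory: Arc<PathBuf>,
	run_clean_up: F,
) -> tokio::task::JoinHandle<()>
where
	F: Fn(Arc<PathBuf>) -> Fut + Send + 'static,
	Fut: Future<Output = ()> + Send + 'static,
{
	tokio::spawn(async move {
		let mut ticker = tokio::time::interval(CLEAN_UP_PERIOD);
		loop {
			// The first tick completes immediately, so a clean up runs once at startup
			// and then once every CLEAN_UP_PERIOD.
			ticker.tick().await;
			run_clean_up(Arc::clone(&thumbnails_directory)).await;
		}
	})
}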
+ pub(super) async fn process_ephemeral_clean_up( thumbnails_directory: Arc, existing_ephemeral_thumbs: HashSet, @@ -51,9 +53,10 @@ pub(super) async fn process_ephemeral_clean_up( { to_remove.push(async move { debug!( - "Removing stale ephemeral thumbnail: {}", - thumb_path.display() + thumb_path = %thumb_path.display(), + "Removing stale ephemeral thumbnail;", ); + fs::remove_file(&thumb_path).await.map_err(|e| { ThumbnailerError::FileIO(FileIOError::from((thumb_path, e))) }) @@ -67,14 +70,14 @@ pub(super) async fn process_ephemeral_clean_up( }) .await .map_or_else( - |e| error!("Join error on ephemeral clean up: {e:#?}",), + |e| error!(?e, "Join error on ephemeral clean up;",), |fetching_res| { fetching_res.map_or_else( - |e| error!("Error fetching ephemeral thumbs to be removed: {e:#?}"), + |e| error!(?e, "Error fetching ephemeral thumbs to be removed;"), |remove_results| { remove_results.into_iter().for_each(|remove_res| { if let Err(e) = remove_res { - error!("Error on ephemeral clean up: {e:#?}"); + error!(?e, "Error on ephemeral clean up;"); } }) }, @@ -140,9 +143,10 @@ pub(super) async fn process_indexed_clean_up( { to_remove.push(async move { debug!( - "Removing stale indexed thumbnail: {}", - thumb_path.display() + thumb_path = %thumb_path.display(), + "Removing stale indexed thumbnail;", ); + fs::remove_file(&thumb_path).await.map_err(|e| { ThumbnailerError::FileIO(FileIOError::from((thumb_path, e))) }) @@ -161,18 +165,18 @@ pub(super) async fn process_indexed_clean_up( .into_iter() .filter_map(|join_res| { join_res - .map_err(|e| error!("Join error on indexed clean up: {e:#?}")) + .map_err(|e| error!(?e, "Join error on indexed clean up;")) .ok() }) .filter_map(|fetching_res| { fetching_res - .map_err(|e| error!("Error fetching indexed thumbs to be removed: {e:#?}")) + .map_err(|e| error!(?e, "Error fetching indexed thumbs to be removed;")) .ok() }) .for_each(|remove_results| { remove_results.into_iter().for_each(|remove_res| { if let Err(e) = remove_res { - error!("Error on indexed clean up: {e:#?}"); + error!(?e, "Error on indexed clean up;"); } }) }) diff --git a/core/src/object/media/old_thumbnail/directory.rs b/core/src/object/media/old_thumbnail/directory.rs index 38db7adf5..d6d00caca 100644 --- a/core/src/object/media/old_thumbnail/directory.rs +++ b/core/src/object/media/old_thumbnail/directory.rs @@ -27,6 +27,8 @@ use super::{ VERSION_FILE, WEBP_EXTENSION, }; +// TODO(fogodev): Move this logic to be used alongside the NodeConfig or other Node part to run at app startup + #[derive( IntEnum, Debug, Clone, Copy, Eq, PartialEq, strum::Display, Serialize_repr, Deserialize_repr, )] @@ -56,7 +58,7 @@ pub(super) async fn init_thumbnail_dir( debug!("Initializing thumbnail directory"); let thumbnails_directory = data_dir.as_ref().join(THUMBNAIL_CACHE_DIR_NAME); - debug!("Thumbnail directory: {:?}", thumbnails_directory); + debug!(thumbnails_directory = %thumbnails_directory.display()); // create thumbnails base directory fs::create_dir_all(&thumbnails_directory) @@ -89,7 +91,7 @@ pub(super) async fn init_thumbnail_dir( }; if let Err(e) = process_migration(thumbnails_directory, databases).await { - error!("Failed to migrate thumbnails: {e:#?}"); + error!(?e, "Failed to migrate thumbnails;"); } } }); @@ -133,7 +135,8 @@ async fn process_migration( } _ => { - error!("Thumbnail version is not handled: {:?}", current); + error!(current_version = ?current, "Thumbnail version is not handled;"); + Err(VersionManagerError::UnexpectedMigration { current_version: current.int_value(), 
next_version: next.int_value(), @@ -187,10 +190,7 @@ async fn move_to_shards(thumbnails_directory: impl AsRef<Path>) -> Result<(), Th } } - info!( - "Moved {} webp files to their respective shard folders.", - count - ); + info!(%count, "Moved webp files to their respective shard folders;"); Ok(()) } @@ -237,9 +237,9 @@ async fn segregate_thumbnails_by_library( async move { trace!( - "Moving thumbnail from old location to new location: {} -> {}", - old.display(), - new.display() + old_location = %old.display(), + new_location = %new.display(), + "Moving thumbnail from old location to new location;", ); match fs::rename(&old, new).await { @@ -271,8 +271,10 @@ async fn segregate_thumbnails_by_library( let moved_count = to_move.try_join().await?.into_iter().sum::(); info!( - "Created {shards_created_count} shards and moved {moved_count} \ - thumbnails to library folder {library_id}" + %shards_created_count, + %moved_count, + %library_id, + "Created shards and moved thumbnails to library folder;", ); Ok::<_, ThumbnailerError>(()) @@ -332,9 +334,9 @@ async fn segregate_thumbnails_by_library( to_move.push(async move { trace!( - "Moving thumbnail from old location to new location: {} -> {}", - thumb_path.display(), - new_ephemeral_shard.display() + old_location = %thumb_path.display(), + new_location = %new_ephemeral_shard.display(), + "Moving thumbnail from old location to new location;" ); fs::rename(&thumb_path, &new_ephemeral_shard) @@ -361,7 +363,7 @@ async fn segregate_thumbnails_by_library( let moved_shard = to_move.try_join().await?.len(); - - info!("Moved {moved_shard} shards to the ephemeral directory"); + info!(%moved_shard, "Moved shards to the ephemeral directory;"); empty_shards .into_iter() .filter_map(|path| { path.file_name() .map_or(false, |name| name.len() == 2) .then_some(async move { - trace!("Removing empty shard directory: {}", path.display()); + trace!(path = %path.display(), "Removing empty shard directory;"); fs::remove_dir(&path) .await .map_err(|e| FileIOError::from((path, e))) diff --git a/core/src/object/media/old_thumbnail/mod.rs b/core/src/object/media/old_thumbnail/mod.rs deleted file mode 100644 index b55cf833d..000000000 --- a/core/src/object/media/old_thumbnail/mod.rs +++ /dev/null @@ -1,205 +0,0 @@ -use crate::{library::LibraryId, util::version_manager::VersionManagerError, Node}; - -use sd_file_ext::extensions::{ - DocumentExtension, Extension, ImageExtension, ALL_DOCUMENT_EXTENSIONS, ALL_IMAGE_EXTENSIONS, -}; - -use sd_utils::error::FileIOError; - -#[cfg(feature = "ffmpeg")] -use sd_file_ext::extensions::{VideoExtension, ALL_VIDEO_EXTENSIONS}; - -use std::{ - path::{Path, PathBuf}, - time::Duration, -}; - -use once_cell::sync::Lazy; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tokio::task; -use tracing::error; - -mod clean_up; -mod directory; -pub mod old_actor; -pub mod preferences; -mod process; -mod shard; -mod state; -mod worker; - -pub use process::{BatchToProcess, GenerateThumbnailArgs}; -pub use shard::get_shard_hex; - -use directory::ThumbnailVersion; - -// Files names constants -const THUMBNAIL_CACHE_DIR_NAME: &str = "thumbnails"; -const SAVE_STATE_FILE: &str = "thumbs_to_process.bin"; -const VERSION_FILE: &str = "version.txt"; -pub const WEBP_EXTENSION: &str = "webp"; -const EPHEMERAL_DIR: &str = "ephemeral"; - -/// This is the target pixel count for all thumbnails to be resized to, and it is eventually downscaled -/// to [`TARGET_QUALITY`].
-const TARGET_PX: f32 = 1048576.0; // 1024x1024 - -/// This is the target quality that we render thumbnails at, it is a float between 0-100 -/// and is treated as a percentage (so 60% in this case, or it's the same as multiplying by `0.6`). -const TARGET_QUALITY: f32 = 60.0; - -// Some time constants -const ONE_SEC: Duration = Duration::from_secs(1); -const THIRTY_SECS: Duration = Duration::from_secs(30); -const HALF_HOUR: Duration = Duration::from_secs(30 * 60); - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] -pub enum ThumbnailKind { - Ephemeral, - Indexed(LibraryId), -} - -pub fn get_indexed_thumbnail_path(node: &Node, cas_id: &str, library_id: LibraryId) -> PathBuf { - get_thumbnail_path(node, cas_id, ThumbnailKind::Indexed(library_id)) -} - -pub fn get_ephemeral_thumbnail_path(node: &Node, cas_id: &str) -> PathBuf { - get_thumbnail_path(node, cas_id, ThumbnailKind::Ephemeral) -} - -/// This does not check if a thumbnail exists, it just returns the path that it would exist at -fn get_thumbnail_path(node: &Node, cas_id: &str, kind: ThumbnailKind) -> PathBuf { - let mut thumb_path = node.config.data_directory(); - - thumb_path.push(THUMBNAIL_CACHE_DIR_NAME); - match kind { - ThumbnailKind::Ephemeral => thumb_path.push(EPHEMERAL_DIR), - ThumbnailKind::Indexed(library_id) => { - thumb_path.push(library_id.to_string()); - } - } - thumb_path.push(get_shard_hex(cas_id)); - thumb_path.push(cas_id); - thumb_path.set_extension(WEBP_EXTENSION); - - thumb_path -} - -pub fn get_indexed_thumb_key(cas_id: &str, library_id: LibraryId) -> Vec { - get_thumb_key(cas_id, ThumbnailKind::Indexed(library_id)) -} - -pub fn get_ephemeral_thumb_key(cas_id: &str) -> Vec { - get_thumb_key(cas_id, ThumbnailKind::Ephemeral) -} - -// this is used to pass the relevant data to the frontend so it can request the thumbnail -// it supports extending the shard hex to support deeper directory structures in the future -fn get_thumb_key(cas_id: &str, kind: ThumbnailKind) -> Vec { - vec![ - match kind { - ThumbnailKind::Ephemeral => String::from(EPHEMERAL_DIR), - ThumbnailKind::Indexed(library_id) => library_id.to_string(), - }, - get_shard_hex(cas_id).to_string(), - cas_id.to_string(), - ] -} - -#[cfg(feature = "ffmpeg")] -pub(super) static THUMBNAILABLE_VIDEO_EXTENSIONS: Lazy> = Lazy::new(|| { - ALL_VIDEO_EXTENSIONS - .iter() - .cloned() - .filter(can_generate_thumbnail_for_video) - .map(Extension::Video) - .collect() -}); - -pub(super) static THUMBNAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { - ALL_IMAGE_EXTENSIONS - .iter() - .cloned() - .filter(can_generate_thumbnail_for_image) - .map(Extension::Image) - .chain( - ALL_DOCUMENT_EXTENSIONS - .iter() - .cloned() - .filter(can_generate_thumbnail_for_document) - .map(Extension::Document), - ) - .collect() -}); - -pub(super) static ALL_THUMBNAILABLE_EXTENSIONS: Lazy> = Lazy::new(|| { - #[cfg(feature = "ffmpeg")] - return THUMBNAILABLE_EXTENSIONS - .iter() - .cloned() - .chain(THUMBNAILABLE_VIDEO_EXTENSIONS.iter().cloned()) - .collect(); - - #[cfg(not(feature = "ffmpeg"))] - THUMBNAILABLE_EXTENSIONS.clone() -}); - -#[derive(Error, Debug)] -pub enum ThumbnailerError { - // Internal errors - #[error("database error: {0}")] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - FileIO(#[from] FileIOError), - #[error(transparent)] - VersionManager(#[from] VersionManagerError), - #[error("failed to encode webp")] - WebPEncoding { path: Box, reason: String }, - #[error("error while converting the image")] - SdImages { - path: Box, - error: 
sd_images::Error, - }, - #[error("failed to execute converting task: {0}")] - Task(#[from] task::JoinError), - #[cfg(feature = "ffmpeg")] - #[error(transparent)] - FFmpeg(#[from] sd_ffmpeg::Error), - #[error("thumbnail generation timed out for {}", .0.display())] - TimedOut(Box), -} - -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] -pub enum ThumbnailerEntryKind { - Image, - #[cfg(feature = "ffmpeg")] - Video, -} - -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct ThumbnailerMetadata { - pub created: u32, - pub skipped: u32, -} - -#[cfg(feature = "ffmpeg")] -pub const fn can_generate_thumbnail_for_video(video_extension: &VideoExtension) -> bool { - use VideoExtension::*; - // File extensions that are specifically not supported by the thumbnailer - !matches!(video_extension, Mpg | Swf | M2v | Hevc | M2ts | Mts | Ts) -} - -pub const fn can_generate_thumbnail_for_image(image_extension: &ImageExtension) -> bool { - use ImageExtension::*; - - matches!( - image_extension, - Jpg | Jpeg | Png | Webp | Gif | Svg | Heic | Heics | Heif | Heifs | Avif | Bmp | Ico - ) -} - -pub const fn can_generate_thumbnail_for_document(document_extension: &DocumentExtension) -> bool { - use DocumentExtension::*; - - matches!(document_extension, Pdf) -} diff --git a/core/src/object/media/old_thumbnail/old_actor.rs b/core/src/object/media/old_thumbnail/old_actor.rs deleted file mode 100644 index 8996c81f3..000000000 --- a/core/src/object/media/old_thumbnail/old_actor.rs +++ /dev/null @@ -1,335 +0,0 @@ -use crate::{ - api::CoreEvent, - library::{Libraries, LibraryId, LibraryManagerEvent}, - node::config::NodePreferences, -}; - -use sd_prisma::prisma::{location, PrismaClient}; -use sd_utils::error::{FileIOError, NonUtf8PathError}; - -use std::{ - path::{Path, PathBuf}, - sync::Arc, -}; - -use async_channel as chan; -use once_cell::sync::OnceCell; -use thiserror::Error; -use tokio::{ - fs, spawn, - sync::{broadcast, oneshot, watch, Mutex}, - time::{sleep, Instant}, -}; -use tracing::{error, trace}; -use uuid::Uuid; - -use super::{ - directory::init_thumbnail_dir, - process::{generate_thumbnail, ThumbData}, - state::RegisterReporter, - worker::{old_worker, WorkerChannels}, - BatchToProcess, ThumbnailKind, ThumbnailerError, ONE_SEC, THUMBNAIL_CACHE_DIR_NAME, -}; - -static AVAILABLE_PARALLELISM: OnceCell = OnceCell::new(); - -#[derive(Error, Debug)] -pub(super) enum ActorError { - #[error("database error")] - Database(#[from] prisma_client_rust::QueryError), - #[error(transparent)] - FileIO(#[from] FileIOError), - #[error(transparent)] - NonUtf8Path(#[from] NonUtf8PathError), -} - -#[derive(Debug)] -pub(super) enum DatabaseMessage { - Add(Uuid, Arc), - Update(Uuid, Arc), - Remove(Uuid), -} - -// Thumbnails directory have the following structure: -// thumbnails/ -// ├── version.txt -// ├── thumbs_to_process.bin # processing save state -// ├── ephemeral/ # ephemeral ones have it's own directory -// │ └── [0..3]/ # sharding -// │ └── .webp -// └── / # we segregate thumbnails by library -// └── [0..3]/ # sharding -// └── .webp -pub struct OldThumbnailer { - thumbnails_directory: Arc, - cas_ids_to_delete_tx: chan::Sender<(Vec, ThumbnailKind)>, - thumbnails_to_generate_tx: chan::Sender<(BatchToProcess, ThumbnailKind)>, - progress_reporter_tx: chan::Sender, - last_single_thumb_generated: Mutex, - reporter: broadcast::Sender, - cancel_tx: chan::Sender>, -} - -impl OldThumbnailer { - pub async fn new( - data_dir: impl AsRef, - libraries_manager: Arc, - reporter: broadcast::Sender, - node_preferences_rx: 
watch::Receiver, - ) -> Self { - let data_dir = data_dir.as_ref(); - let thumbnails_directory = Arc::new( - init_thumbnail_dir(data_dir, Arc::clone(&libraries_manager)) - .await - .unwrap_or_else(|e| { - error!("Failed to initialize thumbnail directory: {e:#?}"); - data_dir.join(THUMBNAIL_CACHE_DIR_NAME) - }), - ); - - let (progress_management_tx, progress_management_rx) = chan::bounded(16); - - let (databases_tx, databases_rx) = chan::bounded(4); - let (thumbnails_to_generate_tx, ephemeral_thumbnails_to_generate_rx) = chan::unbounded(); - let (cas_ids_to_delete_tx, cas_ids_to_delete_rx) = chan::bounded(16); - let (cancel_tx, cancel_rx) = chan::bounded(1); - - AVAILABLE_PARALLELISM - .set(std::thread::available_parallelism().map_or_else( - |e| { - error!("Failed to get available parallelism: {e:#?}"); - 4 - }, - |non_zero| non_zero.get(), - )) - .ok(); - - spawn({ - let progress_management_rx = progress_management_rx.clone(); - let cancel_rx = cancel_rx.clone(); - let thumbnails_directory = Arc::clone(&thumbnails_directory); - let reporter = reporter.clone(); - let node_preferences = node_preferences_rx.clone(); - - async move { - while let Err(e) = spawn(old_worker( - *AVAILABLE_PARALLELISM - .get() - .expect("BATCH_SIZE is set at thumbnailer new method"), - node_preferences.clone(), - reporter.clone(), - thumbnails_directory.clone(), - WorkerChannels { - progress_management_rx: progress_management_rx.clone(), - databases_rx: databases_rx.clone(), - cas_ids_to_delete_rx: cas_ids_to_delete_rx.clone(), - thumbnails_to_generate_rx: ephemeral_thumbnails_to_generate_rx.clone(), - cancel_rx: cancel_rx.clone(), - }, - )) - .await - { - error!( - "Error on Thumbnail Remover Actor; \ - Error: {e}; \ - Restarting the worker loop...", - ); - } - } - }); - - spawn({ - let rx = libraries_manager.rx.clone(); - let thumbnails_directory = Arc::clone(&thumbnails_directory); - - async move { - let subscribe_res = rx - .subscribe(|event| { - let databases_tx = databases_tx.clone(); - - let thumbnails_directory = &thumbnails_directory; - - async move { - match event { - LibraryManagerEvent::Load(library) => { - let library_dir = - thumbnails_directory.join(library.id.to_string()); - - if let Err(e) = fs::create_dir_all(&library_dir).await { - error!( - "Failed to create library dir for thumbnails: {:#?}", - FileIOError::from((library_dir, e)) - ); - } - - databases_tx - .send(DatabaseMessage::Add( - library.id, - Arc::clone(&library.db), - )) - .await - .expect("critical thumbnailer error: databases channel closed on send add") - } - - LibraryManagerEvent::Edit(library) - | LibraryManagerEvent::InstancesModified(library) => databases_tx - .send(DatabaseMessage::Update( - library.id, - Arc::clone(&library.db), - )) - .await - .expect("critical thumbnailer error: databases channel closed on send update"), - - LibraryManagerEvent::Delete(library) => databases_tx - .send(DatabaseMessage::Remove(library.id)) - .await - .expect("critical thumbnailer error: databases channel closed on send delete"), - } - } - }) - .await; - - if subscribe_res.is_err() { - error!("Thumbnailer actor has crashed...") - } - } - }); - - Self { - thumbnails_directory, - cas_ids_to_delete_tx, - thumbnails_to_generate_tx, - progress_reporter_tx: progress_management_tx, - last_single_thumb_generated: Mutex::new(Instant::now()), - reporter, - cancel_tx, - } - } - - #[inline] - async fn new_batch(&self, batch: BatchToProcess, kind: ThumbnailKind) { - if !batch.batch.is_empty() { - self.thumbnails_to_generate_tx - .send((batch, kind)) - .await 
- .expect("critical thumbnailer error: failed to send new batch"); - } else { - trace!("Empty batch received, skipping..."); - } - } - - #[inline] - pub async fn new_ephemeral_thumbnails_batch(&self, batch: BatchToProcess) { - self.new_batch(batch, ThumbnailKind::Ephemeral).await - } - - #[inline] - pub async fn new_indexed_thumbnails_batch(&self, batch: BatchToProcess, library_id: LibraryId) { - self.new_batch(batch, ThumbnailKind::Indexed(library_id)) - .await - } - - #[inline] - pub async fn new_indexed_thumbnails_tracked_batch( - &self, - mut batch: BatchToProcess, - library_id: LibraryId, - location_id: location::id::Type, - ) { - batch.location_id = Some(location_id); - - self.new_batch(batch, ThumbnailKind::Indexed(library_id)) - .await; - } - - #[inline] - pub async fn register_reporter( - &self, - location_id: location::id::Type, - progress_tx: chan::Sender<(u32, u32)>, - ) { - self.progress_reporter_tx - .send((location_id, progress_tx)) - .await - .expect("critical thumbnailer error: failed to send register reporter fn"); - } - - #[inline] - async fn remove_cas_ids(&self, cas_ids: Vec, kind: ThumbnailKind) { - self.cas_ids_to_delete_tx - .send((cas_ids, kind)) - .await - .expect("critical thumbnailer error: failed to send cas ids to delete"); - } - - #[inline] - pub async fn remove_ephemeral_cas_ids(&self, cas_ids: Vec) { - self.remove_cas_ids(cas_ids, ThumbnailKind::Ephemeral).await - } - - #[inline] - pub async fn remove_indexed_cas_ids(&self, cas_ids: Vec, library_id: LibraryId) { - self.remove_cas_ids(cas_ids, ThumbnailKind::Indexed(library_id)) - .await - } - - #[inline] - pub async fn shutdown(&self) { - let (tx, rx) = oneshot::channel(); - self.cancel_tx - .send(tx) - .await - .expect("critical thumbnailer error: failed to send shutdown signal"); - - rx.await - .expect("critical thumbnailer error: failed to receive shutdown signal response"); - } - - /// WARNING!!!! DON'T USE THIS METHOD IN A LOOP!!!!!!!!!!!!! It will be pretty slow on purpose! 
- pub async fn generate_single_indexed_thumbnail( - &self, - extension: &str, - cas_id: String, - path: impl AsRef, - library_id: LibraryId, - ) -> Result<(), ThumbnailerError> { - self.generate_single_thumbnail(extension, cas_id, path, ThumbnailKind::Indexed(library_id)) - .await - } - - async fn generate_single_thumbnail( - &self, - extension: &str, - cas_id: String, - path: impl AsRef, - kind: ThumbnailKind, - ) -> Result<(), ThumbnailerError> { - let mut last_single_thumb_generated_guard = self.last_single_thumb_generated.lock().await; - - let elapsed = Instant::now() - *last_single_thumb_generated_guard; - if elapsed < ONE_SEC { - // This will choke up in case someone try to use this method in a loop, otherwise - // it will consume all the machine resources like a gluton monster from hell - sleep(ONE_SEC - elapsed).await; - } - - let res = generate_thumbnail( - self.thumbnails_directory.as_ref().clone(), - ThumbData { - extension, - cas_id, - path, - in_background: false, - should_regenerate: false, - kind, - }, - self.reporter.clone(), - ) - .await - .map(|_| ()); - - *last_single_thumb_generated_guard = Instant::now(); - - res - } -} diff --git a/core/src/object/media/old_thumbnail/preferences.rs b/core/src/object/media/old_thumbnail/preferences.rs deleted file mode 100644 index 39c116e0c..000000000 --- a/core/src/object/media/old_thumbnail/preferences.rs +++ /dev/null @@ -1,34 +0,0 @@ -use serde::{Deserialize, Serialize}; -use specta::Type; - -#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Eq, Type)] -pub struct ThumbnailerPreferences { - background_processing_percentage: u8, // 0-100 -} - -impl Default for ThumbnailerPreferences { - fn default() -> Self { - Self { - background_processing_percentage: 50, // 50% of CPU cores available - } - } -} - -impl ThumbnailerPreferences { - pub fn background_processing_percentage(&self) -> u8 { - self.background_processing_percentage - } - - pub fn set_background_processing_percentage( - &mut self, - mut background_processing_percentage: u8, - ) -> &mut Self { - if background_processing_percentage > 100 { - background_processing_percentage = 100; - } - - self.background_processing_percentage = background_processing_percentage; - - self - } -} diff --git a/core/src/object/media/old_thumbnail/process.rs b/core/src/object/media/old_thumbnail/process.rs deleted file mode 100644 index 136680551..000000000 --- a/core/src/object/media/old_thumbnail/process.rs +++ /dev/null @@ -1,483 +0,0 @@ -use crate::api::CoreEvent; - -use sd_file_ext::extensions::{DocumentExtension, ImageExtension}; -use sd_images::{format_image, scale_dimensions, ConvertibleExtension}; -use sd_media_metadata::exif::Orientation; -use sd_prisma::prisma::location; -use sd_utils::error::FileIOError; - -use std::{ - collections::VecDeque, - ffi::OsString, - ops::Deref, - path::{Path, PathBuf}, - str::FromStr, - sync::Arc, -}; - -use async_channel as chan; -use futures_concurrency::future::{Join, Race}; -use image::{imageops, DynamicImage, GenericImageView}; -use serde::{Deserialize, Serialize}; -use tokio::{ - fs, io, - sync::{broadcast, oneshot, Semaphore}, - task::{spawn, spawn_blocking}, - time::timeout, -}; -use tokio_stream::StreamExt; -use tracing::{debug, error, trace, warn}; -use webp::Encoder; - -use super::{ - can_generate_thumbnail_for_document, can_generate_thumbnail_for_image, get_thumb_key, - preferences::ThumbnailerPreferences, shard::get_shard_hex, ThumbnailKind, ThumbnailerError, - EPHEMERAL_DIR, TARGET_PX, TARGET_QUALITY, THIRTY_SECS, WEBP_EXTENSION, -}; - 
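The deleted `batch_processor` below derives its concurrency budget from the `ThumbnailerPreferences::background_processing_percentage` value shown above and then gates each spawned thumbnail task behind a `tokio::sync::Semaphore` permit. Here is a minimal, self-contained sketch of that throttling pattern; `parallelism_budget` and `do_work` are illustrative stand-ins rather than Spacedrive APIs, and the budget formula only mirrors the `in_parallel_count` computation below (with an extra defensive clamp on the percentage).

```rust
use std::sync::Arc;

use tokio::{sync::Semaphore, task::JoinSet};

/// Mirrors the spirit of the `in_parallel_count` computation: background batches
/// use only a percentage of the available cores, but never drop below one permit
/// so work still makes progress.
fn parallelism_budget(available: usize, background_percentage: u8, in_background: bool) -> usize {
    if in_background {
        usize::max(usize::from(background_percentage.min(100)) * available / 100, 1)
    } else {
        available
    }
}

async fn do_work(item: u32) {
    // Stand-in for `generate_thumbnail`.
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    println!("processed thumbnail stand-in #{item}");
}

#[tokio::main]
async fn main() {
    let available = std::thread::available_parallelism().map_or(4, |n| n.get());
    // 50 is the default background percentage in `ThumbnailerPreferences`.
    let budget = parallelism_budget(available, 50, true);

    let semaphore = Arc::new(Semaphore::new(budget));
    let mut tasks = JoinSet::new();

    for item in 0..32u32 {
        // Acquiring an owned permit *before* spawning keeps at most `budget`
        // tasks in flight at any moment.
        let permit = Arc::clone(&semaphore)
            .acquire_owned()
            .await
            .expect("semaphore is never closed");

        tasks.spawn(async move {
            do_work(item).await;
            drop(permit);
        });
    }

    while tasks.join_next().await.is_some() {}
}
```

The `usize::max(.., 1)` guard matters here: with a 0% background setting the processor still makes progress sequentially instead of stalling, which is exactly the concern noted in the original comment.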
-#[derive(Debug, Serialize, Deserialize)] -pub struct GenerateThumbnailArgs { - pub extension: String, - pub cas_id: String, - pub path: PathBuf, -} - -impl GenerateThumbnailArgs { - pub fn new(extension: String, cas_id: String, path: PathBuf) -> Self { - Self { - extension, - cas_id, - path, - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct BatchToProcess { - pub(super) batch: Vec, - pub(super) should_regenerate: bool, - pub(super) in_background: bool, - pub(super) location_id: Option, -} - -impl BatchToProcess { - pub fn new( - batch: Vec, - should_regenerate: bool, - in_background: bool, - ) -> Self { - Self { - batch, - should_regenerate, - in_background, - location_id: None, - } - } -} - -pub(super) struct ProcessorControlChannels { - pub stop_rx: chan::Receiver>, - pub done_tx: oneshot::Sender<()>, - pub batch_report_progress_tx: chan::Sender<(location::id::Type, u32)>, -} - -pub(super) async fn batch_processor( - thumbnails_directory: Arc, - ( - BatchToProcess { - batch, - should_regenerate, - in_background, - location_id, - }, - kind, - ): (BatchToProcess, ThumbnailKind), - generated_ephemeral_thumbs_file_names_tx: chan::Sender>, - ProcessorControlChannels { - stop_rx, - done_tx, - batch_report_progress_tx, - }: ProcessorControlChannels, - leftovers_tx: chan::Sender<(BatchToProcess, ThumbnailKind)>, - reporter: broadcast::Sender, - (available_parallelism, thumbnailer_preferences): (usize, ThumbnailerPreferences), -) { - let in_parallel_count = if !in_background { - available_parallelism - } else { - usize::max( - // If the user sets the background processing percentage to 0, we still want to process at least sequentially - thumbnailer_preferences.background_processing_percentage() as usize - * available_parallelism - / 100, - 1, - ) - }; - - debug!( - "Processing thumbnails batch of kind {kind:?} with size {} in {}, \ - at most {in_parallel_count} thumbnails at a time", - batch.len(), - if in_background { - "background" - } else { - "foreground" - }, - ); - - let semaphore = Arc::new(Semaphore::new(in_parallel_count)); - - let batch_size = batch.len(); - - // Transforming to `VecDeque` so we don't need to move anything as we consume from the beginning - // This from is guaranteed to be O(1) - let mut queue = VecDeque::from(batch); - - enum RaceOutputs { - Processed, - Stop(oneshot::Sender<()>), - } - - let (maybe_cas_ids_tx, maybe_cas_ids_rx) = if kind == ThumbnailKind::Ephemeral { - let (tx, rx) = chan::bounded(batch_size); - (Some(tx), Some(rx)) - } else { - (None, None) - }; - - let maybe_stopped_tx = if let RaceOutputs::Stop(stopped_tx) = ( - async { - let mut join_handles = Vec::with_capacity(batch_size); - - while !queue.is_empty() { - let permit = Arc::clone(&semaphore) - .acquire_owned() - .await - .expect("this semaphore never closes"); - - let GenerateThumbnailArgs { - extension, - cas_id, - path, - } = queue.pop_front().expect("queue is not empty"); - - // As we got a permit, then there is available CPU to process this thumbnail - join_handles.push(spawn({ - let reporter = reporter.clone(); - let thumbnails_directory = thumbnails_directory.as_ref().clone(); - let report_progress_tx = batch_report_progress_tx.clone(); - let maybe_cas_ids_tx = maybe_cas_ids_tx.clone(); - - async move { - let res = timeout(THIRTY_SECS, async { - generate_thumbnail( - thumbnails_directory, - ThumbData { - extension: &extension, - cas_id, - path: &path, - in_background, - should_regenerate, - kind, - }, - reporter, - ) - .await - .map(|cas_id| { - // this send_blocking never 
blocks as we have a bounded channel with - // the same capacity as the batch size, so there is always a space - // in the queue - if let Some(cas_ids_tx) = maybe_cas_ids_tx { - if cas_ids_tx - .send_blocking(OsString::from(format!("{}.webp", cas_id))) - .is_err() - { - warn!("No one to listen to generated ephemeral thumbnail cas id"); - } - } - }) - }) - .await - .unwrap_or_else(|_| { - Err(ThumbnailerError::TimedOut(path.into_boxed_path())) - }); - - if let Some(location_id) = location_id { - report_progress_tx.send((location_id, 1)).await.ok(); - } - - drop(permit); - - res - } - })); - } - - for res in join_handles.join().await { - match res { - Ok(Ok(())) => { /* Everything is awesome! */ } - Ok(Err(e)) => { - error!( - "Failed to generate thumbnail for {} location: {e:#?}", - if let ThumbnailKind::Ephemeral = kind { - "ephemeral" - } else { - "indexed" - } - ) - } - Err(e) => { - error!("Failed to join thumbnail generation task: {e:#?}"); - } - } - } - - if let Some(cas_ids_tx) = &maybe_cas_ids_tx { - cas_ids_tx.close(); - } - - trace!("Processed batch with {batch_size} thumbnails"); - - RaceOutputs::Processed - }, - async { - let tx = stop_rx - .recv() - .await - .expect("Critical error on thumbnails actor"); - trace!("Received a stop signal"); - RaceOutputs::Stop(tx) - }, - ) - .race() - .await - { - // Our queue is always contiguous, so this `from` is free - let leftovers = Vec::from(queue); - - trace!( - "Stopped with {} thumbnails left to process", - leftovers.len() - ); - if !leftovers.is_empty() - && leftovers_tx - .send(( - BatchToProcess { - batch: leftovers, - should_regenerate, - in_background: true, // Leftovers should always be in background - location_id, - }, - kind, - )) - .await - .is_err() - { - error!("Thumbnail actor is dead: Failed to send leftovers") - } - - if let Some(cas_ids_tx) = &maybe_cas_ids_tx { - cas_ids_tx.close(); - } - - Some(stopped_tx) - } else { - None - }; - - if let Some(cas_ids_rx) = maybe_cas_ids_rx { - if generated_ephemeral_thumbs_file_names_tx - .send(cas_ids_rx.collect().await) - .await - .is_err() - { - error!("Thumbnail actor is dead: Failed to send generated cas ids") - } - } - - if let Some(stopped_tx) = maybe_stopped_tx { - stopped_tx.send(()).ok(); - } else { - trace!("Finished batch!"); - } - - done_tx.send(()).ok(); -} - -pub(super) struct ThumbData<'ext, P: AsRef> { - pub extension: &'ext str, - pub cas_id: String, - pub path: P, - pub in_background: bool, - pub should_regenerate: bool, - pub kind: ThumbnailKind, -} - -pub(super) async fn generate_thumbnail( - thumbnails_directory: PathBuf, - ThumbData { - extension, - cas_id, - path, - in_background, - should_regenerate, - kind, - }: ThumbData<'_, impl AsRef>, - reporter: broadcast::Sender, -) -> Result { - let path = path.as_ref(); - trace!("Generating thumbnail for {}", path.display()); - - let mut output_path = thumbnails_directory; - match kind { - ThumbnailKind::Ephemeral => output_path.push(EPHEMERAL_DIR), - ThumbnailKind::Indexed(library_id) => output_path.push(library_id.to_string()), - }; - output_path.push(get_shard_hex(&cas_id)); - output_path.push(&cas_id); - output_path.set_extension(WEBP_EXTENSION); - - if let Err(e) = fs::metadata(&output_path).await { - if e.kind() != io::ErrorKind::NotFound { - error!( - "Failed to check if thumbnail exists, but we will try to generate it anyway: {e:#?}" - ); - } - // Otherwise we good, thumbnail doesn't exist so we can generate it - } else if !should_regenerate { - trace!( - "Skipping thumbnail generation for {} because it 
already exists", - path.display() - ); - return Ok(cas_id); - } - - if let Ok(extension) = ImageExtension::from_str(extension) { - if can_generate_thumbnail_for_image(&extension) { - generate_image_thumbnail(&path, &output_path).await?; - } - } else if let Ok(extension) = DocumentExtension::from_str(extension) { - if can_generate_thumbnail_for_document(&extension) { - generate_image_thumbnail(&path, &output_path).await?; - } - } - - #[cfg(feature = "ffmpeg")] - { - use crate::object::media::old_thumbnail::can_generate_thumbnail_for_video; - use sd_file_ext::extensions::VideoExtension; - - if let Ok(extension) = VideoExtension::from_str(extension) { - if can_generate_thumbnail_for_video(&extension) { - generate_video_thumbnail(&path, &output_path).await?; - } - } - } - // This if is REALLY needed, due to the sheer performance of the thumbnailer, - // I restricted to only send events notifying for thumbnails in the current - // opened directory, sending events for the entire location turns into a - // humongous bottleneck in the frontend lol, since it doesn't even knows - // what to do with thumbnails for inner directories lol - // - fogodev - if !in_background { - trace!("Emitting new thumbnail event"); - if reporter - .send(CoreEvent::NewThumbnail { - thumb_key: get_thumb_key(&cas_id, kind), - }) - .is_err() - { - warn!("Error sending event to Node's event bus"); - } - } - - trace!("Generated thumbnail for {}", path.display()); - - Ok(cas_id) -} - -async fn generate_image_thumbnail( - file_path: impl AsRef, - output_path: impl AsRef, -) -> Result<(), ThumbnailerError> { - let file_path = file_path.as_ref().to_path_buf(); - - let webp = spawn_blocking(move || -> Result<_, ThumbnailerError> { - let mut img = format_image(&file_path).map_err(|e| ThumbnailerError::SdImages { - path: file_path.clone().into_boxed_path(), - error: e, - })?; - - let (w, h) = img.dimensions(); - let (w_scaled, h_scaled) = scale_dimensions(w as f32, h as f32, TARGET_PX); - - // Optionally, resize the existing photo and convert back into DynamicImage - if w != w_scaled && h != h_scaled { - img = DynamicImage::ImageRgba8(imageops::resize( - &img, - w_scaled, - h_scaled, - imageops::FilterType::Triangle, - )); - } - - // this corrects the rotation/flip of the image based on the *available* exif data - // not all images have exif data, so we don't error. we also don't rotate HEIF as that's against the spec - if let Some(orientation) = Orientation::from_path(&file_path) { - if ConvertibleExtension::try_from(file_path.as_ref()) - .expect("we already checked if the image was convertible") - .should_rotate() - { - img = orientation.correct_thumbnail(img); - } - } - - // Create the WebP encoder for the above image - let encoder = - Encoder::from_image(&img).map_err(|reason| ThumbnailerError::WebPEncoding { - path: file_path.into_boxed_path(), - reason: reason.to_string(), - })?; - - // Type WebPMemory is !Send, which makes the Future in this function !Send, - // this make us `deref` to have a `&[u8]` and then `to_owned` to make a Vec - // which implies on a unwanted clone... 
- Ok(encoder.encode(TARGET_QUALITY).deref().to_owned()) - }) - .await??; - - let output_path = output_path.as_ref(); - - if let Some(shard_dir) = output_path.parent() { - fs::create_dir_all(shard_dir) - .await - .map_err(|e| FileIOError::from((shard_dir, e)))?; - } else { - error!( - "Failed to get parent directory of '{}' for sharding parent directory", - output_path.display() - ); - } - - fs::write(output_path, &webp) - .await - .map_err(|e| FileIOError::from((output_path, e))) - .map_err(Into::into) -} - -#[cfg(feature = "ffmpeg")] -async fn generate_video_thumbnail( - file_path: impl AsRef + Send, - output_path: impl AsRef + Send, -) -> Result<(), ThumbnailerError> { - use sd_ffmpeg::{to_thumbnail, ThumbnailSize}; - - to_thumbnail( - file_path, - output_path, - ThumbnailSize::Scale(1024), - TARGET_QUALITY, - ) - .await - .map_err(Into::into) -} diff --git a/core/src/object/media/old_thumbnail/shard.rs b/core/src/object/media/old_thumbnail/shard.rs deleted file mode 100644 index be61e2034..000000000 --- a/core/src/object/media/old_thumbnail/shard.rs +++ /dev/null @@ -1,13 +0,0 @@ -/// The practice of dividing files into hex coded folders, often called "sharding," -/// is mainly used to optimize file system performance. File systems can start to slow down -/// as the number of files in a directory increases. Thus, it's often beneficial to split -/// files into multiple directories to avoid this performance degradation. - -/// `get_shard_hex` takes a cas_id (a hexadecimal hash) as input and returns the first -/// three characters of the hash as the directory name. Because we're using these first -/// three characters of a the hash, this will give us 4096 (16^3) possible directories, -/// named 000 to fff. -pub fn get_shard_hex(cas_id: &str) -> &str { - // Use the first three characters of the hash as the directory name - &cas_id[0..3] -} diff --git a/core/src/object/media/old_thumbnail/state.rs b/core/src/object/media/old_thumbnail/state.rs deleted file mode 100644 index 7cce8d1df..000000000 --- a/core/src/object/media/old_thumbnail/state.rs +++ /dev/null @@ -1,225 +0,0 @@ -use crate::library::LibraryId; - -use sd_prisma::prisma::location; -use sd_utils::error::FileIOError; - -use std::{ - collections::{hash_map::Entry, HashMap, HashSet, VecDeque}, - ffi::OsString, - path::Path, -}; - -use async_channel as chan; -use futures_concurrency::future::TryJoin; -use serde::{Deserialize, Serialize}; -use tokio::{fs, io}; -use tracing::{error, info, trace}; - -use super::{ - get_shard_hex, old_actor::ActorError, BatchToProcess, ThumbnailKind, EPHEMERAL_DIR, - SAVE_STATE_FILE, -}; - -#[derive(Debug, Serialize, Deserialize)] -pub(super) struct OldThumbsProcessingSaveState { - pub(super) bookkeeper: BookKeeper, - pub(super) ephemeral_file_names: HashSet, - // This queues doubles as LIFO and FIFO, assuming LIFO in case of users asking for a new batch - // by entering a new directory in the explorer, otherwise processing as FIFO - pub(super) queue: VecDeque<(BatchToProcess, ThumbnailKind)>, - // These below are FIFO queues, so we can process leftovers from the previous batch first - pub(super) indexed_leftovers_queue: VecDeque<(BatchToProcess, LibraryId)>, - pub(super) ephemeral_leftovers_queue: VecDeque, -} - -impl Default for OldThumbsProcessingSaveState { - fn default() -> Self { - Self { - bookkeeper: BookKeeper::default(), - ephemeral_file_names: HashSet::with_capacity(128), - queue: VecDeque::with_capacity(32), - indexed_leftovers_queue: VecDeque::with_capacity(8), - ephemeral_leftovers_queue: 
VecDeque::with_capacity(8), - } - } -} - -impl OldThumbsProcessingSaveState { - pub(super) async fn load(thumbnails_directory: impl AsRef) -> Self { - let resume_file = thumbnails_directory.as_ref().join(SAVE_STATE_FILE); - - match fs::read(&resume_file).await { - Ok(bytes) => { - let this = rmp_serde::from_slice::(&bytes).unwrap_or_else(|e| { - error!("Failed to deserialize save state at thumbnailer actor: {e:#?}"); - Self::default() - }); - - if let Err(e) = fs::remove_file(&resume_file).await { - error!( - "Failed to remove save state file at thumbnailer actor: {:#?}", - FileIOError::from((resume_file, e)) - ); - } - - info!( - "Resuming thumbnailer actor state: Existing ephemeral thumbs: {}; \ - Queued batches waiting processing: {}", - this.ephemeral_file_names.len(), - this.queue.len() - + this.indexed_leftovers_queue.len() - + this.ephemeral_leftovers_queue.len() - ); - - this - } - Err(e) if e.kind() == io::ErrorKind::NotFound => { - trace!("No save state found at thumbnailer actor"); - Self::default() - } - Err(e) => { - error!( - "Failed to read save state at thumbnailer actor: {:#?}", - FileIOError::from((resume_file, e)) - ); - Self::default() - } - } - } - - pub(super) async fn store(self, thumbnails_directory: impl AsRef) { - let resume_file = thumbnails_directory.as_ref().join(SAVE_STATE_FILE); - - info!( - "Saving thumbnailer actor state: Existing ephemeral thumbs: {}; \ - Queued batches waiting processing: {}", - self.ephemeral_file_names.len(), - self.queue.len() - + self.indexed_leftovers_queue.len() - + self.ephemeral_leftovers_queue.len() - ); - - let Ok(bytes) = rmp_serde::to_vec_named(&self).map_err(|e| { - error!("Failed to serialize save state at thumbnailer actor: {e:#?}"); - }) else { - return; - }; - - if let Err(e) = fs::write(&resume_file, bytes).await { - error!( - "Failed to write save state at thumbnailer actor: {:#?}", - FileIOError::from((resume_file, e)) - ); - } - } -} - -pub(super) async fn remove_by_cas_ids( - thumbnails_directory: &Path, - cas_ids: Vec, - kind: ThumbnailKind, -) -> Result<(), ActorError> { - let base_dir = match kind { - ThumbnailKind::Ephemeral => thumbnails_directory.join(EPHEMERAL_DIR), - ThumbnailKind::Indexed(library_id) => thumbnails_directory.join(library_id.to_string()), - }; - - cas_ids - .into_iter() - .map(|cas_id| { - let thumbnail_path = base_dir.join(format!("{}/{cas_id}.webp", get_shard_hex(&cas_id))); - - trace!("Removing thumbnail: {}", thumbnail_path.display()); - - async move { - match fs::remove_file(&thumbnail_path).await { - Ok(()) => Ok(()), - Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()), - Err(e) => Err(FileIOError::from((thumbnail_path, e))), - } - } - }) - .collect::>() - .try_join() - .await?; - - Ok(()) -} - -pub(super) type RegisterReporter = (location::id::Type, chan::Sender<(u32, u32)>); - -#[derive(Debug, Serialize, Deserialize)] -pub(super) struct BookKeeper { - work_progress: HashMap, // (pending, total) - - // We can't save reporter function or a channel to disk, the job must ask again to be registered - #[serde(skip, default)] - reporter_by_location: HashMap>, -} -impl Default for BookKeeper { - fn default() -> Self { - Self { - work_progress: HashMap::with_capacity(8), - reporter_by_location: HashMap::with_capacity(8), - } - } -} - -impl BookKeeper { - pub(super) async fn add_work(&mut self, location_id: location::id::Type, thumbs_count: u32) { - let (in_progress, total) = match self.work_progress.entry(location_id) { - Entry::Occupied(mut entry) => { - let (in_progress, total) = 
entry.get_mut(); - - *total += thumbs_count; - - (*in_progress, *total) - } - Entry::Vacant(entry) => { - entry.insert((0, thumbs_count)); - - (0, thumbs_count) - } - }; - - if let Some(progress_tx) = self.reporter_by_location.get(&location_id) { - if progress_tx.send((in_progress, total)).await.is_err() { - error!( - "Failed to send progress update to reporter on location " - ); - } - } - } - - pub(super) fn register_reporter( - &mut self, - location_id: location::id::Type, - reporter_tx: chan::Sender<(u32, u32)>, - ) { - self.reporter_by_location.insert(location_id, reporter_tx); - } - - pub(super) async fn add_progress(&mut self, location_id: location::id::Type, progress: u32) { - if let Some((current_progress, total)) = self.work_progress.get_mut(&location_id) { - *current_progress += progress; - - if *current_progress == *total { - if let Some(progress_tx) = self.reporter_by_location.remove(&location_id) { - if progress_tx.send((*current_progress, *total)).await.is_err() { - error!( - "Failed to send progress update to reporter on location " - ); - } - } - - self.work_progress.remove(&location_id); - } else if let Some(progress_tx) = self.reporter_by_location.get(&location_id) { - if progress_tx.send((*current_progress, *total)).await.is_err() { - error!( - "Failed to send progress update to reporter on location " - ); - } - } - } - } -} diff --git a/core/src/object/media/old_thumbnail/worker.rs b/core/src/object/media/old_thumbnail/worker.rs deleted file mode 100644 index c3696028a..000000000 --- a/core/src/object/media/old_thumbnail/worker.rs +++ /dev/null @@ -1,350 +0,0 @@ -use crate::{api::CoreEvent, node::config::NodePreferences}; - -use sd_prisma::prisma::location; - -use std::{collections::HashMap, ffi::OsString, path::PathBuf, pin::pin, sync::Arc}; - -use async_channel as chan; -use futures_concurrency::stream::Merge; -use tokio::{ - spawn, - sync::{broadcast, oneshot, watch}, - time::{interval, interval_at, timeout, Instant, MissedTickBehavior}, -}; -use tokio_stream::{ - wrappers::{IntervalStream, WatchStream}, - StreamExt, -}; -use tracing::{debug, error, trace}; - -use super::{ - clean_up::{process_ephemeral_clean_up, process_indexed_clean_up}, - old_actor::DatabaseMessage, - preferences::ThumbnailerPreferences, - process::{batch_processor, ProcessorControlChannels}, - state::{remove_by_cas_ids, OldThumbsProcessingSaveState, RegisterReporter}, - BatchToProcess, ThumbnailKind, HALF_HOUR, ONE_SEC, THIRTY_SECS, -}; - -#[derive(Debug, Clone)] -pub(super) struct WorkerChannels { - pub(super) progress_management_rx: chan::Receiver, - pub(super) databases_rx: chan::Receiver, - pub(super) cas_ids_to_delete_rx: chan::Receiver<(Vec, ThumbnailKind)>, - pub(super) thumbnails_to_generate_rx: chan::Receiver<(BatchToProcess, ThumbnailKind)>, - pub(super) cancel_rx: chan::Receiver>, -} - -pub(super) async fn old_worker( - available_parallelism: usize, - node_preferences_rx: watch::Receiver, - reporter: broadcast::Sender, - thumbnails_directory: Arc, - WorkerChannels { - progress_management_rx, - databases_rx, - cas_ids_to_delete_rx, - thumbnails_to_generate_rx, - cancel_rx, - }: WorkerChannels, -) { - let mut to_remove_interval = interval_at(Instant::now() + THIRTY_SECS, HALF_HOUR); - to_remove_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let mut idle_interval = interval(ONE_SEC); - idle_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - - let mut databases = HashMap::new(); - - #[derive(Debug)] - enum StreamMessage { - RemovalTick, - ToDelete((Vec, 
ThumbnailKind)), - Database(DatabaseMessage), - NewBatch((BatchToProcess, ThumbnailKind)), - Leftovers((BatchToProcess, ThumbnailKind)), - NewEphemeralThumbnailsFilenames(Vec), - ProgressManagement(RegisterReporter), - BatchProgress((location::id::Type, u32)), - Shutdown(oneshot::Sender<()>), - UpdatedPreferences(ThumbnailerPreferences), - IdleTick, - } - - let OldThumbsProcessingSaveState { - mut bookkeeper, - mut ephemeral_file_names, - mut queue, - mut indexed_leftovers_queue, - mut ephemeral_leftovers_queue, - } = OldThumbsProcessingSaveState::load(thumbnails_directory.as_ref()).await; - - let (generated_ephemeral_thumbnails_tx, ephemeral_thumbnails_cas_ids_rx) = chan::bounded(32); - let (leftovers_tx, leftovers_rx) = chan::bounded(8); - let (batch_report_progress_tx, batch_report_progress_rx) = chan::bounded(8); - let (stop_older_processing_tx, stop_older_processing_rx) = chan::bounded(1); - - let mut shutdown_leftovers_rx = pin!(leftovers_rx.clone()); - let mut shutdown_batch_report_progress_rx = pin!(batch_report_progress_rx.clone()); - - let mut current_batch_processing_rx: Option> = None; - - let mut msg_stream = pin!(( - IntervalStream::new(to_remove_interval).map(|_| StreamMessage::RemovalTick), - cas_ids_to_delete_rx.map(StreamMessage::ToDelete), - databases_rx.map(StreamMessage::Database), - thumbnails_to_generate_rx.map(StreamMessage::NewBatch), - leftovers_rx.map(StreamMessage::Leftovers), - ephemeral_thumbnails_cas_ids_rx.map(StreamMessage::NewEphemeralThumbnailsFilenames), - progress_management_rx.map(StreamMessage::ProgressManagement), - batch_report_progress_rx.map(StreamMessage::BatchProgress), - cancel_rx.map(StreamMessage::Shutdown), - IntervalStream::new(idle_interval).map(|_| StreamMessage::IdleTick), - WatchStream::new(node_preferences_rx).map(|node_preferences| { - StreamMessage::UpdatedPreferences(node_preferences.thumbnailer) - }), - ) - .merge()); - - let mut thumbnailer_preferences = ThumbnailerPreferences::default(); - - while let Some(msg) = msg_stream.next().await { - match msg { - StreamMessage::IdleTick => { - if let Some(done_rx) = current_batch_processing_rx.as_mut() { - // Checking if the previous run finished or was aborted to clean state - match done_rx.try_recv() { - Ok(()) | Err(oneshot::error::TryRecvError::Closed) => { - current_batch_processing_rx = None; - } - - Err(oneshot::error::TryRecvError::Empty) => { - // The previous run is still running - continue; - } - } - } - - if current_batch_processing_rx.is_none() - && (!queue.is_empty() - || !indexed_leftovers_queue.is_empty() - || !ephemeral_leftovers_queue.is_empty()) - { - let (done_tx, done_rx) = oneshot::channel(); - current_batch_processing_rx = Some(done_rx); - - let batch_and_kind = if let Some(batch_and_kind) = queue.pop_front() { - batch_and_kind - } else if let Some((batch, library_id)) = indexed_leftovers_queue.pop_front() { - // indexed leftovers have bigger priority - (batch, ThumbnailKind::Indexed(library_id)) - } else if let Some(batch) = ephemeral_leftovers_queue.pop_front() { - (batch, ThumbnailKind::Ephemeral) - } else { - continue; - }; - - spawn(batch_processor( - thumbnails_directory.clone(), - batch_and_kind, - generated_ephemeral_thumbnails_tx.clone(), - ProcessorControlChannels { - stop_rx: stop_older_processing_rx.clone(), - done_tx, - batch_report_progress_tx: batch_report_progress_tx.clone(), - }, - leftovers_tx.clone(), - reporter.clone(), - (available_parallelism, thumbnailer_preferences.clone()), - )); - } - } - - StreamMessage::RemovalTick => { - // For any of 
them we process a clean up if a time since the last one already passed - if !databases.is_empty() { - spawn(process_indexed_clean_up( - thumbnails_directory.clone(), - databases - .iter() - .map(|(id, db)| (*id, Arc::clone(db))) - .collect::>(), - )); - } - - if !ephemeral_file_names.is_empty() { - spawn(process_ephemeral_clean_up( - thumbnails_directory.clone(), - ephemeral_file_names.clone(), - )); - } - } - - StreamMessage::ToDelete((cas_ids, kind)) => { - if !cas_ids.is_empty() { - if let Err(e) = remove_by_cas_ids(&thumbnails_directory, cas_ids, kind).await { - error!("Got an error when trying to remove thumbnails: {e:#?}"); - } - } - } - - StreamMessage::NewBatch((batch, kind)) => { - let in_background = batch.in_background; - - if let Some(location_id) = batch.location_id { - bookkeeper - .add_work(location_id, batch.batch.len() as u32) - .await; - } - - trace!( - "New {kind:?} batch to process in {}, size: {}", - if in_background { - "background" - } else { - "foreground" - }, - batch.batch.len() - ); - - if in_background { - queue.push_back((batch, kind)); - } else { - // If a processing must be in foreground, then it takes maximum priority - queue.push_front((batch, kind)); - } - - // Only sends stop signal if there is a batch being processed - if !in_background { - stop_batch( - ¤t_batch_processing_rx, - &stop_older_processing_tx, - &stop_older_processing_rx, - ) - .await; - } - } - - StreamMessage::Leftovers((batch, ThumbnailKind::Indexed(library_id))) => { - indexed_leftovers_queue.push_back((batch, library_id)) - } - - StreamMessage::Leftovers((batch, ThumbnailKind::Ephemeral)) => { - ephemeral_leftovers_queue.push_back(batch) - } - - StreamMessage::Database(DatabaseMessage::Add(id, db)) - | StreamMessage::Database(DatabaseMessage::Update(id, db)) => { - databases.insert(id, db); - } - - StreamMessage::Database(DatabaseMessage::Remove(id)) => { - databases.remove(&id); - } - - StreamMessage::NewEphemeralThumbnailsFilenames(new_ephemeral_thumbs) => { - trace!("New ephemeral thumbnails: {}", new_ephemeral_thumbs.len()); - ephemeral_file_names.extend(new_ephemeral_thumbs); - } - - StreamMessage::BatchProgress((location_id, progressed)) => { - bookkeeper.add_progress(location_id, progressed).await; - } - - StreamMessage::Shutdown(cancel_tx) => { - debug!("Thumbnail actor is shutting down..."); - let start = Instant::now(); - - stop_batch( - ¤t_batch_processing_rx, - &stop_older_processing_tx, - &stop_older_processing_rx, - ) - .await; - - // Closing the leftovers channel to stop the batch processor as we already sent - // an stop signal - leftovers_tx.close(); - while let Some((batch, kind)) = shutdown_leftovers_rx.next().await { - match kind { - ThumbnailKind::Indexed(library_id) => { - indexed_leftovers_queue.push_back((batch, library_id)) - } - ThumbnailKind::Ephemeral => ephemeral_leftovers_queue.push_back(batch), - } - } - - // Consuming the last progress reports to keep everything up to date - shutdown_batch_report_progress_rx.close(); - while let Some((location_id, progressed)) = - shutdown_batch_report_progress_rx.next().await - { - bookkeeper.add_progress(location_id, progressed).await; - } - - // Saving state - OldThumbsProcessingSaveState { - bookkeeper, - ephemeral_file_names, - queue, - indexed_leftovers_queue, - ephemeral_leftovers_queue, - } - .store(thumbnails_directory.as_ref()) - .await; - - // Signaling that we're done shutting down - cancel_tx.send(()).ok(); - - debug!("Thumbnailer has been shutdown in {:?}", start.elapsed()); - return; - } - - 
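For orientation, the shape the deleted worker is built around: every input (cleanup intervals, deletion requests, new batches, leftovers, progress registration, preference changes, shutdown) is mapped into one `StreamMessage` stream via `futures_concurrency`'s `Merge`, and shutdown is acknowledged through a `oneshot` sender only after pending work has been drained and state persisted. A condensed sketch of that merged-stream actor pattern follows, assuming the same crates the worker already pulls in (`async-channel`, `futures-concurrency`, `tokio-stream`); the `Message` enum, `worker` function, and counter are illustrative stand-ins, not Spacedrive code.

```rust
use std::{pin::pin, time::Duration};

use async_channel as chan;
use futures_concurrency::stream::Merge;
use tokio::{sync::oneshot, time::interval};
use tokio_stream::{wrappers::IntervalStream, StreamExt};

enum Message {
    Tick,
    Work(u32),
    Shutdown(oneshot::Sender<()>),
}

async fn worker(work_rx: chan::Receiver<u32>, cancel_rx: chan::Receiver<oneshot::Sender<()>>) {
    // Merge timers and channels into a single stream of messages, like the
    // worker above does with its `StreamMessage` enum.
    let mut msg_stream = pin!((
        IntervalStream::new(interval(Duration::from_secs(1))).map(|_| Message::Tick),
        work_rx.map(Message::Work),
        cancel_rx.map(Message::Shutdown),
    )
        .merge());

    let mut processed = 0u32;

    while let Some(msg) = msg_stream.next().await {
        match msg {
            Message::Tick => println!("idle tick; processed so far: {processed}"),
            Message::Work(item) => {
                // Real code would queue a batch here; we just count.
                processed += item;
            }
            Message::Shutdown(ack) => {
                // Persist state here before acknowledging, analogous to
                // `OldThumbsProcessingSaveState::store` above.
                ack.send(()).ok();
                return;
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (work_tx, work_rx) = chan::bounded(8);
    let (cancel_tx, cancel_rx) = chan::bounded(1);

    let handle = tokio::spawn(worker(work_rx, cancel_rx));

    for i in 1..=5 {
        work_tx.send(i).await.unwrap();
    }

    let (ack_tx, ack_rx) = oneshot::channel();
    cancel_tx.send(ack_tx).await.unwrap();
    ack_rx.await.unwrap();
    handle.await.unwrap();
}
```

Replying on the `oneshot::Sender` carried inside the shutdown message is what lets the caller's `shutdown()` await a confirmed, state-persisted stop rather than a fire-and-forget cancel.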
StreamMessage::ProgressManagement((location_id, progress_tx)) => { - bookkeeper.register_reporter(location_id, progress_tx); - } - - StreamMessage::UpdatedPreferences(preferences) => { - thumbnailer_preferences = preferences; - stop_batch( - ¤t_batch_processing_rx, - &stop_older_processing_tx, - &stop_older_processing_rx, - ) - .await; - } - } - } -} - -#[inline] -async fn stop_batch( - current_batch_processing_rx: &Option>, - stop_older_processing_tx: &chan::Sender>, - stop_older_processing_rx: &chan::Receiver>, -) { - // First stopping the current batch processing - if current_batch_processing_rx.is_some() { - trace!("Sending stop signal to older processing"); - - let (tx, rx) = oneshot::channel(); - - match stop_older_processing_tx.try_send(tx) { - Ok(()) => { - // We put a timeout here to avoid a deadlock in case the older processing already - // finished its batch - if timeout(ONE_SEC, rx).await.is_err() { - stop_older_processing_rx.recv().await.ok(); - } - } - Err(e) if e.is_full() => { - // The last signal we sent happened after a batch was already processed - // So we clean the channel and we're good to go. - stop_older_processing_rx.recv().await.ok(); - } - Err(_) => { - error!("Thumbnail actor died when trying to stop older processing"); - } - } - } -} diff --git a/core/src/object/mod.rs b/core/src/object/mod.rs index 08c41f1a4..e4de76f43 100644 --- a/core/src/object/mod.rs +++ b/core/src/object/mod.rs @@ -1,29 +1,3 @@ -use sd_prisma::prisma::{file_path, object}; - -use serde::{Deserialize, Serialize}; -use specta::Type; - -pub mod cas; pub mod fs; -pub mod media; -pub mod old_file_identifier; -pub mod old_orphan_remover; pub mod tag; pub mod validation; - -// Objects are primarily created by the identifier from Paths -// Some Objects are purely virtual, unless they have one or more associated Paths, which refer to a file found in a Location -// Objects are what can be added to Spaces - -// The response to provide the Explorer when looking at Objects -#[derive(Debug, Serialize, Deserialize, Type)] -pub struct ObjectsForExplorer { - pub objects: Vec, - // pub context: ExplorerContext, -} - -#[derive(Debug, Serialize, Deserialize, Type)] -pub enum ObjectData { - Object(Box), - Path(Box), -} diff --git a/core/src/object/old_file_identifier/mod.rs b/core/src/object/old_file_identifier/mod.rs deleted file mode 100644 index a98495420..000000000 --- a/core/src/object/old_file_identifier/mod.rs +++ /dev/null @@ -1,404 +0,0 @@ -use crate::{library::Library, object::cas::generate_cas_id, old_job::JobError}; - -use sd_core_file_path_helper::{FilePathError, IsolatedFilePathData}; -use sd_core_prisma_helpers::{file_path_for_file_identifier, object_for_file_identifier}; - -use sd_file_ext::{extensions::Extension, kind::ObjectKind}; -use sd_prisma::{ - prisma::{file_path, location, object, PrismaClient}, - prisma_sync, -}; -use sd_sync::{CRDTOperation, OperationFactory}; -use sd_utils::{db::maybe_missing, error::FileIOError, msgpack, uuid_to_bytes}; - -use std::{ - collections::{HashMap, HashSet}, - fmt::Debug, - path::Path, -}; - -use futures::future::join_all; -use tokio::fs; -use tracing::{error, trace}; -use uuid::Uuid; - -pub mod old_file_identifier_job; -mod shallow; - -pub use shallow::*; - -// we break these jobs into chunks of 100 to improve performance -const CHUNK_SIZE: usize = 100; - -#[derive(thiserror::Error, Debug)] -pub enum FileIdentifierJobError { - #[error("received sub path not in database: ", .0.display())] - SubPathNotFound(Box), - - // Internal Errors - 
#[error(transparent)] - FilePathError(#[from] FilePathError), - #[error("database error: {0}")] - Database(#[from] prisma_client_rust::QueryError), -} - -#[derive(Debug, Clone)] -pub struct FileMetadata { - pub cas_id: Option, - pub kind: ObjectKind, - pub fs_metadata: std::fs::Metadata, -} - -impl FileMetadata { - /// Assembles `create_unchecked` params for a given file path - pub async fn new( - location_path: impl AsRef, - iso_file_path: &IsolatedFilePathData<'_>, // TODO: use dedicated CreateUnchecked type - ) -> Result { - let path = location_path.as_ref().join(iso_file_path); - - let fs_metadata = fs::metadata(&path) - .await - .map_err(|e| FileIOError::from((&path, e)))?; - - assert!( - !fs_metadata.is_dir(), - "We can't generate cas_id for directories" - ); - - // derive Object kind - let kind = Extension::resolve_conflicting(&path, false) - .await - .map(Into::into) - .unwrap_or(ObjectKind::Unknown); - - let cas_id = if fs_metadata.len() != 0 { - generate_cas_id(&path, fs_metadata.len()) - .await - .map(Some) - .map_err(|e| FileIOError::from((&path, e)))? - } else { - // We can't do shit with empty files - None - }; - - trace!("Analyzed file: {path:?} {cas_id:?} {kind:?}"); - - Ok(FileMetadata { - cas_id, - kind, - fs_metadata, - }) - } -} - -async fn identifier_job_step( - Library { db, sync, .. }: &Library, - location: &location::Data, - file_paths: &[file_path_for_file_identifier::Data], -) -> Result<(usize, usize), JobError> { - let location_path = maybe_missing(&location.path, "location.path").map(Path::new)?; - - let file_paths_metadatas = join_all( - file_paths - .iter() - .filter_map(|file_path| { - IsolatedFilePathData::try_from((location.id, file_path)) - .map(|iso_file_path| (iso_file_path, file_path)) - .map_err(|e| error!("Failed to extract isolated file path data: {e:#?}")) - .ok() - }) - .map(|(iso_file_path, file_path)| async move { - FileMetadata::new(&location_path, &iso_file_path) - .await - .map(|metadata| { - ( - // SAFETY: This should never happen - Uuid::from_slice(&file_path.pub_id) - .expect("file_path.pub_id is invalid!"), - (metadata, file_path), - ) - }) - .map_err(|e| { - #[cfg(target_os = "windows")] - { - // Handle case where file is on-demand (NTFS only) - if e.source.raw_os_error().map_or(false, |code| code == 362) { - error!("Failed to extract metadata from on-demand file: {e:#?}"); - } else { - error!("Failed to extract file metadata: {e:#?}") - } - } - - #[cfg(not(target_os = "windows"))] - { - error!("Failed to extract file metadata: {e:#?}"); - } - }) - .ok() - }), - ) - .await - .into_iter() - .flatten() - .collect::>(); - - let unique_cas_ids = file_paths_metadatas - .values() - .filter_map(|(metadata, _)| metadata.cas_id.clone()) - .collect::>() - .into_iter() - .collect(); - - // Assign cas_id to each file path - sync.write_ops( - db, - file_paths_metadatas - .iter() - .map(|(pub_id, (metadata, _))| { - ( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: sd_utils::uuid_to_bytes(*pub_id), - }, - file_path::cas_id::NAME, - msgpack!(&metadata.cas_id), - ), - db.file_path().update( - file_path::pub_id::equals(sd_utils::uuid_to_bytes(*pub_id)), - vec![file_path::cas_id::set(metadata.cas_id.clone())], - ), - ) - }) - .unzip::<_, _, _, Vec<_>>(), - ) - .await?; - - // Retrieves objects that are already connected to file paths with the same id - let existing_objects = db - .object() - .find_many(vec![object::file_paths::some(vec![ - file_path::cas_id::in_vec(unique_cas_ids), - ])]) - .select(object_for_file_identifier::select()) - 
.exec() - .await?; - - let existing_object_cas_ids = existing_objects - .iter() - .flat_map(|object| { - object - .file_paths - .iter() - .filter_map(|file_path| file_path.cas_id.as_ref()) - }) - .collect::>(); - - // Attempt to associate each file path with an object that has been - // connected to file paths with the same cas_id - let updated_file_paths = sync - .write_ops( - db, - file_paths_metadatas - .iter() - .filter_map(|(pub_id, (metadata, file_path))| { - // Filtering out files without cas_id due to being empty - metadata - .cas_id - .is_some() - .then_some((pub_id, (metadata, file_path))) - }) - .flat_map(|(pub_id, (metadata, _))| { - existing_objects - .iter() - .find(|object| { - object - .file_paths - .iter() - .any(|file_path| file_path.cas_id == metadata.cas_id) - }) - .map(|object| (*pub_id, object)) - }) - .map(|(pub_id, object)| { - let (crdt_op, db_op) = connect_file_path_to_object( - pub_id, - // SAFETY: This pub_id is generated by the uuid lib, but we have to store bytes in sqlite - Uuid::from_slice(&object.pub_id).expect("uuid bytes are invalid"), - sync, - db, - ); - - (crdt_op, db_op.select(file_path::select!({ pub_id }))) - }) - .unzip::<_, _, Vec<_>, Vec<_>>(), - ) - .await?; - - trace!( - "Found {} existing Objects in Library, linking file paths...", - existing_objects.len() - ); - - // extract objects that don't already exist in the database - let file_paths_requiring_new_object = file_paths_metadatas - .into_iter() - .filter(|(_, (FileMetadata { cas_id, .. }, _))| { - cas_id - .as_ref() - .map(|cas_id| !existing_object_cas_ids.contains(cas_id)) - .unwrap_or(true) - }) - .collect::>(); - - let total_created = if !file_paths_requiring_new_object.is_empty() { - trace!( - "Creating {} new Objects in Library", - file_paths_requiring_new_object.len(), - ); - - let (object_create_args, file_path_update_args): (Vec<_>, Vec<_>) = - file_paths_requiring_new_object - .iter() - .map( - |( - file_path_pub_id, - ( - FileMetadata { kind, .. }, - file_path_for_file_identifier::Data { date_created, .. 
}, - ), - )| { - let object_pub_id = Uuid::new_v4(); - let sync_id = || prisma_sync::object::SyncId { - pub_id: sd_utils::uuid_to_bytes(object_pub_id), - }; - - let kind = *kind as i32; - - let (sync_params, db_params): (Vec<_>, Vec<_>) = [ - ( - (object::date_created::NAME, msgpack!(date_created)), - object::date_created::set(*date_created), - ), - ( - (object::kind::NAME, msgpack!(kind)), - object::kind::set(Some(kind)), - ), - ] - .into_iter() - .unzip(); - - ( - ( - sync.shared_create(sync_id(), sync_params), - object::create_unchecked(uuid_to_bytes(object_pub_id), db_params), - ), - { - let (crdt_op, db_op) = connect_file_path_to_object( - *file_path_pub_id, - object_pub_id, - sync, - db, - ); - - (crdt_op, db_op.select(file_path::select!({ pub_id }))) - }, - ) - }, - ) - .unzip(); - - // create new object records with assembled values - let total_created_files = sync - .write_ops(db, { - let (sync, db_params): (Vec<_>, Vec<_>) = object_create_args.into_iter().unzip(); - - ( - sync.into_iter().flatten().collect(), - db.object().create_many(db_params), - ) - }) - .await - .unwrap_or_else(|e| { - error!("Error inserting files: {:#?}", e); - 0 - }); - - trace!("Created {} new Objects in Library", total_created_files); - - if total_created_files > 0 { - trace!("Updating file paths with created objects"); - - sync.write_ops(db, { - let data: (Vec<_>, Vec<_>) = file_path_update_args.into_iter().unzip(); - - data - }) - .await?; - - trace!("Updated file paths with created objects"); - } - - total_created_files as usize - } else { - 0 - }; - - Ok((total_created, updated_file_paths.len())) -} - -fn connect_file_path_to_object<'db>( - file_path_id: Uuid, - object_id: Uuid, - sync: &crate::sync::Manager, - db: &'db PrismaClient, -) -> (CRDTOperation, file_path::UpdateQuery<'db>) { - #[cfg(debug_assertions)] - trace!("Connecting to "); - - let vec_id = object_id.as_bytes().to_vec(); - - ( - sync.shared_update( - prisma_sync::file_path::SyncId { - pub_id: sd_utils::uuid_to_bytes(file_path_id), - }, - file_path::object::NAME, - msgpack!(prisma_sync::object::SyncId { - pub_id: vec_id.clone() - }), - ), - db.file_path().update( - file_path::pub_id::equals(sd_utils::uuid_to_bytes(file_path_id)), - vec![file_path::object::connect(object::pub_id::equals(vec_id))], - ), - ) -} - -async fn process_identifier_file_paths( - location: &location::Data, - file_paths: &[file_path_for_file_identifier::Data], - step_number: usize, - cursor: file_path::id::Type, - library: &Library, - orphan_count: usize, -) -> Result<(usize, usize, file_path::id::Type), JobError> { - trace!( - "Processing {:?} orphan Paths. 
({} completed of {})", - file_paths.len(), - step_number, - orphan_count - ); - - let (total_objects_created, total_objects_linked) = - identifier_job_step(library, location, file_paths).await?; - - Ok(( - total_objects_created, - total_objects_linked, - // returns a new cursor to the last row of this chunk or the current one - file_paths - .last() - .map(|last_row| last_row.id) - .unwrap_or(cursor), - )) -} diff --git a/core/src/object/old_file_identifier/old_file_identifier_job.rs b/core/src/object/old_file_identifier/old_file_identifier_job.rs deleted file mode 100644 index 69494b3fd..000000000 --- a/core/src/object/old_file_identifier/old_file_identifier_job.rs +++ /dev/null @@ -1,339 +0,0 @@ -use crate::{ - api::CoreEvent, - library::Library, - location::ScanState, - old_job::{ - CurrentStep, JobError, JobInitOutput, JobReportUpdate, JobResult, JobRunMetadata, - JobStepOutput, StatefulJob, WorkerContext, - }, -}; - -use sd_core_file_path_helper::{ - ensure_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_prisma_helpers::file_path_for_file_identifier; - -use sd_prisma::prisma::{file_path, location, PrismaClient, SortOrder}; -use sd_utils::db::maybe_missing; - -use std::{ - hash::{Hash, Hasher}, - path::{Path, PathBuf}, -}; - -use prisma_client_rust::or; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use tracing::{debug, info, trace}; - -use super::{process_identifier_file_paths, FileIdentifierJobError, CHUNK_SIZE}; - -/// `FileIdentifierJobInit` takes file_paths without an object_id from a location -/// or starting from a `sub_path` getting every descendent from this `sub_path` -/// and uniquely identifies them: -/// - first: generating the cas_id and extracting metadata -/// - finally: creating unique object records, and linking them to their file_paths -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct OldFileIdentifierJobInit { - pub location: location::Data, - pub sub_path: Option, // subpath to start from -} - -impl Hash for OldFileIdentifierJobInit { - fn hash(&self, state: &mut H) { - self.location.id.hash(state); - if let Some(ref sub_path) = self.sub_path { - sub_path.hash(state); - } - } -} - -#[derive(Serialize, Deserialize, Debug)] -pub struct OldFileIdentifierJobData { - location_path: PathBuf, - maybe_sub_iso_file_path: Option>, -} - -#[derive(Serialize, Deserialize, Default, Debug)] -pub struct OldFileIdentifierJobRunMetadata { - cursor: file_path::id::Type, - total_orphan_paths: usize, - total_objects_created: usize, - total_objects_linked: usize, - total_objects_ignored: usize, -} - -impl JobRunMetadata for OldFileIdentifierJobRunMetadata { - fn update(&mut self, new_data: Self) { - self.total_orphan_paths += new_data.total_orphan_paths; - self.total_objects_created += new_data.total_objects_created; - self.total_objects_linked += new_data.total_objects_linked; - self.total_objects_ignored += new_data.total_objects_ignored; - self.cursor = new_data.cursor; - } -} - -#[async_trait::async_trait] -impl StatefulJob for OldFileIdentifierJobInit { - type Data = OldFileIdentifierJobData; - type Step = (); - type RunMetadata = OldFileIdentifierJobRunMetadata; - - const NAME: &'static str = "file_identifier"; - const IS_BATCHED: bool = true; - - fn target_location(&self) -> location::id::Type { - self.location.id - } - - async fn init( - &self, - ctx: &WorkerContext, - data: &mut Option, - ) -> Result, JobError> { - let init = self; - let Library { db, .. 
} = &*ctx.library; - - debug!("Identifying orphan File Paths..."); - - let location_id = init.location.id; - - let location_path = maybe_missing(&init.location.path, "location.path").map(Path::new)?; - - let maybe_sub_iso_file_path = match &init.sub_path { - Some(sub_path) if sub_path != Path::new("") => { - let full_path = ensure_sub_path_is_in_location(location_path, sub_path) - .await - .map_err(FileIdentifierJobError::from)?; - ensure_sub_path_is_directory(location_path, sub_path) - .await - .map_err(FileIdentifierJobError::from)?; - - let sub_iso_file_path = - IsolatedFilePathData::new(location_id, location_path, &full_path, true) - .map_err(FileIdentifierJobError::from)?; - - ensure_file_path_exists( - sub_path, - &sub_iso_file_path, - db, - FileIdentifierJobError::SubPathNotFound, - ) - .await?; - - Some(sub_iso_file_path) - } - _ => None, - }; - - let orphan_count = - count_orphan_file_paths(db, location_id, &maybe_sub_iso_file_path).await?; - - // Initializing `state.data` here because we need a complete state in case of early finish - *data = Some(OldFileIdentifierJobData { - location_path: location_path.to_path_buf(), - maybe_sub_iso_file_path, - }); - - let data = data.as_ref().expect("we just set it"); - - if orphan_count == 0 { - return Err(JobError::EarlyFinish { - name: ::NAME.to_string(), - reason: "Found no orphan file paths to process".to_string(), - }); - } - - debug!("Found {} orphan file paths", orphan_count); - - let task_count = (orphan_count as f64 / CHUNK_SIZE as f64).ceil() as usize; - debug!( - "Found {} orphan Paths. Will execute {} tasks...", - orphan_count, task_count - ); - - let first_path = db - .file_path() - .find_first(orphan_path_filters( - location_id, - None, - &data.maybe_sub_iso_file_path, - )) - .select(file_path::select!({ id })) - .exec() - .await? - .expect("We already validated before that there are orphans `file_path`s"); - - ctx.progress(vec![ - JobReportUpdate::TaskCount(orphan_count), - JobReportUpdate::Message(format!("Found {orphan_count} files to be identified")), - ]); - - Ok(( - OldFileIdentifierJobRunMetadata { - total_orphan_paths: orphan_count, - cursor: first_path.id, - ..Default::default() - }, - vec![(); task_count], - ) - .into()) - } - - async fn execute_step( - &self, - ctx: &WorkerContext, - CurrentStep { step_number, .. 
}: CurrentStep<'_, Self::Step>, - data: &Self::Data, - run_metadata: &Self::RunMetadata, - ) -> Result, JobError> { - let init = self; - let location = &init.location; - - let mut new_metadata = Self::RunMetadata::default(); - - // get chunk of orphans to process - let file_paths = get_orphan_file_paths( - &ctx.library.db, - location.id, - run_metadata.cursor, - &data.maybe_sub_iso_file_path, - ) - .await?; - - // if no file paths found, abort entire job early, there is nothing to do - // if we hit this error, there is something wrong with the data/query - if file_paths.is_empty() { - return Err(JobError::EarlyFinish { - name: ::NAME.to_string(), - reason: "Expected orphan Paths not returned from database query for this chunk" - .to_string(), - }); - } - - let (total_objects_created, total_objects_linked, new_cursor) = - process_identifier_file_paths( - location, - &file_paths, - step_number, - run_metadata.cursor, - &ctx.library, - run_metadata.total_orphan_paths, - ) - .await?; - - new_metadata.total_objects_created = total_objects_created; - new_metadata.total_objects_linked = total_objects_linked; - new_metadata.cursor = new_cursor; - - // send an array of ids to let clients know new objects were identified - ctx.node.emit(CoreEvent::NewIdentifiedObjects { - file_path_ids: file_paths.iter().map(|fp| fp.id).collect(), - }); - - ctx.progress(vec![ - JobReportUpdate::CompletedTaskCount(step_number * CHUNK_SIZE + file_paths.len()), - JobReportUpdate::Message(format!( - "Processed {} of {} orphan Paths", - step_number * CHUNK_SIZE, - run_metadata.total_orphan_paths - )), - ]); - - Ok(new_metadata.into()) - } - - async fn finalize( - &self, - ctx: &WorkerContext, - _data: &Option, - run_metadata: &Self::RunMetadata, - ) -> JobResult { - let init = self; - info!("Finalizing identifier job: {:?}", &run_metadata); - - ctx.library - .db - .location() - .update( - location::id::equals(init.location.id), - vec![location::scan_state::set(ScanState::FilesIdentified as i32)], - ) - .exec() - .await - .map_err(FileIdentifierJobError::from)?; - - Ok(Some(json!({"init: ": init, "run_metadata": run_metadata}))) - } -} - -fn orphan_path_filters( - location_id: location::id::Type, - file_path_id: Option, - maybe_sub_iso_file_path: &Option>, -) -> Vec { - sd_utils::chain_optional_iter( - [ - or!( - file_path::object_id::equals(None), - file_path::cas_id::equals(None) - ), - file_path::is_dir::equals(Some(false)), - file_path::location_id::equals(Some(location_id)), - file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())), - ], - [ - // this is a workaround for the cursor not working properly - file_path_id.map(file_path::id::gte), - maybe_sub_iso_file_path.as_ref().map(|sub_iso_file_path| { - file_path::materialized_path::starts_with( - sub_iso_file_path - .materialized_path_for_children() - .expect("sub path iso_file_path must be a directory"), - ) - }), - ], - ) -} - -async fn count_orphan_file_paths( - db: &PrismaClient, - location_id: location::id::Type, - maybe_sub_materialized_path: &Option>, -) -> Result { - db.file_path() - .count(orphan_path_filters( - location_id, - None, - maybe_sub_materialized_path, - )) - .exec() - .await - .map(|c| c as usize) -} - -async fn get_orphan_file_paths( - db: &PrismaClient, - location_id: location::id::Type, - file_path_id: file_path::id::Type, - maybe_sub_materialized_path: &Option>, -) -> Result, prisma_client_rust::QueryError> { - trace!( - "Querying {} orphan Paths at cursor: {:?}", - CHUNK_SIZE, - file_path_id - ); - db.file_path() - 
.find_many(orphan_path_filters( - location_id, - Some(file_path_id), - maybe_sub_materialized_path, - )) - .order_by(file_path::id::order(SortOrder::Asc)) - .take(CHUNK_SIZE as i64) - // .skip(1) - .select(file_path_for_file_identifier::select()) - .exec() - .await -} diff --git a/core/src/object/old_file_identifier/shallow.rs b/core/src/object/old_file_identifier/shallow.rs deleted file mode 100644 index 04355be15..000000000 --- a/core/src/object/old_file_identifier/shallow.rs +++ /dev/null @@ -1,182 +0,0 @@ -use crate::{invalidate_query, library::Library, old_job::JobError}; - -use sd_core_file_path_helper::{ - ensure_file_path_exists, ensure_sub_path_is_directory, ensure_sub_path_is_in_location, - IsolatedFilePathData, -}; -use sd_core_prisma_helpers::file_path_for_file_identifier; - -use sd_prisma::prisma::{file_path, location, PrismaClient, SortOrder}; -use sd_utils::db::maybe_missing; - -use std::path::{Path, PathBuf}; - -use prisma_client_rust::or; -use serde::{Deserialize, Serialize}; -use tracing::{trace, warn}; - -use super::{process_identifier_file_paths, FileIdentifierJobError, CHUNK_SIZE}; - -#[derive(Serialize, Deserialize)] -pub struct ShallowFileIdentifierJobState { - cursor: file_path::id::Type, - sub_iso_file_path: IsolatedFilePathData<'static>, -} - -pub async fn old_shallow( - location: &location::Data, - sub_path: &PathBuf, - library: &Library, -) -> Result<(), JobError> { - let Library { db, .. } = library; - - warn!("Identifying orphan File Paths..."); - - let location_id = location.id; - let location_path = maybe_missing(&location.path, "location.path").map(Path::new)?; - - let sub_iso_file_path = if sub_path != Path::new("") { - let full_path = ensure_sub_path_is_in_location(location_path, &sub_path) - .await - .map_err(FileIdentifierJobError::from)?; - ensure_sub_path_is_directory(location_path, &sub_path) - .await - .map_err(FileIdentifierJobError::from)?; - - let sub_iso_file_path = - IsolatedFilePathData::new(location_id, location_path, &full_path, true) - .map_err(FileIdentifierJobError::from)?; - - ensure_file_path_exists( - &sub_path, - &sub_iso_file_path, - db, - FileIdentifierJobError::SubPathNotFound, - ) - .await?; - - sub_iso_file_path - } else { - IsolatedFilePathData::new(location_id, location_path, location_path, true) - .map_err(FileIdentifierJobError::from)? - }; - - let orphan_count = count_orphan_file_paths(db, location_id, &sub_iso_file_path).await?; - - if orphan_count == 0 { - return Ok(()); - } - - let task_count = (orphan_count as f64 / CHUNK_SIZE as f64).ceil() as usize; - warn!( - "Found {} orphan Paths. Will execute {} tasks...", - orphan_count, task_count - ); - - let Some(first_path) = db - .file_path() - .find_first(orphan_path_filters(location_id, None, &sub_iso_file_path)) - // .order_by(file_path::id::order(Direction::Asc)) - .select(file_path::select!({ id })) - .exec() - .await? 
- else { - warn!("No orphan Paths found due to another Job finishing first"); - return Ok(()); - }; - - // Initializing `state.data` here because we need a complete state in case of early finish - let mut data = ShallowFileIdentifierJobState { - cursor: first_path.id, - sub_iso_file_path, - }; - - for step_number in 0..task_count { - let ShallowFileIdentifierJobState { - cursor, - sub_iso_file_path, - } = &mut data; - - // get chunk of orphans to process - let file_paths = - get_orphan_file_paths(&library.db, location.id, *cursor, sub_iso_file_path).await?; - - let (_, _, new_cursor) = process_identifier_file_paths( - location, - &file_paths, - step_number, - *cursor, - library, - orphan_count, - ) - .await?; - *cursor = new_cursor; - } - - invalidate_query!(library, "search.paths"); - invalidate_query!(library, "search.objects"); - - Ok(()) -} - -fn orphan_path_filters( - location_id: location::id::Type, - file_path_id: Option, - sub_iso_file_path: &IsolatedFilePathData<'_>, -) -> Vec { - sd_utils::chain_optional_iter( - [ - or!( - file_path::object_id::equals(None), - file_path::cas_id::equals(None) - ), - file_path::is_dir::equals(Some(false)), - file_path::location_id::equals(Some(location_id)), - file_path::materialized_path::equals(Some( - sub_iso_file_path - .materialized_path_for_children() - .expect("sub path for shallow identifier must be a directory"), - )), - file_path::size_in_bytes_bytes::not(Some(0u64.to_be_bytes().to_vec())), - ], - [file_path_id.map(file_path::id::gte)], - ) -} - -async fn count_orphan_file_paths( - db: &PrismaClient, - location_id: location::id::Type, - sub_iso_file_path: &IsolatedFilePathData<'_>, -) -> Result { - db.file_path() - .count(orphan_path_filters(location_id, None, sub_iso_file_path)) - .exec() - .await - .map(|c| c as usize) -} - -async fn get_orphan_file_paths( - db: &PrismaClient, - location_id: location::id::Type, - file_path_id_cursor: file_path::id::Type, - sub_iso_file_path: &IsolatedFilePathData<'_>, -) -> Result, prisma_client_rust::QueryError> { - trace!( - "Querying {} orphan Paths at cursor: {:?}", - CHUNK_SIZE, - file_path_id_cursor - ); - db.file_path() - .find_many(orphan_path_filters( - location_id, - Some(file_path_id_cursor), - sub_iso_file_path, - )) - .order_by(file_path::id::order(SortOrder::Asc)) - // .cursor(cursor.into()) - .take(CHUNK_SIZE as i64) - // .skip(1) - .select(file_path_for_file_identifier::select()) - .exec() - .await -} diff --git a/core/src/object/old_orphan_remover.rs b/core/src/object/old_orphan_remover.rs index 35b316842..25acf9852 100644 --- a/core/src/object/old_orphan_remover.rs +++ b/core/src/object/old_orphan_remover.rs @@ -9,6 +9,8 @@ use tokio::{ }; use tracing::{error, trace}; +// TODO(fogodev): To be rewritten using new task system + const TEN_SECONDS: Duration = Duration::from_secs(10); const ONE_MINUTE: Duration = Duration::from_secs(60); @@ -73,7 +75,7 @@ impl OrphanRemoverActor { .map(|object| object.id) .collect::>() }) - .map_err(|e| error!("Failed to fetch orphaned objects: {e:#?}")) + .map_err(|e| error!(?e, "Failed to fetch orphaned objects;")) else { break; }; @@ -82,7 +84,10 @@ impl OrphanRemoverActor { break; } - trace!("Removing {} orphaned objects", objects_ids.len()); + trace!( + orphans_count = objects_ids.len(), + "Removing orphaned objects;" + ); if let Err(e) = db ._batch(( @@ -93,7 +98,7 @@ impl OrphanRemoverActor { )) .await { - error!("Failed to remove orphaned objects: {e:#?}"); + error!(?e, "Failed to remove orphaned objects;"); break; } } diff --git 
a/core/src/object/tag/mod.rs b/core/src/object/tag/mod.rs index 594ddd1cd..82b82c3d7 100644 --- a/core/src/object/tag/mod.rs +++ b/core/src/object/tag/mod.rs @@ -4,7 +4,6 @@ use sd_prisma::{prisma::tag, prisma_sync}; use sd_sync::*; use chrono::{DateTime, FixedOffset, Utc}; - use serde::Deserialize; use specta::Type; use uuid::Uuid; diff --git a/core/src/object/validation/old_validator_job.rs b/core/src/object/validation/old_validator_job.rs index 4cbafcfab..d90fc56cb 100644 --- a/core/src/object/validation/old_validator_job.rs +++ b/core/src/object/validation/old_validator_job.rs @@ -189,13 +189,10 @@ impl StatefulJob for OldObjectValidatorJobInit { .expect("critical error: missing data on job state"); info!( - "finalizing validator job at {}{}: {} tasks", - data.location_path.display(), - init.sub_path - .as_ref() - .map(|p| format!("{}", p.display())) - .unwrap_or_default(), - data.task_count + location_path = %data.location_path.display(), + sub_path = ?init.sub_path.as_ref().map(|p| p.display()), + task_count = data.task_count, + "finalizing validator job;", ); Ok(Some(json!({ "init": init }))) diff --git a/core/src/old_job/error.rs b/core/src/old_job/error.rs index 5cc4f9604..e5315916c 100644 --- a/core/src/old_job/error.rs +++ b/core/src/old_job/error.rs @@ -1,8 +1,8 @@ use crate::{ - location::{indexer::IndexerError, LocationError}, + location::{/*indexer::IndexerError,*/ LocationError}, object::{ - fs::error::FileSystemJobsError, media::old_media_processor::MediaProcessorError, - old_file_identifier::FileIdentifierJobError, validation::ValidatorError, + fs::error::FileSystemJobsError, /*media::old_media_processor::MediaProcessorError,*/ + /*old_file_identifier::FileIdentifierJobError,*/ validation::ValidatorError, }, }; @@ -57,12 +57,6 @@ pub enum JobError { // Specific job errors #[error(transparent)] - Indexer(#[from] IndexerError), - #[error(transparent)] - MediaProcessor(#[from] MediaProcessorError), - #[error(transparent)] - FileIdentifier(#[from] FileIdentifierJobError), - #[error(transparent)] Validator(#[from] ValidatorError), #[error(transparent)] FileSystemJobsError(#[from] FileSystemJobsError), diff --git a/core/src/old_job/manager.rs b/core/src/old_job/manager.rs index 9d373c05c..2770eb7e5 100644 --- a/core/src/old_job/manager.rs +++ b/core/src/old_job/manager.rs @@ -1,16 +1,13 @@ use crate::{ library::Library, - location::indexer::old_indexer_job::OldIndexerJobInit, object::{ fs::{ old_copy::OldFileCopierJobInit, old_cut::OldFileCutterJobInit, old_delete::OldFileDeleterJobInit, old_erase::OldFileEraserJobInit, }, - media::old_media_processor::OldMediaProcessorJobInit, - old_file_identifier::old_file_identifier_job::OldFileIdentifierJobInit, validation::old_validator_job::OldObjectValidatorJobInit, }, - old_job::{worker::Worker, DynJob, Job, JobError}, + old_job::{worker::Worker, DynJob, JobError, OldJob}, Node, }; @@ -24,10 +21,10 @@ use std::{ use futures::future::join_all; use prisma_client_rust::operator::or; use tokio::sync::{mpsc, oneshot, RwLock}; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, instrument, warn}; use uuid::Uuid; -use super::{JobIdentity, JobManagerError, JobReport, JobStatus, StatefulJob}; +use super::{JobIdentity, JobManagerError, JobStatus, OldJobReport, StatefulJob}; const MAX_WORKERS: usize = 5; @@ -66,9 +63,8 @@ impl Actor { } } -/// JobManager handles queueing and executing jobs using the `DynJob` -/// Handling persisting JobReports to the database, pause/resuming, and -/// +/// JobManager handles queueing and 
executing jobs using the [`DynJob`] +/// Handling persisting JobReports to the database, pause/resuming pub struct OldJobs { current_jobs_hashes: RwLock>, job_queue: RwLock>>, @@ -97,12 +93,17 @@ impl OldJobs { ) } + #[instrument( + skip_all, + fields(library_id = %library.id, job_name = %job.name(), job_hash = %job.hash()), + err, + )] /// Ingests a new job and dispatches it if possible, queues it otherwise. pub async fn ingest( self: Arc, node: &Arc, library: &Arc, - job: Box>, + job: Box>, ) -> Result<(), JobManagerError> { let job_hash = job.hash(); @@ -113,17 +114,17 @@ impl OldJobs { }); } - debug!( - "Ingesting job: ", - job.name(), - job_hash - ); + debug!("Ingesting job;"); self.current_jobs_hashes.write().await.insert(job_hash); self.dispatch(node, library, job).await; Ok(()) } + #[instrument( + skip_all, + fields(library_id = %library.id, job_name = %job.name(), job_hash = %job.hash()), + )] /// Dispatches a job to a worker if under MAX_WORKERS limit, queues it otherwise. async fn dispatch( self: Arc, @@ -138,7 +139,7 @@ impl OldJobs { .expect("critical error: missing job on worker"); if running_workers.len() < MAX_WORKERS { - info!("Running job: {:?}", job.name()); + info!("Running job"); let worker_id = job_report.parent_id.unwrap_or(job_report.id); @@ -153,21 +154,17 @@ impl OldJobs { .await .map_or_else( |e| { - error!("Error spawning worker: {:#?}", e); + error!(?e, "Error spawning worker;"); }, |worker| { running_workers.insert(worker_id, worker); }, ); } else { - debug!( - "Queueing job: ", - job.name(), - job.hash() - ); + debug!("Queueing job"); if let Err(e) = job_report.create(library).await { // It's alright to just log here, as will try to create the report on run if it wasn't created before - error!("Error creating job report: {:#?}", e); + error!(?e, "Error creating job report;"); } // Put the report back, or it will be lost forever @@ -218,11 +215,12 @@ impl OldJobs { }); } + #[instrument(skip(self))] /// Pause a specific job. pub async fn pause(&self, job_id: Uuid) -> Result<(), JobManagerError> { // Look up the worker for the given job ID. if let Some(worker) = self.running_workers.read().await.get(&job_id) { - debug!("Pausing job: {:#?}", worker.report()); + debug!(report = ?worker.report(), "Pausing job;"); // Set the pause signal in the worker. worker.pause().await; @@ -236,7 +234,7 @@ impl OldJobs { pub async fn resume(&self, job_id: Uuid) -> Result<(), JobManagerError> { // Look up the worker for the given job ID. if let Some(worker) = self.running_workers.read().await.get(&job_id) { - debug!("Resuming job: {:?}", worker.report()); + debug!(report = ?worker.report(), "Resuming job;"); // Set the pause signal in the worker. worker.resume().await; @@ -251,7 +249,7 @@ impl OldJobs { pub async fn cancel(&self, job_id: Uuid) -> Result<(), JobManagerError> { // Look up the worker for the given job ID. if let Some(worker) = self.running_workers.read().await.get(&job_id) { - debug!("Canceling job: {:#?}", worker.report()); + debug!(report = ?worker.report(), "Canceling job;"); // Set the cancel signal in the worker. worker.cancel().await; @@ -285,24 +283,36 @@ impl OldJobs { .exec() .await? 
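
Most of the logging changes in the manager hunks above (and the worker hunks further down) follow the same pattern: interpolated format strings become structured `tracing` fields, and functions gain an `#[instrument]` span. A minimal, standalone sketch of that style (the function, its arguments, and the file path are made up for illustration; only the `tracing` API usage mirrors the patch):

```rust
use tracing::{debug, error, instrument};
use uuid::Uuid;

// `skip_all` keeps the raw arguments out of the span, while `fields(...)` re-adds just the
// values worth indexing; `%` records via Display, `?` records via Debug, and `err`
// automatically logs the error when the function returns `Err`.
#[instrument(skip_all, fields(library_id = %library_id, job_name = %job_name), err)]
async fn ingest_job(library_id: Uuid, job_name: &str) -> Result<(), std::io::Error> {
    // The span fields make both ids available on every event below without repeating them.
    debug!("Ingesting job;");

    if let Err(e) = tokio::fs::metadata("job_report.db").await {
        error!(?e, "Error creating job report;"); // `?e` attaches the error as a Debug field
        return Err(e);
    }

    Ok(())
}
```

The `err` directive is also why a few instrumented functions in this patch carry `#[allow(clippy::blocks_in_conditions)]`: the macro expansion wraps the body in a way that otherwise trips that lint.
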
.into_iter() - .map(JobReport::try_from); + .map(OldJobReport::try_from); for job in all_jobs { let job = job?; match initialize_resumable_job(job.clone(), None) { Ok(resumable_job) => { - info!("Resuming job: {} with uuid {}", job.name, job.id); + info!(%job.name, %job.id, "Resuming job;"); Arc::clone(&self) .dispatch(node, library, resumable_job) .await; } - Err(err) => { + Err(JobError::UnknownJobName(_, job_name)) + if matches!( + job_name.as_str(), + "indexer" | "file_identifier" | "media_processor" + ) => + { + debug!(%job_name, "Moved to new job system"); + } + Err(e) => { warn!( - "Failed to initialize job: {} with uuid {}, error: {:?}", - job.name, job.id, err + %job.name, + %job.id, + ?e, + "Failed to initialize job;", ); - info!("Cancelling job: {} with uuid {}", job.name, job.id); + + info!(%job.name, %job.id, "Cancelling job;"); + library .db .job() @@ -319,7 +329,7 @@ impl OldJobs { } // get all active jobs, including paused jobs organized by job id - pub async fn get_active_reports_with_id(&self) -> HashMap { + pub async fn get_active_reports_with_id(&self) -> HashMap { self.running_workers .read() .await @@ -332,7 +342,7 @@ impl OldJobs { } // get all running jobs, excluding paused jobs organized by action - pub async fn get_running_reports(&self) -> HashMap { + pub async fn get_running_reports(&self) -> HashMap { self.running_workers .read() .await @@ -382,23 +392,21 @@ mod macros { } /// This function is used to initialize a DynJob from a job report. fn initialize_resumable_job( - job_report: JobReport, + job_report: OldJobReport, next_jobs: Option>>, ) -> Result, JobError> { dispatch_call_to_job_by_name!( job_report.name.as_str(), - T -> Job::::new_from_report(job_report, next_jobs), + T -> OldJob::::new_from_report(job_report, next_jobs), default = { error!( - "Unknown job type: {}, id: {}", - job_report.name, job_report.id + %job_report.name, + %job_report.id, + "Unknown job type;", ); Err(JobError::UnknownJobName(job_report.id, job_report.name)) }, jobs = [ - OldMediaProcessorJobInit, - OldIndexerJobInit, - OldFileIdentifierJobInit, OldObjectValidatorJobInit, OldFileCutterJobInit, OldFileCopierJobInit, diff --git a/core/src/old_job/mod.rs b/core/src/old_job/mod.rs index ab22d1672..42190645d 100644 --- a/core/src/old_job/mod.rs +++ b/core/src/old_job/mod.rs @@ -20,7 +20,7 @@ use tokio::{ spawn, task::{JoinError, JoinHandle}, }; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use uuid::Uuid; mod error; @@ -133,8 +133,8 @@ pub trait StatefulJob: pub trait DynJob: Send + Sync { fn id(&self) -> Uuid; fn parent_id(&self) -> Option; - fn report(&self) -> &Option; - fn report_mut(&mut self) -> &mut Option; + fn report(&self) -> &Option; + fn report_mut(&mut self) -> &mut Option; fn name(&self) -> &'static str; async fn run( &mut self, @@ -149,20 +149,23 @@ pub trait DynJob: Send + Sync { async fn cancel_children(&mut self, library: &Library) -> Result<(), JobError>; } -pub struct JobBuilder { +pub struct OldJob { id: Uuid, - init: SJob, - report_builder: JobReportBuilder, + hash: u64, + report: Option, + state: Option>, + next_jobs: VecDeque>, } -impl JobBuilder { - pub fn build(self) -> Box> { - Box::new(Job:: { - id: self.id, - hash: ::hash(&self.init), - report: Some(self.report_builder.build()), +impl OldJob { + pub fn new(init: SJob) -> Box { + let id = Uuid::new_v4(); + Box::new(OldJob:: { + id, + hash: ::hash(&init), + report: Some(JobReportBuilder::new(id, SJob::NAME.to_string()).build()), state: 
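
The resume path above now distinguishes "this job type moved to the new job system" from genuine failures by putting a guard on the `UnknownJobName` error variant. A simplified, self-contained sketch of that match shape (the enum and labels here are stand-ins, not the real `JobError`):

```rust
#[derive(Debug)]
enum ResumeError {
    UnknownJobName(String),
    Other(String),
}

fn classify(result: Result<(), ResumeError>) -> &'static str {
    match result {
        Ok(()) => "resumed",
        // Guarded arm: legacy names that now belong to the new job system are skipped quietly.
        Err(ResumeError::UnknownJobName(name))
            if matches!(
                name.as_str(),
                "indexer" | "file_identifier" | "media_processor"
            ) =>
        {
            "moved to new job system"
        }
        // Anything else is still treated as a failed resume and gets cancelled.
        Err(_) => "cancel and report",
    }
}

fn main() {
    assert_eq!(
        classify(Err(ResumeError::UnknownJobName("indexer".into()))),
        "moved to new job system"
    );
    assert_eq!(
        classify(Err(ResumeError::Other("corrupt state".into()))),
        "cancel and report"
    );
}
```
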
Some(JobState { - init: self.init, + init, data: None, steps: VecDeque::new(), step_number: 0, @@ -172,67 +175,9 @@ impl JobBuilder { }) } - pub fn new(init: SJob) -> Self { - let id = Uuid::new_v4(); - Self { - id, - init, - report_builder: JobReportBuilder::new(id, SJob::NAME.to_string()), - } - } - - pub fn with_action(mut self, action: impl AsRef) -> Self { - self.report_builder = self.report_builder.with_action(action); - self - } - - pub fn with_parent_id(mut self, parent_id: Uuid) -> Self { - self.report_builder = self.report_builder.with_parent_id(parent_id); - self - } - - pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self { - self.report_builder = self.report_builder.with_metadata(metadata); - self - } -} - -pub struct Job { - id: Uuid, - hash: u64, - report: Option, - state: Option>, - next_jobs: VecDeque>, -} - -impl Job { - pub fn new(init: SJob) -> Box { - JobBuilder::new(init).build() - } - - pub fn queue_next(mut self: Box, init: NextSJob) -> Box - where - NextSJob: StatefulJob + 'static, - { - let next_job_order = self.next_jobs.len() + 1; - - let mut child_job_builder = JobBuilder::new(init).with_parent_id(self.id); - - if let Some(parent_report) = self.report() { - if let Some(parent_action) = &parent_report.action { - child_job_builder = - child_job_builder.with_action(format!("{parent_action}-{next_job_order}")); - } - } - - self.next_jobs.push_back(child_job_builder.build()); - - self - } - // this function returns an ingestible job instance from a job report pub fn new_from_report( - mut report: JobReport, + mut report: OldJobReport, next_jobs: Option>>, ) -> Result, JobError> { let state = rmp_serde::from_slice::>( @@ -435,7 +380,7 @@ impl From> for JobStepOutput DynJob for Job { +impl DynJob for OldJob { fn id(&self) -> Uuid { // SAFETY: This method is using during queueing, so we still have a report self.report() @@ -448,11 +393,11 @@ impl DynJob for Job { self.report.as_ref().and_then(|r| r.parent_id) } - fn report(&self) -> &Option { + fn report(&self) -> &Option { &self.report } - fn report_mut(&mut self) -> &mut Option { + fn report_mut(&mut self) -> &mut Option { &mut self.report } @@ -460,6 +405,8 @@ impl DynJob for Job { ::NAME } + #[instrument(skip_all, fields(job_name = %self.name(), job_id = %self.id()), err)] + #[allow(clippy::blocks_in_conditions)] // Due to `err` on instrument above async fn run( &mut self, ctx: WorkerContext, @@ -468,7 +415,7 @@ impl DynJob for Job { let job_name = self.name(); let job_id = self.id; let mut errors = vec![]; - info!("Starting Job "); + info!("Starting Job"); let JobState { init, @@ -542,7 +489,7 @@ impl DynJob for Job { run_metadata.update(new_run_metadata); } Err(e @ JobError::EarlyFinish { .. }) => { - info!("{e}"); + info!(%e); job_should_run = false; } Err(e) => return Err(e), @@ -654,16 +601,16 @@ impl DynJob for Job { } if !new_errors.is_empty() { - warn!("Job had a step with errors"); - new_errors.iter().for_each(|err| { - warn!("Job error: {:?}", err); + warn!("Job had a step with errors"); + new_errors.iter().for_each(|e| { + warn!(?e, "Job error;"); }); errors.extend(new_errors); } } Err(e @ JobError::EarlyFinish { .. 
}) => { - info!("{e}"); + info!(%e); break; } Err(e) => return Err(e), @@ -672,14 +619,11 @@ impl DynJob for Job { step_number += 1; } - debug!( - "Total job run time {:?} Job ", - job_init_time.elapsed() - ); + debug!(job_init_time = ?job_init_time.elapsed(), "Total job run time;"); Some(Arc::try_unwrap(working_data_arc).expect("job already ran, no more refs")) } else { - warn!("Tried to run a job without data Job "); + warn!("Tried to run a job without data"); None }; @@ -692,9 +636,10 @@ impl DynJob for Job { errors: errors.into(), next_job: next_jobs.pop_front().map(|mut next_job| { debug!( - "Job requesting to spawn '{}' now that it's complete!", - next_job.name() + next_job_name = %next_job.name(), + "Job requesting to spawn next job now that it's complete!;", ); + next_job.set_next_jobs(next_jobs); next_job @@ -823,16 +768,13 @@ async fn handle_init_phase( match msg { StreamMessage::InitResult(Err(join_error)) => { error!( - "Job \ - failed to initialize due to an internal error: {join_error:#?}", + ?join_error, + "Job failed to initialize due to an internal error;" ); return Err(join_error.into()); } StreamMessage::InitResult(Ok((stateful_job, maybe_data, output))) => { - debug!( - "Init phase took {:?} Job ", - init_time.elapsed() - ); + debug!(init_phase_time = ?init_time.elapsed(), "Init phase completed;"); return Ok(InitPhaseOutput { stateful_job, @@ -854,10 +796,7 @@ async fn handle_init_phase( } } StreamMessage::NewCommand(WorkerCommand::Pause(when)) => { - debug!( - "Pausing Job at init phase took {:?}", - when.elapsed() - ); + debug!(pausing_time = ?when.elapsed(), "Pausing Job at init phase;"); // Notify the worker's work task that now we're paused worker_ctx.pause(); @@ -883,13 +822,11 @@ async fn handle_init_phase( } WorkerCommand::Resume(when) => { debug!( - "Resuming Job at init phase took {:?}", - when.elapsed() - ); - debug!( - "Total paused time {:?} Job ", - paused_time.elapsed() + resuming_time = ?when.elapsed(), + paused_time = ?paused_time.elapsed(), + "Resuming Job at init phase;", ); + status = JobStatus::Running; continue 'messages; @@ -899,12 +836,11 @@ async fn handle_init_phase( init_abort_handle.abort(); debug!( - "Shuting down Job at init phase \ - took {:?} after running for {:?}", - when.elapsed(), - init_time.elapsed(), + shutting_down_time = ?when.elapsed(), + paused_time = ?paused_time.elapsed(), + total_running_time = ?init_time.elapsed(), + "Shuting down Job at init phase;", ); - debug!("Total paused time {:?}", paused_time.elapsed()); // Shutting down at init phase will abort the job return Err(JobError::Canceled(signal_tx)); @@ -912,25 +848,19 @@ async fn handle_init_phase( WorkerCommand::Cancel(when, signal_tx) => { init_abort_handle.abort(); debug!( - "Canceling Job at init phase \ - took {:?} after running for {:?}", - when.elapsed(), - init_time.elapsed(), - ); - debug!( - "Total paused time {:?} Job ", - paused_time.elapsed() + canceling_time = ?when.elapsed(), + paused_time = ?paused_time.elapsed(), + total_running_time = ?init_time.elapsed(), + "Canceling Job at init phase;", ); + return Err(JobError::Canceled(signal_tx)); } WorkerCommand::Pause(_) => { // We continue paused lol } WorkerCommand::Timeout(elapsed, tx) => { - error!( - "Job \ - timed out at init phase after {elapsed:?} without updates" - ); + error!(elapsed_time = ?elapsed, "Job timed out at init phase;"); tx.send(()).ok(); return Err(JobError::Timeout(elapsed)); } @@ -938,10 +868,7 @@ async fn handle_init_phase( } if commands_rx.is_closed() { - error!( - "Job \ - closed the 
command channel while paused" - ); + error!("Job closed the command channel while paused"); return Err(JobError::Critical( "worker command channel closed while job was paused", )); @@ -954,10 +881,9 @@ async fn handle_init_phase( init_abort_handle.abort(); debug!( - "Shuting down at init phase Job took {:?} \ - after running for {:?}", - when.elapsed(), - init_time.elapsed(), + shutting_down_time = ?when.elapsed(), + total_running_time = ?init_time.elapsed(), + "Shuting down at init phase;", ); // Shutting down at init phase will abort the job @@ -967,18 +893,17 @@ async fn handle_init_phase( init_abort_handle.abort(); debug!( - "Canceling at init phase Job took {:?} \ - after running for {:?}", - when.elapsed(), - init_time.elapsed() + canceling_time = ?when.elapsed(), + total_running_time = ?init_time.elapsed(), + "Canceling at init phase;", ); return Err(JobError::Canceled(signal_tx)); } StreamMessage::NewCommand(WorkerCommand::Timeout(elapsed, tx)) => { error!( - "Job \ - timed out at init phase after {elapsed:?} without updates" + elapsed_time = ?elapsed, + "Job timed out at init phase;", ); tx.send(()).ok(); return Err(JobError::Timeout(elapsed)); @@ -1014,6 +939,7 @@ type StepArcs = ( Arc, ); +#[instrument(skip_all, fields(job_id = %id, job_name = %name, %step_number))] #[inline] async fn handle_single_step( JobRunWorkTable { @@ -1030,7 +956,7 @@ async fn handle_single_step( step, mut step_task, }: JobStepDataWorkTable, - mut commands_rx: chan::Receiver, + commands_rx: chan::Receiver, ) -> Result, JobError> { enum StreamMessage { NewCommand(WorkerCommand), @@ -1050,17 +976,11 @@ async fn handle_single_step( 'messages: while let Some(msg) = msg_stream.next().await { match msg { StreamMessage::StepResult(Err(join_error)) => { - error!( - "Job \ - failed to run step #{step_number} due to an internal error: {join_error:#?}", - ); + error!(?join_error, "Failed to run step due to an internal error;"); return Err(join_error.into()); } StreamMessage::StepResult(Ok(output)) => { - trace!( - "Step finished in {:?} Job ", - init_time.elapsed(), - ); + trace!(elapsed = ?init_time.elapsed(), "Step finished;"); return Ok(JobStepsPhaseOutput { steps, @@ -1082,10 +1002,7 @@ async fn handle_single_step( } } StreamMessage::NewCommand(WorkerCommand::Pause(when)) => { - debug!( - "Pausing Job took {:?}", - when.elapsed() - ); + debug!(elapsed = ?when.elapsed(), "Pausing Job;"); worker_ctx.pause(); @@ -1109,14 +1026,8 @@ async fn handle_single_step( } } WorkerCommand::Resume(when) => { - debug!( - "Resuming Job took {:?}", - when.elapsed(), - ); - debug!( - "Total paused time {:?} Job ", - paused_time.elapsed(), - ); + debug!(elapsed = ?when.elapsed(), "Resuming Job;"); + debug!(paused_time = ?paused_time.elapsed(), "Total paused time;"); status = JobStatus::Running; continue 'messages; @@ -1127,15 +1038,11 @@ async fn handle_single_step( let _ = step_task.await; debug!( - "Shuting down Job took {:?} \ - after running for {:?}", - when.elapsed(), - job_init_time.elapsed(), - ); - debug!( - "Total paused time {:?} Job ", - paused_time.elapsed(), + elapsed_time_to_shutdown = ?when.elapsed(), + job_run_time = ?job_init_time.elapsed(), + "Shuting down;", ); + debug!(paused_time = ?paused_time.elapsed(), "Total paused time;"); // Taking back the last step, so it can run to completion later steps.push_front( @@ -1162,15 +1069,11 @@ async fn handle_single_step( step_task.abort(); let _ = step_task.await; debug!( - "Canceling Job \ - took {:?} after running for {:?}", - when.elapsed(), - 
job_init_time.elapsed(), - ); - debug!( - "Total paused time {:?} Job ", - paused_time.elapsed(), + canceling_time = ?when.elapsed(), + job_run_time = ?job_init_time.elapsed(), + "Canceling Job;", ); + debug!(paused_time = ?paused_time.elapsed(), "Total paused time;"); return Err(JobError::Canceled(signal_tx)); } WorkerCommand::Pause(_) => { @@ -1178,10 +1081,7 @@ async fn handle_single_step( } WorkerCommand::Timeout(elapsed, tx) => { - error!( - "Job \ - timed out at step #{step_number} after {elapsed:?} without updates" - ); + error!(?elapsed, "Step timed out;"); tx.send(()).ok(); return Err(JobError::Timeout(elapsed)); } @@ -1189,10 +1089,7 @@ async fn handle_single_step( } if commands_rx.is_closed() { - error!( - "Job \ - closed the command channel while paused" - ); + error!("Closed the command channel while paused"); return Err(JobError::Critical( "worker command channel closed while job was paused", )); @@ -1206,10 +1103,9 @@ async fn handle_single_step( let _ = step_task.await; debug!( - "Shuting down Job took {:?} \ - after running for {:?}", - when.elapsed(), - job_init_time.elapsed(), + elapsed = ?when.elapsed(), + job_run_time = ?job_init_time.elapsed(), + "Shutting down Job;", ); // Taking back the last step, so it can run to completion later @@ -1237,18 +1133,15 @@ async fn handle_single_step( step_task.abort(); let _ = step_task.await; debug!( - "Canceling Job took {:?} \ - after running for {:?}", - when.elapsed(), - job_init_time.elapsed(), + cancel_time = ?when.elapsed(), + job_run_time = ?job_init_time.elapsed(), + "Canceling Job;", ); + return Err(JobError::Canceled(signal_tx)); } StreamMessage::NewCommand(WorkerCommand::Timeout(elapsed, tx)) => { - error!( - "Job \ - timed out at step #{step_number} after {elapsed:?} without updates" - ); + error!(?elapsed, "Job timed out without updates;"); tx.send(()).ok(); return Err(JobError::Timeout(elapsed)); } diff --git a/core/src/old_job/report.rs b/core/src/old_job/report.rs index 1e620290f..b85f58110 100644 --- a/core/src/old_job/report.rs +++ b/core/src/old_job/report.rs @@ -1,14 +1,20 @@ -use crate::library::Library; +use crate::{ + library::Library, + object::{ + fs::{ + old_copy::OldFileCopierJobInit, old_cut::OldFileCutterJobInit, + old_delete::OldFileDeleterJobInit, old_erase::OldFileEraserJobInit, + }, + validation::old_validator_job::OldObjectValidatorJobInit, + }, +}; use sd_core_prisma_helpers::job_without_data; use sd_prisma::prisma::job; use sd_utils::db::{maybe_missing, MissingFieldError}; -use std::{ - collections::HashMap, - fmt::{Display, Formatter}, -}; +use std::fmt::{Display, Formatter}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; @@ -26,15 +32,15 @@ pub enum JobReportUpdate { Phase(String), } -#[derive(Debug, Serialize, Deserialize, Type, Clone)] -pub struct JobReport { +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct OldJobReport { pub id: Uuid, pub name: String, pub action: Option, pub data: Option>, - // In Typescript `any | null` is just `any` so we don't get prompted for null checks - // TODO(@Oscar): This will be fixed - #[specta(type = Option>)] + // // In Typescript `any | null` is just `any` so we don't get prompted for null checks + // // TODO(@Oscar): This will be fixed + // #[specta(type = Option>)] pub metadata: Option, pub errors_text: Vec, @@ -53,7 +59,150 @@ pub struct JobReport { pub estimated_completion: DateTime, } -impl Display for JobReport { +impl From for sd_core_heavy_lifting::job_system::report::Report { + fn from( + OldJobReport { + id, + name, + 
action, + data: _, // Not used in the new job system + metadata, + errors_text: _, // New job system uses type-safe errors + created_at, + started_at, + completed_at, + parent_id, + status, + task_count, + completed_task_count, + phase, + message, + estimated_completion, + }: OldJobReport, + ) -> Self { + use sd_core_heavy_lifting::{job_system::report::ReportOutputMetadata, JobName}; + + let mut new_metadata = Vec::new(); + + if let Some(metadata) = metadata { + if let Some(metadata) = metadata.as_object() { + if let Some(metadata) = metadata.get("output") { + if let Some(metadata) = metadata.as_object() { + if let Some(metadata) = metadata.get("init") { + if let Ok(OldFileCopierJobInit { + source_location_id, + target_location_id, + sources_file_path_ids, + target_location_relative_directory_path, + }) = serde_json::from_value::(metadata.clone()) + { + new_metadata.push( + ReportOutputMetadata::Copier { + source_location_id, + target_location_id, + sources_file_path_ids, + target_location_relative_directory_path, + } + .into(), + ); + } else if let Ok(OldFileCutterJobInit { + source_location_id, + target_location_id, + sources_file_path_ids, + target_location_relative_directory_path, + }) = + serde_json::from_value::(metadata.clone()) + { + new_metadata.push( + ReportOutputMetadata::Mover { + source_location_id, + target_location_id, + sources_file_path_ids, + target_location_relative_directory_path, + } + .into(), + ); + } else if let Ok(OldFileDeleterJobInit { + location_id, + file_path_ids, + }) = + serde_json::from_value::(metadata.clone()) + { + new_metadata.push( + ReportOutputMetadata::Deleter { + location_id, + file_path_ids, + } + .into(), + ); + } else if let Ok(OldFileEraserJobInit { + location_id, + file_path_ids, + passes, + }) = + serde_json::from_value::(metadata.clone()) + { + new_metadata.push( + ReportOutputMetadata::Eraser { + location_id, + file_path_ids, + passes: passes as u32, + } + .into(), + ); + } else if let Ok(OldObjectValidatorJobInit { location, sub_path }) = + serde_json::from_value::( + metadata.clone(), + ) { + new_metadata.push( + ReportOutputMetadata::FileValidator { + location_id: location.id, + sub_path, + } + .into(), + ); + } + } + } + } + } + } + + Self { + id, + name: match name.as_str() { + "file_copier" => JobName::Copy, + "file_cutter" => JobName::Move, + "file_deleter" => JobName::Delete, + "file_eraser" => JobName::Erase, + "object_validator" => JobName::FileValidator, + + // Already implemented in the new job system + "indexer" => JobName::Indexer, + "file_identifier" => JobName::FileIdentifier, + "media_processor" => JobName::MediaProcessor, + + unexpected_job => unimplemented!("Job {unexpected_job} not implemented"), + }, + action, + metadata: new_metadata, + critical_error: None, + non_critical_errors: Vec::new(), + created_at, + started_at, + completed_at, + parent_id, + status: status.into(), + task_count, + completed_task_count, + phase, + message, + estimated_completion, + } + } +} + +impl Display for OldJobReport { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -64,7 +213,7 @@ impl Display for JobReport { } // convert database struct into a resource struct -impl TryFrom for JobReport { +impl TryFrom for OldJobReport { type Error = MissingFieldError; fn try_from(data: job::Data) -> Result { @@ -75,7 +224,7 @@ impl TryFrom for JobReport { data: data.data, metadata: data.metadata.and_then(|m| { serde_json::from_slice(&m).unwrap_or_else(|e| -> Option { - error!("Failed to deserialize job metadata: {}", e); + 
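
The conversion above digs the old job's `init` out of the report metadata through several nested `as_object()`/`get()` checks before trying each legacy init type in turn. For reference, the same two-level lookup can also be written with serde_json's JSON Pointer helper; this is only an illustrative alternative, assuming the old reports really nest the init under `output.init` as the chained `if let`s imply:

```rust
use serde_json::{json, Value};

// "/output/init" walks the same `output` -> `init` object path as the nested `if let`s.
fn old_job_init(metadata: &Value) -> Option<&Value> {
    metadata.pointer("/output/init")
}

fn main() {
    let metadata = json!({
        "output": {
            "init": { "location_id": 1, "file_path_ids": [2, 3] }
        }
    });

    assert_eq!(
        old_job_init(&metadata),
        Some(&json!({ "location_id": 1, "file_path_ids": [2, 3] }))
    );
    // A report without the expected shape simply yields None instead of panicking.
    assert_eq!(old_job_init(&json!({ "output": 42 })), None);
}
```
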
error!(?e, "Failed to deserialize job metadata;"); None }) }), @@ -105,7 +254,7 @@ impl TryFrom for JobReport { // I despise having to write this twice, but it seems to be the only way to // remove the data field from the struct // would love to get this DRY'd up -impl TryFrom for JobReport { +impl TryFrom for OldJobReport { type Error = MissingFieldError; fn try_from(data: job_without_data::Data) -> Result { @@ -116,7 +265,7 @@ impl TryFrom for JobReport { data: None, metadata: data.metadata.and_then(|m| { serde_json::from_slice(&m).unwrap_or_else(|e| -> Option { - error!("Failed to deserialize job metadata: {}", e); + error!(?e, "Failed to deserialize job metadata;"); None }) }), @@ -144,7 +293,7 @@ impl TryFrom for JobReport { } } -impl JobReport { +impl OldJobReport { pub fn new(uuid: Uuid, name: String) -> Self { Self { id: uuid, @@ -286,6 +435,21 @@ impl TryFrom for JobStatus { } } +// TODO(fogodev): this is temporary until we can get rid of the old job system +impl From for sd_core_heavy_lifting::job_system::report::Status { + fn from(value: JobStatus) -> Self { + match value { + JobStatus::Queued => Self::Queued, + JobStatus::Running => Self::Running, + JobStatus::Completed => Self::Completed, + JobStatus::Canceled => Self::Canceled, + JobStatus::Failed => Self::Failed, + JobStatus::Paused => Self::Paused, + JobStatus::CompletedWithErrors => Self::CompletedWithErrors, + } + } +} + pub struct JobReportBuilder { pub id: Uuid, pub name: String, @@ -295,8 +459,8 @@ pub struct JobReportBuilder { } impl JobReportBuilder { - pub fn build(self) -> JobReport { - JobReport { + pub fn build(self) -> OldJobReport { + OldJobReport { id: self.id, name: self.name, action: self.action, @@ -325,19 +489,4 @@ impl JobReportBuilder { parent_id: None, } } - - pub fn with_action(mut self, action: impl AsRef) -> Self { - self.action = Some(action.as_ref().to_string()); - self - } - - pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self { - self.metadata = Some(metadata); - self - } - - pub fn with_parent_id(mut self, parent_id: Uuid) -> Self { - self.parent_id = Some(parent_id); - self - } } diff --git a/core/src/old_job/worker.rs b/core/src/old_job/worker.rs index 1405d952b..69815fca1 100644 --- a/core/src/old_job/worker.rs +++ b/core/src/old_job/worker.rs @@ -24,12 +24,12 @@ use tokio::{ time::{interval, timeout, Instant, MissedTickBehavior}, }; use tokio_stream::wrappers::IntervalStream; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use uuid::Uuid; use super::{ - DynJob, JobError, JobIdentity, JobReport, JobReportUpdate, JobRunErrors, JobRunOutput, - JobStatus, OldJobs, + DynJob, JobError, JobIdentity, JobReportUpdate, JobRunErrors, JobRunOutput, JobStatus, + OldJobReport, OldJobs, }; const FIVE_SECS: Duration = Duration::from_secs(5); @@ -114,8 +114,8 @@ impl WorkerContext { pub struct Worker { pub(super) library_id: Uuid, commands_tx: chan::Sender, - report_watch_tx: Arc>, - report_watch_rx: watch::Receiver, + report_watch_tx: Arc>, + report_watch_rx: watch::Receiver, paused: AtomicBool, } @@ -123,7 +123,7 @@ impl Worker { pub async fn new( id: Uuid, mut job: Box, - mut report: JobReport, + mut report: OldJobReport, library: Arc, node: Arc, job_manager: Arc, @@ -255,7 +255,7 @@ impl Worker { } } - pub fn report(&self) -> JobReport { + pub fn report(&self) -> OldJobReport { self.report_watch_rx.borrow().clone() } @@ -263,10 +263,11 @@ impl Worker { self.paused.load(Ordering::Relaxed) } + #[instrument(skip_all, 
fields(job_id = %report.id, job_name = %report.name))] fn track_progress( - report: &mut JobReport, + report: &mut OldJobReport, last_report_watch_update: &mut Instant, - report_watch_tx: &watch::Sender, + report_watch_tx: &watch::Sender, start_time: DateTime, updates: Vec, library: &Library, @@ -286,14 +287,14 @@ impl Worker { } JobReportUpdate::Message(message) => { - trace!("job {} message: {}", report.id, message); + trace!(?message); report.message = message; } JobReportUpdate::Phase(phase) => { trace!( - "changing Job phase: {} -> {phase}", - report.id, - report.phase + old_phase = %report.phase, + new_phase = %phase, + "Changing Job phase;", ); report.phase = phase; } @@ -307,7 +308,9 @@ impl Worker { let task_count = report.task_count as usize; let completed_task_count = report.completed_task_count as usize; let remaining_task_count = task_count.saturating_sub(completed_task_count); - let remaining_time_per_task = elapsed / (completed_task_count + 1) as i32; // Adding 1 to avoid division by zero + + // To avoid division by zero + let remaining_time_per_task = elapsed / (usize::max(completed_task_count, 1) as i32); let remaining_time = remaining_time_per_task * remaining_task_count as i32; // Update the report with estimated remaining time @@ -338,6 +341,16 @@ impl Worker { })); } + #[instrument( + skip_all, + fields( + job_id = %report.id, + job_name = %report.name, + worker_id = %worker_id, + %start_time, + library_id = %library.id, + ), + )] async fn do_work( worker_id: Uuid, JobWorkTable { @@ -346,7 +359,7 @@ impl Worker { hash, mut report, }: JobWorkTable, - report_watch_tx: Arc>, + report_watch_tx: Arc>, start_time: DateTime, (commands_tx, commands_rx): (chan::Sender, chan::Receiver), library: Arc, @@ -404,7 +417,7 @@ impl Worker { while let Some(msg) = msg_stream.next().await { match msg { StreamMessage::JobResult(Err(join_error)) => { - error!("Worker had a critical error: {join_error:#?}"); + error!(?join_error, "Critical error;"); break; } StreamMessage::JobResult(Ok((job, job_result))) => { @@ -430,10 +443,7 @@ impl Worker { report_watch_tx.send(report.clone()).ok(); - debug!( - "Worker completed Job", - report.id, report.name - ); + debug!("Worker completed Job"); return manager.complete(&library, worker_id, hash, next_job).await; } @@ -460,8 +470,9 @@ impl Worker { let elapsed = last_update_received_at.elapsed(); if elapsed > FIVE_MINUTES { error!( - "Worker has not received any updates for {elapsed:?}" - ); + time_without_updates = ?elapsed, + "Worker has not received any updates;", + ); let (tx, rx) = oneshot::channel(); if commands_tx @@ -469,15 +480,16 @@ impl Worker { .await .is_err() { - error!("Worker failed to send timeout step command to a running job"); + error!("Failed to send timeout step command to a running job"); } else if timeout(FIVE_SECS, rx).await.is_err() { - error!("Worker failed to receive timeout step answer from a running job"); + error!("Failed to receive timeout step answer from a running job"); } // As we already sent a timeout command, we can safely join as the job is over - let Ok((job, job_result)) = run_task.await.map_err(|join_error| { - error!("Worker had a critical error: {join_error:#?}") - }) else { + let Ok((job, job_result)) = run_task + .await + .map_err(|join_error| error!(?join_error, "Had a critical error;")) + else { break; }; @@ -485,10 +497,7 @@ impl Worker { report_watch_tx.send(report.clone()).ok(); - error!( - "Worker timed out Job", - report.id, report.name - ); + error!("Timed out"); break; } @@ -503,7 +512,7 @@ impl 
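
The `track_progress` hunk above also changes how the remaining time is estimated: the old `completed_task_count + 1` divisor avoided division by zero but biased every estimate low, while `usize::max(completed_task_count, 1)` only kicks in for the degenerate "nothing completed yet" case. A simplified sketch with plain integer seconds (the real code divides a `chrono` duration):

```rust
// Estimate remaining time from the average time per completed task.
fn remaining_secs(elapsed_secs: u64, completed: u64, total: u64) -> u64 {
    let per_task = elapsed_secs / completed.max(1); // guard against dividing by zero
    per_task * total.saturating_sub(completed)
}

fn main() {
    // 10 of 40 tasks done in 100s -> 10s per task -> 300s remaining.
    assert_eq!(remaining_secs(100, 10, 40), 300);
    // The old `completed + 1` divisor would have estimated roughly 270s here instead.

    // Nothing completed yet: treat the elapsed time as one task's worth rather than panicking.
    assert_eq!(remaining_secs(100, 0, 40), 4000);
}
```
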
Worker { async fn process_job_output( mut job: Box, job_result: Result, - report: &mut JobReport, + report: &mut OldJobReport, library: &Library, ) -> Option> { // Run the job and handle the result @@ -527,10 +536,10 @@ impl Worker { }; report.completed_at = Some(Utc::now()); if let Err(e) = report.update(library).await { - error!("failed to update job report: {:#?}", e); + error!(?e, "Failed to update job report;"); } - debug!("{report}"); + debug!(?report); invalidate_queries(library); @@ -542,10 +551,7 @@ impl Worker { errors: JobRunErrors(errors), next_job, }) => { - warn!( - "Job completed with errors", - report.id, report.name - ); + warn!("Completed with errors"); report.status = JobStatus::CompletedWithErrors; report.errors_text = errors; report.data = None; @@ -560,10 +566,10 @@ impl Worker { }; report.completed_at = Some(Utc::now()); if let Err(e) = report.update(library).await { - error!("failed to update job report: {:#?}", e); + error!(?e, "Failed to update job report;"); } - debug!("{report}"); + debug!(?report); invalidate_queries(library); @@ -571,12 +577,9 @@ impl Worker { } // -> Job paused Err(JobError::Paused(state, signal_tx)) => { - info!( - "Job paused, we will pause all children jobs", - report.id, report.name - ); + info!("Job paused, we will pause all children jobs"); if let Err(e) = job.pause_children(library).await { - error!("Failed to pause children jobs: {e:#?}"); + error!(?e, "Failed to pause children jobs;"); } debug!("Setting worker status to paused"); @@ -585,23 +588,20 @@ impl Worker { report.data = Some(state); if let Err(e) = report.update(library).await { - error!("failed to update job report: {:#?}", e); + error!(?e, "Failed to update job report;"); } - debug!("{report}"); + debug!(?report); invalidate_queries(library); signal_tx.send(()).ok(); } - // -> Job paused + // -> Job canceled Err(JobError::Canceled(signal_tx)) => { - info!( - "Job canceled, we will cancel all children jobs", - report.id, report.name - ); + info!("Job canceled, we will cancel all children jobs"); if let Err(e) = job.cancel_children(library).await { - error!("Failed to pause children jobs: {e:#?}"); + error!(?e, "Failed to pause children jobs;"); } debug!("Setting worker status to paused"); @@ -610,10 +610,10 @@ impl Worker { report.data = None; if let Err(e) = report.update(library).await { - error!("failed to update job report: {:#?}", e); + error!(?e, "Failed to update job report;"); } - debug!("{report}"); + debug!(?report); invalidate_queries(library); @@ -621,21 +621,18 @@ impl Worker { } // -> Job failed Err(e) => { - error!( - "Job failed with error: {e:#?};", - report.id, report.name - ); + error!(?e, "Job failed with error;"); if let Err(e) = job.cancel_children(library).await { - error!("Failed to cancel children jobs: {e:#?}"); + error!(?e, "Failed to cancel children jobs;"); } report.status = JobStatus::Failed; report.data = None; if let Err(e) = report.update(library).await { - error!("failed to update job report: {:#?}", e); + error!(?e, "Failed to update job report;"); } - warn!("{report}"); + warn!(?report); invalidate_queries(library); } @@ -649,7 +646,7 @@ struct JobWorkTable { job: Box, manager: Arc, hash: u64, - report: JobReport, + report: OldJobReport, } fn invalidate_queries(library: &Library) { diff --git a/core/src/p2p/libraries.rs b/core/src/p2p/libraries.rs index f594d30e5..aa83cfcc1 100644 --- a/core/src/p2p/libraries.rs +++ b/core/src/p2p/libraries.rs @@ -20,7 +20,7 @@ pub fn libraries_hook(p2p: Arc, quic: Arc, libraries: Arc, quic: Arc, 
libraries: Arc, router| { @@ -171,13 +171,13 @@ impl P2PManager { }, }; } - Err(err) => { - error!("Failed to parse p2p relay configuration: {err:?}") + Err(e) => { + error!(?e, "Failed to parse p2p relay configuration;") } } } } - Err(err) => error!("Error pulling p2p relay configuration: {err:?}"), + Err(e) => error!(?e, "Error pulling p2p relay configuration;"), } tokio::select! { @@ -216,19 +216,19 @@ impl P2PManager { let port = config.p2p.port.get(); let ipv4_port = (!config.p2p.disabled).then_some(port); - info!("Setting quic ipv4 listener to: {ipv4_port:?}"); + info!(?ipv4_port, "Setting quic ipv4 listener;"); self.listeners .lock() .unwrap_or_else(PoisonError::into_inner) - .ipv4 = if let Err(err) = self.quic_transport.set_ipv4_enabled(ipv4_port).await { - error!("Failed to enabled quic ipv4 listener: {err}"); + .ipv4 = if let Err(e) = self.quic_transport.set_ipv4_enabled(ipv4_port).await { + error!(?e, "Failed to enabled quic ipv4 listener;"); self.node_config .write(|c| c.p2p.disabled = false) .await .ok(); ListenerState::Error { - error: err.to_string(), + error: e.to_string(), } } else { match !config.p2p.disabled { @@ -239,19 +239,19 @@ impl P2PManager { let enable_ipv6 = !config.p2p.disabled && !config.p2p.disable_ipv6; let ipv6_port = enable_ipv6.then_some(port); - info!("Setting quic ipv6 listener to: {ipv6_port:?}"); + info!(?ipv6_port, "Setting quic ipv6 listener;"); self.listeners .lock() .unwrap_or_else(PoisonError::into_inner) - .ipv6 = if let Err(err) = self.quic_transport.set_ipv6_enabled(ipv6_port).await { - error!("Failed to enabled quic ipv6 listener: {err}"); + .ipv6 = if let Err(e) = self.quic_transport.set_ipv6_enabled(ipv6_port).await { + error!(?e, "Failed to enabled quic ipv6 listener;"); self.node_config .write(|c| c.p2p.disable_ipv6 = false) .await .ok(); ListenerState::Error { - error: err.to_string(), + error: e.to_string(), } } else { match enable_ipv6 { @@ -285,8 +285,8 @@ impl P2PManager { *mdns = Some(m); false } - Err(err) => { - error!("Failed to start mDNS: {err}"); + Err(e) => { + error!(?e, "Failed to start mDNS;"); true } } @@ -372,8 +372,8 @@ async fn start( let mut service = unwrap_infallible(service.call(()).await); tokio::spawn(async move { - let Ok(header) = Header::from_stream(&mut stream).await.map_err(|err| { - error!("Failed to read header from stream: {}", err); + let Ok(header) = Header::from_stream(&mut stream).await.map_err(|e| { + error!(?e, "Failed to read header from stream;"); }) else { return; }; @@ -388,14 +388,14 @@ async fn start( error!("Failed to handle Spacedrop request"); } Header::Sync => { - let Ok(mut tunnel) = Tunnel::responder(stream).await.map_err(|err| { - error!("Failed `Tunnel::responder`: {}", err); + let Ok(mut tunnel) = Tunnel::responder(stream).await.map_err(|e| { + error!(?e, "Failed `Tunnel::responder`;"); }) else { return; }; - let Ok(msg) = SyncMessage::from_stream(&mut tunnel).await.map_err(|err| { - error!("Failed `SyncMessage::from_stream`: {}", err); + let Ok(msg) = SyncMessage::from_stream(&mut tunnel).await.map_err(|e| { + error!(?e, "Failed `SyncMessage::from_stream`"); }) else { return; }; @@ -405,7 +405,7 @@ async fn start( .get_library_for_instance(&tunnel.library_remote_identity()) .await .ok_or_else(|| { - error!("Failed to get library {}", tunnel.library_remote_identity()); + error!(remove_identity = %tunnel.library_remote_identity(), "Failed to get library;"); // TODO: Respond to remote client with warning! 
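
A small idiom worth noting in the p2p hunks above: the listener state is updated through `lock().unwrap_or_else(PoisonError::into_inner)`, which recovers the guard from a poisoned mutex instead of panicking. A tiny self-contained sketch of that pattern (the counter is just a placeholder):

```rust
use std::sync::{Mutex, PoisonError};

fn increment(counter: &Mutex<u32>) -> u32 {
    // If a previous holder panicked, take the guard out of the PoisonError and carry on.
    let mut guard = counter.lock().unwrap_or_else(PoisonError::into_inner);
    *guard += 1;
    *guard
}

fn main() {
    let counter = Mutex::new(0);
    assert_eq!(increment(&counter), 1);
    assert_eq!(increment(&counter), 2);
}
```
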
}) @@ -425,25 +425,30 @@ async fn start( } Header::RspcRemote => { let remote = stream.remote_identity(); - let Err(err) = operations::rspc::receiver(stream, &mut service, &node).await + let Err(e) = operations::rspc::receiver(stream, &mut service, &node).await else { return; }; - error!("Failed to handling rspc request with '{remote}': {err:?}"); + error!(%remote, ?e, "Failed to handling rspc request;"); } Header::LibraryFile { file_path_id, range, } => { let remote = stream.remote_identity(); - let Err(err) = + let Err(e) = operations::library::receiver(stream, file_path_id, range, &node).await else { return; }; - error!("Failed to handling library file request with {remote:?} for {file_path_id}: {err:?}"); + error!( + ?remote, + %file_path_id, + ?e, + "Failed to handling library file request;", + ); } }; }); diff --git a/core/src/p2p/operations/ping.rs b/core/src/p2p/operations/ping.rs index 06939d785..235e1111f 100644 --- a/core/src/p2p/operations/ping.rs +++ b/core/src/p2p/operations/ping.rs @@ -28,7 +28,7 @@ pub async fn ping(p2p: Arc, identity: RemoteIdentity) -> Result<(), Box Result<(), Box> { debug!( - "Received http request from peer '{}'", - stream.remote_identity(), + peer = %stream.remote_identity(), + "Received http request from;", ); // TODO: Authentication diff --git a/core/src/p2p/operations/spacedrop.rs b/core/src/p2p/operations/spacedrop.rs index 64e2e5826..1118a6531 100644 --- a/core/src/p2p/operations/spacedrop.rs +++ b/core/src/p2p/operations/spacedrop.rs @@ -74,43 +74,43 @@ pub async fn spacedrop( let total_length: u64 = requests.iter().map(|req| req.size).sum(); let id = Uuid::new_v4(); - debug!("({id}): starting Spacedrop with peer '{identity}"); + debug!(spacedrop_id = %id, peer = %identity, "Starting Spacedrop;"); let peer = p2p .p2p .peers() .get(&identity) .ok_or_else(|| { - debug!("({id}): failed to find connection method with '{identity}'"); + debug!(spacedrop_id = %id, peer = %identity, "Failed to find connection method;"); SpacedropError::FailedPeerConnection })? .clone(); - let mut stream = peer.new_stream().await.map_err(|err| { - debug!("({id}): failed to connect to '{identity}': {err:?}"); - SpacedropError::FailedNewStream(err) + let mut stream = peer.new_stream().await.map_err(|e| { + debug!(spacedrop_id = %id, peer = %identity, ?e, "Failed to connect"); + SpacedropError::FailedNewStream(e) })?; tokio::spawn(async move { - debug!("({id}): connected, sending header"); + debug!(spacedrop_id = %id, "Connected, sending header"); let header = Header::Spacedrop(SpaceblockRequests { id, block_size: BlockSize::from_file_size(total_length), requests, }); - if let Err(err) = stream.write_all(&header.to_bytes()).await { - debug!("({id}): failed to send header: {err}"); + if let Err(e) = stream.write_all(&header.to_bytes()).await { + debug!(spacedrop_id = %id, ?e, "Failed to send header"); return; } let Header::Spacedrop(requests) = header else { unreachable!(); }; - debug!("({id}): waiting for response"); + debug!(spacedrop_id = %id, "Waiting for response"); let result = tokio::select! 
{ result = stream.read_u8() => result, // Add 5 seconds incase the user responded on the deadline and slow network _ = sleep(SPACEDROP_TIMEOUT + Duration::from_secs(5)) => { - debug!("({id}): timed out, cancelling"); + debug!(spacedrop_id = %id, "Timed out, cancelling"); p2p.events.send(P2PEvent::SpacedropTimedOut { id }).ok(); return; }, @@ -118,13 +118,13 @@ pub async fn spacedrop( match result { Ok(0) => { - debug!("({id}): Spacedrop was rejected from peer '{identity}'"); + debug!(spacedrop_id = %id, peer = %identity, "Spacedrop was rejected from;"); p2p.events.send(P2PEvent::SpacedropRejected { id }).ok(); return; } - Ok(1) => {} // Okay - Ok(_) => todo!(), // TODO: Proper error - Err(err) => todo!("{:?}", err), // TODO: Proper error + Ok(1) => {} // Okay + Ok(_) => todo!(), // TODO: Proper error + Err(e) => todo!("{:?}", e), // TODO: Proper error } let cancelled = Arc::new(AtomicBool::new(false)); @@ -133,7 +133,7 @@ pub async fn spacedrop( .unwrap_or_else(PoisonError::into_inner) .insert(id, cancelled.clone()); - debug!("({id}): starting transfer"); + debug!(spacedrop_id = %id, "Starting transfer"); let i = Instant::now(); let mut transfer = Transfer::new( @@ -147,10 +147,20 @@ pub async fn spacedrop( ); for (file_id, (path, file)) in files.into_iter().enumerate() { - debug!("({id}): transmitting '{file_id}' from '{path:?}'"); + debug!( + spacedrop_id = %id, + %file_id, + path = %path.display(), + "Transmitting;", + ); + let file = BufReader::new(file); - if let Err(err) = transfer.send(&mut stream, file).await { - debug!("({id}): failed to send file '{file_id}': {err}"); + if let Err(e) = transfer.send(&mut stream, file).await { + debug!( + spacedrop_id = %id, + %file_id, + ?e, + "Failed to send file;"); // TODO: Error to frontend // p2p.events // .send(P2PEvent::SpacedropFailed { id, file_id }) @@ -159,7 +169,7 @@ pub async fn spacedrop( } } - debug!("({id}): finished; took '{:?}", i.elapsed()); + debug!(spacedrop_id = %id, elapsed_time = ?i.elapsed(), "Finished;"); }); Ok(id) @@ -175,8 +185,8 @@ impl P2PManager { .remove(&id) { chan.send(Some(path)) - .map_err(|err| { - warn!("error accepting Spacedrop '{id:?}': '{err:?}'"); + .map_err(|e| { + warn!(spacedrop_id = %id, ?e, "Error accepting Spacedrop;"); }) .ok(); } @@ -190,8 +200,8 @@ impl P2PManager { .remove(&id) { chan.send(None) - .map_err(|err| { - warn!("error rejecting Spacedrop '{id:?}': '{err:?}'"); + .map_err(|e| { + warn!(spacedrop_id = %id, ?e, "Error rejecting Spacedrop;"); }) .ok(); } @@ -218,10 +228,11 @@ pub(crate) async fn receiver( let (tx, rx) = oneshot::channel(); info!( - "({id}): received '{}' files from peer '{}' with block size '{:?}'", - req.requests.len(), - stream.remote_identity(), - req.block_size + spacedrop_id = %id, + files_count = req.requests.len(), + peer = %stream.remote_identity(), + block_size = ?req.block_size, + "Receiving spacedrop files;", ); this.spacedrop_pairing_reqs .lock() @@ -258,19 +269,19 @@ pub(crate) async fn receiver( tokio::select! 
{ _ = sleep(SPACEDROP_TIMEOUT) => { - info!("({id}): timeout, rejecting!"); + info!(spacedrop_id = %id, "Timeout, rejecting!;"); - stream.write_all(&[0]).await.map_err(|err| { - error!("({id}): error reject bit: '{err:?}'"); + stream.write_all(&[0]).await.map_err(|e| { + error!(spacedrop_id = %id, ?e, "Error reject bit;"); })?; - stream.flush().await.map_err(|err| { - error!("({id}): error flushing reject bit: '{err:?}'"); + stream.flush().await.map_err(|e| { + error!(spacedrop_id = %id, ?e, "Error flushing reject bit;"); })?; } file_path = rx => { match file_path { Ok(Some(file_path)) => { - info!("({id}): accepted saving to '{:?}'", file_path); + info!(spacedrop_id = %id, saving_to = %file_path, "Accepted;"); let cancelled = Arc::new(AtomicBool::new(false)); this.spacedrop_cancellations @@ -278,8 +289,8 @@ pub(crate) async fn receiver( .unwrap_or_else(PoisonError::into_inner) .insert(id, cancelled.clone()); - stream.write_all(&[1]).await.map_err(|err| { - error!("({id}): error sending continuation bit: '{err:?}'"); + stream.write_all(&[1]).await.map_err(|e| { + error!(spacedrop_id = %id, ?e, "Error sending continuation bit;"); // TODO: Send error to the frontend @@ -301,11 +312,20 @@ pub(crate) async fn receiver( path.push(&file_name); } - debug!("({id}): accepting '{file_name}' and saving to '{:?}'", path); + debug!( + spacedrop_id = %id, + %file_name, + saving_to = %path.display(), + "Accepting;", + ); if let Some(parent) = path.parent() { - create_dir_all(&parent).await.map_err(|err| { - error!("({id}): error creating parent directory '{parent:?}': '{err:?}'"); + create_dir_all(&parent).await.map_err(|e| { + error!( + spacedrop_id = %id, + parent = %parent.display(), + ?e, + "Error creating parent directory;"); // TODO: Send error to the frontend @@ -313,16 +333,25 @@ pub(crate) async fn receiver( })?; } - let f = File::create(&path).await.map_err(|err| { - error!("({id}): error creating file at '{path:?}': '{err:?}'"); + let f = File::create(&path).await.map_err(|e| { + error!( + spacedrop_id = %id, + creating_file_at = %path.display(), + ?e, + "Error creating file;", + ); // TODO: Send error to the frontend // TODO: Send error to remote peer })?; let f = BufWriter::new(f); - if let Err(err) = transfer.receive(&mut stream, f).await { - error!("({id}): error receiving file '{file_name}': '{err:?}'"); + if let Err(e) = transfer.receive(&mut stream, f).await { + error!( + spacedrop_id = %id, + %file_name, + ?e, + "Error receiving file;"); // TODO: Send error to frontend @@ -330,20 +359,20 @@ pub(crate) async fn receiver( } } - info!("({id}): complete"); + info!(spacedrop_id = %id, "Completed;"); } Ok(None) => { - info!("({id}): rejected"); + info!(spacedrop_id = %id, "Rejected;"); - stream.write_all(&[0]).await.map_err(|err| { - error!("({id}): error sending rejection: '{err:?}'"); + stream.write_all(&[0]).await.map_err(|e| { + error!(spacedrop_id = %id, ?e, "Error sending rejection;"); })?; - stream.flush().await.map_err(|err| { - error!("({id}): error flushing rejection: '{err:?}'"); + stream.flush().await.map_err(|e| { + error!(spacedrop_id = %id, ?e, "Error flushing rejection;"); })?; } Err(_) => { - warn!("({id}): error with Spacedrop pairing request receiver!"); + warn!(spacedrop_id = %id, "Error with Spacedrop pairing request receiver!;"); } } } diff --git a/core/src/p2p/sync/mod.rs b/core/src/p2p/sync/mod.rs index 3e7d7695f..421b0e6a7 100644 --- a/core/src/p2p/sync/mod.rs +++ b/core/src/p2p/sync/mod.rs @@ -83,6 +83,7 @@ mod originator { } } + #[instrument(skip(sync, p2p))] /// 
REMEMBER: This only syncs one direction! pub async fn run( library: Arc, @@ -99,8 +100,9 @@ mod originator { let library = library.clone(); tokio::spawn(async move { debug!( - "Alerting peer {remote_identity:?} of new sync events for library {:?}", - library.id + ?remote_identity, + %library.id, + "Alerting peer of new sync events for library;" ); let mut stream = peer.new_stream().await.unwrap(); @@ -223,10 +225,9 @@ mod responder { let timestamps = match req { Request::FinishedIngesting => break, Request::Messages { timestamps, .. } => timestamps, - _ => continue, }; - debug!("Getting ops for timestamps {timestamps:?}"); + debug!(?timestamps, "Getting ops for timestamps;"); stream .write_all( diff --git a/core/src/preferences/library.rs b/core/src/preferences/library.rs index 0facd1adb..2fadb8005 100644 --- a/core/src/preferences/library.rs +++ b/core/src/preferences/library.rs @@ -37,7 +37,7 @@ impl LibraryPreferences { kvs.into_iter() .filter_map(|data| { rmpv::decode::read_value(&mut data.value?.as_slice()) - .map_err(|e| error!("{e:#?}")) + .map_err(|e| error!(?e)) .ok() .map(|value| { ( diff --git a/core/src/preferences/mod.rs b/core/src/preferences/mod.rs index 006e283d5..215798fb1 100644 --- a/core/src/preferences/mod.rs +++ b/core/src/preferences/mod.rs @@ -38,7 +38,7 @@ where .into_iter() .filter_map(|(key, entry)| { Uuid::parse_str(&key) - .map_err(|e| error!("{e:#?}")) + .map_err(|e| error!(?e)) .ok() .map(|uuid| (uuid, entry.expect_value())) }) diff --git a/core/src/util/debug_initializer.rs b/core/src/util/debug_initializer.rs index 8d796cc3d..8221aa77e 100644 --- a/core/src/util/debug_initializer.rs +++ b/core/src/util/debug_initializer.rs @@ -28,7 +28,7 @@ use tokio::{ fs::{self, metadata}, time::sleep, }; -use tracing::{info, warn}; +use tracing::{info, instrument, warn}; use uuid::Uuid; #[derive(Deserialize)] @@ -75,6 +75,8 @@ pub enum InitConfigError { #[error("failed to get current directory from environment: {0}")] CurrentDir(io::Error), + #[error(transparent)] + Processing(#[from] sd_core_heavy_lifting::Error), #[error(transparent)] FileIO(#[from] FileIOError), } @@ -107,18 +109,19 @@ impl InitConfig { Ok(None) } + #[instrument(skip_all, fields(path = %self.path.display()), err)] pub async fn apply( self, library_manager: &Arc, node: &Arc, ) -> Result<(), InitConfigError> { - info!("Initializing app from file: {:?}", self.path); + info!("Initializing app from file"); for lib in self.libraries { let name = lib.name.to_string(); let _guard = AbortOnDrop(tokio::spawn(async move { loop { - info!("Initializing library '{name}' from 'sd_init.json'..."); + info!(library_name = %name, "Initializing library from 'sd_init.json'...;"); sleep(Duration::from_secs(1)).await; } })); @@ -145,7 +148,7 @@ impl InitConfig { let locations = library.db.location().find_many(vec![]).exec().await?; for location in locations { - warn!("deleting location: {:?}", location.path); + warn!(location_path = ?location.path, "deleting location;"); delete_location(node, &library, location.id).await?; } } @@ -158,7 +161,7 @@ impl InitConfig { .exec() .await? 
{ - warn!("deleting location: {:?}", location.path); + warn!(location_path = ?location.path, "deleting location;"); delete_location(node, &library, location.id).await?; } @@ -166,7 +169,7 @@ impl InitConfig { if let Err(e) = fs::remove_file(sd_file).await { if e.kind() != io::ErrorKind::NotFound { - warn!("failed to remove '.spacedrive' file: {:?}", e); + warn!(?e, "failed to remove '.spacedrive' file;"); } } @@ -181,14 +184,14 @@ impl InitConfig { scan_location(node, &library, location, ScanState::Pending).await?; } else { warn!( - "Debug init error: location '{}' was not found after being created!", - loc.path + location_path = ?loc.path, + "Debug init error: location was not found after being created!", ); } } } - info!("Initialized app from file: {}", self.path.display()); + info!("Initialized app from file"); Ok(()) } diff --git a/core/src/util/mpscrr.rs b/core/src/util/mpscrr.rs index 450a888d3..4c7826bea 100644 --- a/core/src/util/mpscrr.rs +++ b/core/src/util/mpscrr.rs @@ -110,7 +110,7 @@ impl Sender { .await .into_iter() .filter_map(|x| { - x.map_err(|err| match err { + x.map_err(|e| match e { SenderError::Finished(key) => { self.0 .write() diff --git a/core/src/util/version_manager.rs b/core/src/util/version_manager.rs index da4cd37bd..62d42308d 100644 --- a/core/src/util/version_manager.rs +++ b/core/src/util/version_manager.rs @@ -157,9 +157,9 @@ impl< Ok(version) => version, Err(VersionManagerError::VersionFileDoesNotExist) => { warn!( - "Config file for {} does not exist, trying to create a new one with version -> {}", - type_name::(), - Config::LATEST_VERSION + config = %type_name::(), + latest_version = %Config::LATEST_VERSION, + "Config file for does not exist, trying to create a new one with latest version;", ); let Some(latest_config) = Config::from_latest_version() else { @@ -198,8 +198,10 @@ impl< ); info!( - "Running {} migrator: {current} -> {next}", - type_name::() + config = %type_name::(), + %current, + %next, + "Running migrator;", ); migrate_fn(current, next).await?; } @@ -207,7 +209,7 @@ impl< this.set_version(version_file_path, Config::LATEST_VERSION) .await?; } else { - debug!("No migration required for {}", type_name::()); + debug!(config = %type_name::(), "No migration required;"); } fs::read(version_file_path) diff --git a/core/src/volume/mod.rs b/core/src/volume/mod.rs index 494b395b9..1cae4191b 100644 --- a/core/src/volume/mod.rs +++ b/core/src/volume/mod.rs @@ -128,12 +128,8 @@ pub async fn get_volumes() -> Vec { // Ensure disk has a valid device path let real_path = match tokio::fs::canonicalize(disk_name).await { - Err(real_path) => { - error!( - "Failed to canonicalize disk path {}: {:#?}", - disk_name.to_string_lossy(), - real_path - ); + Err(e) => { + error!(?disk_name, ?e, "Failed to canonicalize disk path;",); continue; } Ok(real_path) => real_path, @@ -306,7 +302,7 @@ pub async fn get_volumes() -> Vec { .args(["info", "-plist"]) .output() .await - .map_err(|err| error!("Failed to execute hdiutil: {err:#?}")) + .map_err(|e| error!(?e, "Failed to execute hdiutil;")) .ok() .and_then(|wmic_process| { use std::str::FromStr; @@ -314,8 +310,8 @@ pub async fn get_volumes() -> Vec { if wmic_process.status.success() { let info: Result = plist::from_bytes(&wmic_process.stdout); match info { - Err(err) => { - error!("Failed to parse hdiutil output: {err:#?}"); + Err(e) => { + error!(?e, "Failed to parse hdiutil output;"); None } Ok(info) => Some( @@ -396,7 +392,7 @@ pub async fn get_volumes() -> Vec { ]) .output() .await - .map_err(|err| error!("Failed to 
execute hdiutil: {err:#?}")) + .map_err(|e| error!(?e, "Failed to execute hdiutil;")) .ok() .and_then(|wmic_process| { if wmic_process.status.success() { @@ -413,7 +409,7 @@ pub async fn get_volumes() -> Vec { .trim() .parse::() { - Err(err) => error!("Failed to parse wmic output: {err:#?}"), + Err(e) => error!(?e, "Failed to parse wmic output;"), Ok(n) => total_capacity = n, } } diff --git a/crates/ai/src/old_image_labeler/process.rs b/crates/ai/src/old_image_labeler/process.rs index 8e674f3be..125dbe21c 100644 --- a/crates/ai/src/old_image_labeler/process.rs +++ b/crates/ai/src/old_image_labeler/process.rs @@ -85,7 +85,7 @@ pub(super) async fn spawned_processing( let mut queue = file_paths .into_iter() .filter_map(|file_path| { - if file_path.object_id.is_none() { + if file_path.object.is_none() { errors.push(( file_path.id, ImageLabelerError::IsolateFilePathData(MissingFieldError::new( @@ -201,7 +201,7 @@ pub(super) async fn spawned_processing( let ids = ( file_path.id, - file_path.object_id.expect("already checked above"), + file_path.object.as_ref().expect("already checked above").id, ); if output_tx.is_closed() { diff --git a/crates/media-metadata/src/ffmpeg/mod.rs b/crates/media-metadata/src/ffmpeg/mod.rs index af7740413..702762c86 100644 --- a/crates/media-metadata/src/ffmpeg/mod.rs +++ b/crates/media-metadata/src/ffmpeg/mod.rs @@ -54,6 +54,7 @@ mod extract_data { FFmpegAudioProps, FFmpegChapter, FFmpegCodec, FFmpegMediaData, FFmpegMetadata, FFmpegProgram, FFmpegProps, FFmpegStream, FFmpegSubtitleProps, FFmpegVideoProps, }; + use sd_utils::i64_to_frontend; impl From for super::FFmpegMetadata { fn from( @@ -69,27 +70,9 @@ mod extract_data { ) -> Self { Self { formats, - duration: duration.map(|duration| { - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - { - // SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation - ((duration >> 32) as i32, duration as u32) - } - }), - start_time: start_time.map(|start_time| { - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - { - // SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation - ((start_time >> 32) as i32, start_time as u32) - } - }), - bit_rate: { - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - { - // SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation - ((bit_rate >> 32) as i32, bit_rate as u32) - } - }, + duration: duration.map(i64_to_frontend), + start_time: start_time.map(i64_to_frontend), + bit_rate: i64_to_frontend(bit_rate), chapters: chapters.into_iter().map(Into::into).collect(), programs: programs.into_iter().map(Into::into).collect(), metadata: metadata.into(), @@ -117,20 +100,8 @@ mod extract_data { } }, // TODO: FIX these 2 when rspc/specta supports bigint - start: { - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - { - // SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation - ((start >> 32) as i32, start as u32) - } - }, - end: { - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - { - // SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation - ((end >> 32) as i32, end as u32) - } - }, + start: i64_to_frontend(start), + end: i64_to_frontend(end), time_base_num, time_base_den, metadata: metadata.into(), diff --git a/crates/p2p/crates/block/src/lib.rs b/crates/p2p/crates/block/src/lib.rs index 35a1af8ca..7d6554dfd 100644 --- 
a/crates/p2p/crates/block/src/lib.rs +++ b/crates/p2p/crates/block/src/lib.rs @@ -257,7 +257,7 @@ mod tests { tx.send(()).unwrap(); Transfer::new(&req, |_| {}, &Default::default()) .send(&mut client, file) - .await; + .await } }); @@ -266,7 +266,8 @@ mod tests { let mut result = Vec::new(); Transfer::new(&req, |_| {}, &Default::default()) .receive(&mut server, &mut result) - .await; + .await + .unwrap(); assert_eq!(result, data); } @@ -298,7 +299,7 @@ mod tests { tx.send(()).unwrap(); Transfer::new(&req, |_| {}, &Default::default()) .send(&mut client, file) - .await; + .await } }); @@ -307,7 +308,9 @@ mod tests { let mut result = Vec::new(); Transfer::new(&req, |_| {}, &Default::default()) .receive(&mut server, &mut result) - .await; + .await + .unwrap(); + assert_eq!(result, data); } @@ -339,14 +342,14 @@ mod tests { Transfer::new(&req, |_| {}, &Arc::new(AtomicBool::new(true))) .send(&mut client, file) - .await; + .await } }); rx.await.unwrap(); let mut result = Vec::new(); - Transfer::new(&req, |_| {}, &Default::default()) + let _ = Transfer::new(&req, |_| {}, &Default::default()) .receive(&mut server, &mut result) .await; assert_eq!(result, Vec::::new()); // Cancelled by sender so no data @@ -380,14 +383,14 @@ mod tests { Transfer::new(&req, |_| {}, &Default::default()) .send(&mut client, file) - .await; + .await } }); rx.await.unwrap(); let mut result = Vec::new(); - Transfer::new(&req, |_| {}, &Arc::new(AtomicBool::new(true))) + let _ = Transfer::new(&req, |_| {}, &Arc::new(AtomicBool::new(true))) .receive(&mut server, &mut result) .await; assert_eq!(result, Vec::::new()); // Cancelled by sender so no data @@ -422,14 +425,14 @@ mod tests { Transfer::new(&req, |_| {}, &Default::default()) .send(&mut client, file) - .await; + .await } }); rx.await.unwrap(); let mut result = Vec::new(); - Transfer::new(&req, |_| {}, &Default::default()) + let _ = Transfer::new(&req, |_| {}, &Default::default()) .receive(&mut server, &mut result) .await; assert_eq!(result, Vec::::new()); // Cancelled by sender so no data diff --git a/crates/sync/src/compressed.rs b/crates/sync/src/compressed.rs index bdcd523da..1056a68ad 100644 --- a/crates/sync/src/compressed.rs +++ b/crates/sync/src/compressed.rs @@ -87,6 +87,7 @@ impl CompressedCRDTOperations { }) } + #[must_use] pub fn len(&self) -> usize { self.0 .iter() @@ -98,6 +99,11 @@ impl CompressedCRDTOperations { .sum::() } + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + pub fn into_ops(self) -> Vec { let mut ops = vec![]; diff --git a/crates/task-system/Cargo.toml b/crates/task-system/Cargo.toml index 7ef5eab5e..e1d932230 100644 --- a/crates/task-system/Cargo.toml +++ b/crates/task-system/Cargo.toml @@ -38,3 +38,4 @@ thiserror = { workspace = true } tokio = { workspace = true, features = ["macros", "test-util", "fs"] } tracing-test = { workspace = true, features = ["no-env-filter"] } uuid = { workspace = true, features = ["serde"] } +tracing-subscriber = { workspace = true, features = ["env-filter"] } diff --git a/crates/task-system/src/error.rs b/crates/task-system/src/error.rs index 9b19387f8..8e9747b8e 100644 --- a/crates/task-system/src/error.rs +++ b/crates/task-system/src/error.rs @@ -1,6 +1,6 @@ use std::{error::Error, fmt}; -use super::task::TaskId; +use super::task::{Task, TaskId}; /// Task system's error type definition, representing when internal errors occurs. 
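The ffmpeg metadata hunk above collapses several repeated `(high, low)` splits into a single `sd_utils::i64_to_frontend` call. The helper itself is not part of this excerpt, so the following is only a sketch of the splitting scheme it replaces, mirroring the removed inline code and assuming the same `(i32, u32)` return shape:

// Sketch of the (high, low) split used for values the frontend cannot hold as
// a 64-bit integer; the real helper lives in `sd_utils` and is not shown here.
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
fn i64_to_frontend(value: i64) -> (i32, u32) {
    // High half keeps the sign via an arithmetic shift; low half is the raw
    // lower 32 bits reinterpreted as unsigned, so no information is lost.
    ((value >> 32) as i32, value as u32)
}

fn frontend_to_i64((high, low): (i32, u32)) -> i64 {
    // Reassembling shifts the high half back and ORs in the unsigned low half.
    (i64::from(high) << 32) | i64::from(low)
}

fn main() {
    let bit_rate = 320_000_i64;
    let duration = 5_471_862_302_i64; // larger than u32::MAX
    assert_eq!(frontend_to_i64(i64_to_frontend(bit_rate)), bit_rate);
    assert_eq!(frontend_to_i64(i64_to_frontend(duration)), duration);
}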
#[derive(Debug, thiserror::Error)] @@ -28,3 +28,8 @@ pub trait RunError: Error + fmt::Debug + Send + Sync + 'static {} /// [`std::fmt::Debug`](https://doc.rust-lang.org/std/fmt/trait.Debug.html). /// So you will not need to implement this trait for your error type, just implement the `Error` and `Debug` impl RunError for T {} + +/// A task system dispatcher error type, returning tasks when the task system has shutdown. +#[derive(Debug, thiserror::Error)] +#[error("task system already shutdown and can't dispatch more tasks: ", .0.len())] +pub struct DispatcherShutdownError(pub Vec>>); diff --git a/crates/task-system/src/lib.rs b/crates/task-system/src/lib.rs index ef2ed8eb7..7e6349db5 100644 --- a/crates/task-system/src/lib.rs +++ b/crates/task-system/src/lib.rs @@ -93,7 +93,7 @@ mod system; mod task; mod worker; -pub use error::{RunError, SystemError as TaskSystemError}; +pub use error::{DispatcherShutdownError, RunError, SystemError as TaskSystemError}; pub use system::{ BaseDispatcher as BaseTaskDispatcher, Dispatcher as TaskDispatcher, System as TaskSystem, }; diff --git a/crates/task-system/src/message.rs b/crates/task-system/src/message.rs index f6f8265c7..723506e3f 100644 --- a/crates/task-system/src/message.rs +++ b/crates/task-system/src/message.rs @@ -1,8 +1,11 @@ +use std::sync::Arc; + +use async_channel as chan; use tokio::sync::oneshot; use super::{ error::{RunError, SystemError}, - task::{TaskId, TaskWorkState}, + task::{InternalTaskExecStatus, TaskId, TaskWorkState, TaskWorktable}, worker::WorkerId, }; @@ -12,35 +15,29 @@ pub enum SystemMessage { WorkingReport(WorkerId), ResumeTask { task_id: TaskId, - worker_id: WorkerId, + task_work_table: Arc, ack: oneshot::Sender>, }, PauseNotRunningTask { task_id: TaskId, - worker_id: WorkerId, + task_work_table: Arc, ack: oneshot::Sender>, }, CancelNotRunningTask { task_id: TaskId, - worker_id: WorkerId, - ack: oneshot::Sender<()>, + task_work_table: Arc, + ack: oneshot::Sender>, }, ForceAbortion { task_id: TaskId, - worker_id: WorkerId, + task_work_table: Arc, ack: oneshot::Sender>, }, - NotifyIdleWorkers { - start_from: WorkerId, - task_count: usize, - }, ShutdownRequest(oneshot::Sender>), } -#[derive(Debug)] pub enum WorkerMessage { NewTask(TaskWorkState), - TaskCountRequest(oneshot::Sender), ResumeTask { task_id: TaskId, ack: oneshot::Sender>, @@ -51,13 +48,25 @@ pub enum WorkerMessage { }, CancelNotRunningTask { task_id: TaskId, - ack: oneshot::Sender<()>, + ack: oneshot::Sender>, }, ForceAbortion { task_id: TaskId, ack: oneshot::Sender>, }, ShutdownRequest(oneshot::Sender<()>), - StealRequest(oneshot::Sender>>), - WakeUp, + StealRequest { + stealer_id: WorkerId, + ack: oneshot::Sender, + stolen_task_tx: chan::Sender>>, + }, } + +pub struct TaskRunnerOutput { + pub task_work_state: TaskWorkState, + pub status: InternalTaskExecStatus, +} + +pub struct TaskOutputMessage(pub TaskId, pub Result, ()>); + +pub struct StoleTaskMessage(pub TaskWorkState); diff --git a/crates/task-system/src/system.rs b/crates/task-system/src/system.rs index 41a2c802d..904b0a5eb 100644 --- a/crates/task-system/src/system.rs +++ b/crates/task-system/src/system.rs @@ -15,13 +15,13 @@ use async_channel as chan; use futures::StreamExt; use futures_concurrency::future::Join; use tokio::{spawn, sync::oneshot, task::JoinHandle}; -use tracing::{error, info, trace, warn}; +use tracing::{error, info, instrument, trace, warn, Instrument}; use super::{ - error::{RunError, SystemError}, + error::{DispatcherShutdownError, RunError, SystemError}, message::SystemMessage, - 
task::{IntoTask, Task, TaskHandle, TaskId}, - worker::{AtomicWorkerId, WorkStealer, Worker, WorkerBuilder, WorkerId}, + task::{IntoTask, Task, TaskHandle, TaskId, TaskWorktable}, + worker::{AtomicWorkerId, WorkStealer, Worker, WorkerBuilder}, }; /// The task system is the main entry point for the library, it is responsible for creating and managing the workers @@ -34,17 +34,22 @@ pub struct System { msgs_tx: chan::Sender, dispatcher: BaseDispatcher, handle: RefCell>>, + has_shutdown: Arc, } impl System { /// Created a new task system with a number of workers equal to the available parallelism in the user's machine. pub fn new() -> Self { - let workers_count = std::thread::available_parallelism().map_or_else( - |e| { - error!("Failed to get available parallelism in the job system: {e:#?}"); - 1 - }, - NonZeroUsize::get, + // TODO: Using only the half of available cores, make this configurable on runtime in the future + let workers_count = usize::max( + std::thread::available_parallelism().map_or_else( + |e| { + error!("Failed to get available parallelism in the job system: {e:#?}"); + 1 + }, + NonZeroUsize::get, + ) / 2, + 1, ); let (msgs_tx, msgs_rx) = chan::bounded(8); @@ -79,7 +84,7 @@ impl System { .await { if e.is_panic() { - error!("Job system panicked: {e:#?}"); + error!(?e, "Task system panicked"); } else { trace!("Task system received shutdown signal and will exit..."); break; @@ -91,7 +96,9 @@ impl System { } }); - trace!("Task system online!"); + info!(%workers_count, "Task system online!"); + + let has_shutdown = Arc::new(AtomicBool::new(false)); Self { workers: Arc::clone(&workers), @@ -100,9 +107,10 @@ impl System { workers, idle_workers, last_worker_id: Arc::new(AtomicWorkerId::new(0)), + has_shutdown: Arc::clone(&has_shutdown), }, - handle: RefCell::new(Some(handle)), + has_shutdown, } } @@ -112,15 +120,20 @@ impl System { } /// Dispatches a task to the system, the task will be assigned to a worker and executed as soon as possible. - pub async fn dispatch(&self, into_task: impl IntoTask) -> TaskHandle { + #[allow(clippy::missing_panics_doc)] + pub async fn dispatch( + &self, + into_task: impl IntoTask, + ) -> Result, DispatcherShutdownError> { self.dispatcher.dispatch(into_task).await } /// Dispatches many tasks to the system, the tasks will be assigned to workers and executed as soon as possible. 
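A large share of this diff is the same mechanical change: interpolated messages such as `error!("{e:#?}")` become a constant message plus structured fields. As a small illustration of the `tracing` shorthand being adopted (not code from this repository): `?value` records a field using its `Debug` form, `%value` uses `Display`, and `name = %value` renames the field.

use std::io;
use tracing::{error, info};

fn log_styles(workers_count: usize, e: &io::Error) {
    // Old style: the error text is baked into the message string.
    error!("Task system panicked: {e:#?}");

    // New style: constant message, error attached as a `Debug`-formatted field,
    // which keeps messages groupable and the payload machine-readable.
    error!(?e, "Task system panicked");

    // `%` captures with `Display`; an explicit field name can be given as well.
    info!(%workers_count, "Task system online!");
    info!(worker_count = %workers_count, "Task system online!");
}

fn main() {
    // `tracing-subscriber` is the dev-dependency added in the Cargo.toml hunk above.
    tracing_subscriber::fmt().init();
    log_styles(4, &io::Error::new(io::ErrorKind::Other, "boom"));
}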
+ #[allow(clippy::missing_panics_doc)] pub async fn dispatch_many> + Send>( &self, into_tasks: I, - ) -> Vec> + ) -> Result>, DispatcherShutdownError> where ::IntoIter: Send, { @@ -142,84 +155,50 @@ impl System { while let Some(msg) = msg_stream.next().await { match msg { SystemMessage::IdleReport(worker_id) => { - trace!("Task system received a worker idle report request: "); idle_workers[worker_id].store(true, Ordering::Relaxed); } SystemMessage::WorkingReport(worker_id) => { - trace!( - "Task system received a working report request: " - ); idle_workers[worker_id].store(false, Ordering::Relaxed); } SystemMessage::ResumeTask { task_id, - worker_id, + task_work_table, ack, - } => { - trace!("Task system received a task resume request: "); - workers[worker_id].resume_task(task_id, ack).await; - } + } => dispatch_resume_request(&workers, task_id, task_work_table, ack), SystemMessage::PauseNotRunningTask { task_id, - worker_id, + task_work_table, ack, } => { - trace!("Task system received a task resume request: "); - workers[worker_id] - .pause_not_running_task(task_id, ack) - .await; + dispatch_pause_not_running_task_request( + &workers, + task_id, + task_work_table, + ack, + ); } SystemMessage::CancelNotRunningTask { task_id, - worker_id, + task_work_table, ack, - } => { - trace!("Task system received a task resume request: "); - workers[worker_id] - .cancel_not_running_task(task_id, ack) - .await; - } + } => dispatch_cancel_not_running_task_request( + &workers, + task_id, + task_work_table, + ack, + ), SystemMessage::ForceAbortion { task_id, - worker_id, + task_work_table, ack, - } => { - trace!( - "Task system received a task force abortion request: \ - " - ); - workers[worker_id].force_task_abortion(task_id, ack).await; - } - - SystemMessage::NotifyIdleWorkers { - start_from, - task_count, - } => { - trace!( - "Task system received a request to notify idle workers: \ - " - ); - - for idx in (0..workers.len()) - .cycle() - .skip(start_from) - .take(usize::min(task_count, workers.len())) - { - if idle_workers[idx].load(Ordering::Relaxed) { - workers[idx].wake().await; - // we don't mark the worker as not idle because we wait for it to - // successfully steal a task and then report it back as active - } - } - } + } => dispatch_force_abortion_task_request(&workers, task_id, task_work_table, ack), SystemMessage::ShutdownRequest(tx) => { - trace!("Task system received a shutdown request"); tx.send(Ok(())) .expect("System channel closed trying to shutdown"); return; @@ -235,6 +214,7 @@ impl System { /// If the system message channel is closed for some unknown reason or if we fail to respond to /// oneshot channel with shutdown response. 
pub async fn shutdown(&self) { + self.has_shutdown.store(true, Ordering::Release); if let Some(handle) = self .handle .try_borrow_mut() @@ -263,7 +243,7 @@ impl System { } if let Err(e) = handle.await { - error!("Task system failed to shutdown on handle await: {e:#?}"); + error!(?e, "Task system failed to shutdown on handle await"); } } else { warn!("Trying to shutdown the tasks system that was already shutdown"); @@ -271,6 +251,173 @@ impl System { } } +#[instrument(skip(workers, ack))] +fn dispatch_resume_request( + workers: &Arc>>, + task_id: TaskId, + task_work_table: Arc, + ack: oneshot::Sender>, +) { + trace!("Task system received a task resume request"); + spawn( + { + let workers = Arc::clone(workers); + async move { + let (tx, rx) = oneshot::channel(); + let first_attempt_worker_id = task_work_table.worker_id(); + workers[first_attempt_worker_id] + .resume_task(task_id, tx) + .await; + let res = rx + .await + .expect("Task system channel closed trying to resume not running task"); + + if matches!(res, Err(SystemError::TaskNotFound(_))) { + warn!( + %first_attempt_worker_id, + "Failed the first try to resume a not running task, trying again", + ); + workers[task_work_table.worker_id()] + .resume_task(task_id, ack) + .await; + } else { + ack.send(res) + .expect("System channel closed trying to resume not running task"); + } + } + } + .in_current_span(), + ); + trace!("Task system resumed task"); +} + +#[instrument(skip(workers, ack, task_work_table))] +fn dispatch_pause_not_running_task_request( + workers: &Arc>>, + task_id: TaskId, + task_work_table: Arc, + ack: oneshot::Sender>, +) { + spawn( + { + let workers: Arc>> = Arc::clone(workers); + + async move { + let (tx, rx) = oneshot::channel(); + let first_attempt_worker_id = task_work_table.worker_id(); + workers[first_attempt_worker_id] + .pause_not_running_task(task_id, tx) + .await; + let res = rx + .await + .expect("Task system channel closed trying to pause not running task"); + + if matches!(res, Err(SystemError::TaskNotFound(_))) { + warn!( + %first_attempt_worker_id, + "Failed the first try to pause a not running task, trying again", + ); + workers[task_work_table.worker_id()] + .pause_not_running_task(task_id, ack) + .await; + } else { + ack.send(res) + .expect("System channel closed trying to pause not running task"); + } + } + } + .in_current_span(), + ); +} + +#[instrument(skip(workers, ack))] +fn dispatch_cancel_not_running_task_request( + workers: &Arc>>, + task_id: TaskId, + task_work_table: Arc, + ack: oneshot::Sender>, +) { + trace!("Task system received a task cancel request"); + spawn( + { + let workers = Arc::clone(workers); + async move { + let (tx, rx) = oneshot::channel(); + let first_attempt_worker_id = task_work_table.worker_id(); + workers[first_attempt_worker_id] + .cancel_not_running_task(task_id, tx) + .await; + let res = rx + .await + .expect("Task system channel closed trying to cancel a not running task"); + + if matches!(res, Err(SystemError::TaskNotFound(_))) { + if task_work_table.is_finalized() { + return ack + .send(Ok(())) + .expect("System channel closed trying to cancel a not running task"); + } + + warn!( + %first_attempt_worker_id, + "Failed the first try to cancel a not running task, trying again", + ); + workers[task_work_table.worker_id()] + .cancel_not_running_task(task_id, ack) + .await; + } else { + ack.send(res) + .expect("System channel closed trying to cancel not running task"); + } + } + } + .in_current_span(), + ); + + trace!("Task system canceled task"); +} + 
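The `dispatch_resume_request`, `dispatch_pause_not_running_task_request` and `dispatch_cancel_not_running_task_request` helpers above all share one shape: ask the worker currently recorded in the task's work table, and if that worker replies `TaskNotFound` (the task was stolen in the meantime) re-read the worker id and forward the original `ack` to a single second attempt. A simplified, hedged sketch of that shape, with placeholder closures standing in for the real worker channels:

use std::cell::Cell;

// `read_worker_id` re-reads the work table; `send_to_worker` is the per-worker
// request; both are invented stand-ins, not this crate's API.
fn request_with_one_retry<T, E>(
    read_worker_id: impl Fn() -> usize,
    send_to_worker: impl Fn(usize) -> Result<T, E>,
    is_task_not_found: impl Fn(&E) -> bool,
) -> Result<T, E> {
    let first_attempt = read_worker_id();
    match send_to_worker(first_attempt) {
        // The task moved between the read and the request; the thief already
        // updated the work table, so one re-read is enough.
        Err(e) if is_task_not_found(&e) => send_to_worker(read_worker_id()),
        other => other,
    }
}

fn main() {
    // Toy run: the table still points at worker 0, but the task was stolen by
    // worker 1, which has already recorded itself as the new owner.
    let work_table = Cell::new(0_usize);
    let result = request_with_one_retry(
        || {
            let id = work_table.get();
            work_table.set(1); // simulate the thief's update becoming visible
            id
        },
        |worker_id| {
            if worker_id == 1 {
                Ok("resumed")
            } else {
                Err("task not found")
            }
        },
        |e: &&str| *e == "task not found",
    );
    assert_eq!(result, Ok("resumed"));
}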
+#[instrument(skip(workers, ack))] +fn dispatch_force_abortion_task_request( + workers: &Arc>>, + task_id: TaskId, + task_work_table: Arc, + ack: oneshot::Sender>, +) { + trace!("Task system received a task force abortion request"); + spawn( + { + let workers = Arc::clone(workers); + async move { + let (tx, rx) = oneshot::channel(); + let first_attempt_worker_id = task_work_table.worker_id(); + workers[first_attempt_worker_id] + .force_task_abortion(task_id, tx) + .await; + let res = rx.await.expect( + "Task system channel closed trying to force abortion of a not running task", + ); + + if matches!(res, Err(SystemError::TaskNotFound(_))) { + warn!( + %first_attempt_worker_id, + "Failed the first try to force abortion of a not running task, trying again", + ); + workers[task_work_table.worker_id()] + .force_task_abortion(task_id, ack) + .await; + } else { + ack.send(res).expect( + "System channel closed trying to force abortion of a not running task", + ); + } + } + } + .in_current_span(), + ); + trace!("Task system aborted task"); +} + /// The default implementation of the task system will create a system with a number of workers equal to the available /// parallelism in the user's machine. impl Default for System { @@ -288,104 +435,118 @@ unsafe impl Sync for System {} pub struct SystemComm(chan::Sender); impl SystemComm { - pub async fn idle_report(&self, worker_id: usize) { - self.0 - .send(SystemMessage::IdleReport(worker_id)) - .await - .expect("System channel closed trying to report idle"); + pub fn idle_report(&self, worker_id: usize) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::IdleReport(worker_id)) + .await + .expect("System channel closed trying to report idle"); + } + .in_current_span(), + ); } - pub async fn working_report(&self, worker_id: usize) { - self.0 - .send(SystemMessage::WorkingReport(worker_id)) - .await - .expect("System channel closed trying to report working"); + pub fn working_report(&self, worker_id: usize) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::WorkingReport(worker_id)) + .await + .expect("System channel closed trying to report working"); + } + .in_current_span(), + ); } - pub async fn pause_not_running_task( + pub fn pause_not_running_task( &self, task_id: TaskId, - worker_id: WorkerId, - ) -> Result<(), SystemError> { - let (tx, rx) = oneshot::channel(); - - self.0 - .send(SystemMessage::PauseNotRunningTask { - task_id, - worker_id, - ack: tx, - }) - .await - .expect("System channel closed trying to pause not running task"); - - rx.await - .expect("System channel closed trying receive pause not running task response") + task_work_table: Arc, + ack: oneshot::Sender>, + ) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::PauseNotRunningTask { + task_id, + task_work_table, + ack, + }) + .await + .expect("System channel closed trying to pause not running task"); + } + .in_current_span(), + ); } - pub async fn cancel_not_running_task(&self, task_id: TaskId, worker_id: WorkerId) { - let (tx, rx) = oneshot::channel(); - - self.0 - .send(SystemMessage::CancelNotRunningTask { - task_id, - worker_id, - ack: tx, - }) - .await - .expect("System channel closed trying to cancel a not running task"); - - rx.await - .expect("System channel closed trying receive cancel a not running task response"); - } - - pub async fn request_help(&self, worker_id: WorkerId, task_count: usize) { - self.0 - 
.send(SystemMessage::NotifyIdleWorkers { - start_from: worker_id, - task_count, - }) - .await - .expect("System channel closed trying to request help"); - } - - pub async fn resume_task( + pub fn cancel_not_running_task( &self, task_id: TaskId, - worker_id: WorkerId, - ) -> Result<(), SystemError> { - let (tx, rx) = oneshot::channel(); - - self.0 - .send(SystemMessage::ResumeTask { - task_id, - worker_id, - ack: tx, - }) - .await - .expect("System channel closed trying to resume task"); - - rx.await - .expect("System channel closed trying receive resume task response") + task_work_table: Arc, + ack: oneshot::Sender>, + ) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::CancelNotRunningTask { + task_id, + task_work_table, + ack, + }) + .await + .expect("System channel closed trying to cancel a not running task"); + } + .in_current_span(), + ); } - pub async fn force_abortion( + pub fn resume_task( &self, task_id: TaskId, - worker_id: WorkerId, - ) -> Result<(), SystemError> { - let (tx, rx) = oneshot::channel(); + task_work_table: Arc, + ack: oneshot::Sender>, + ) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::ResumeTask { + task_id, + task_work_table, + ack, + }) + .await + .expect("System channel closed trying to resume task"); + } + .in_current_span(), + ); + } - self.0 - .send(SystemMessage::ForceAbortion { - task_id, - worker_id, - ack: tx, - }) - .await - .expect("System channel closed trying to resume task"); - - rx.await - .expect("System channel closed trying receive resume task response") + pub fn force_abortion( + &self, + task_id: TaskId, + task_work_table: Arc, + ack: oneshot::Sender>, + ) { + let system_tx = self.0.clone(); + spawn( + async move { + system_tx + .send(SystemMessage::ForceAbortion { + task_id, + task_work_table, + ack, + }) + .await + .expect("System channel closed trying to resume task"); + } + .in_current_span(), + ); } } @@ -398,11 +559,24 @@ pub struct BaseDispatcher { workers: Arc>>, idle_workers: Arc>, last_worker_id: Arc, + has_shutdown: Arc, } +/// A trait that represents a dispatcher that can be used to dispatch tasks to the system. +/// It can be used to dispatch tasks to the system from other threads or tasks. +/// +/// The `E: RunError` error parameter is the error type that the dispatcher can return. +/// Although the [`BaseDispatcher`] which is the default implementation of this trait, will always returns +/// a [`Result`] with the [`TaskHandle`] in the [`Ok`] variant, it can be used to implement a custom +/// fallible dispatcher that returns an [`Err`] variant with a custom error type. pub trait Dispatcher: fmt::Debug + Clone + Send + Sync + 'static { + type DispatchError: RunError; + /// Dispatches a task to the system, the task will be assigned to a worker and executed as soon as possible. - fn dispatch(&self, into_task: impl IntoTask) -> impl Future> + Send { + fn dispatch( + &self, + into_task: impl IntoTask, + ) -> impl Future, Self::DispatchError>> + Send { self.dispatch_boxed(into_task.into_task()) } @@ -411,15 +585,15 @@ pub trait Dispatcher: fmt::Debug + Clone + Send + Sync + 'static { fn dispatch_boxed( &self, boxed_task: Box>, - ) -> impl Future> + Send; + ) -> impl Future, Self::DispatchError>> + Send; /// Dispatches many tasks to the system, the tasks will be assigned to workers and executed as soon as possible. 
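The new doc comment on `Dispatcher` notes that, even though `BaseDispatcher` only ever fails with `DispatcherShutdownError`, the associated `DispatchError` type lets downstream code build its own fallible dispatcher. A hedged sketch of what such an implementation could look like, wrapping the base dispatcher and translating its error; the crate name, the re-export of `Task`/`TaskHandle`, the exact generic parameters, and the use of `async fn` to satisfy the `impl Future + Send` signatures are all assumptions here, since only the `TaskDispatcher` re-export is visible in the `lib.rs` hunk:

use sd_task_system::{
    BaseTaskDispatcher, DispatcherShutdownError, RunError, Task, TaskDispatcher, TaskHandle,
};

// Application-side error; the blanket `impl RunError for T` above makes any
// `Error + Debug + Send + Sync + 'static` type usable as a `DispatchError`.
#[derive(Debug, thiserror::Error)]
#[error("job system refused {0} task(s): the task system is shutting down")]
pub struct JobDispatchError(usize);

#[derive(Debug)]
pub struct JobDispatcher<E: RunError>(BaseTaskDispatcher<E>);

// Manual Clone so we don't pick up an unnecessary `E: Clone` bound from derive.
impl<E: RunError> Clone for JobDispatcher<E> {
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

impl<E: RunError> TaskDispatcher<E> for JobDispatcher<E> {
    type DispatchError = JobDispatchError;

    async fn dispatch_boxed(
        &self,
        boxed_task: Box<dyn Task<E>>,
    ) -> Result<TaskHandle<E>, Self::DispatchError> {
        self.0
            .dispatch_boxed(boxed_task)
            .await
            .map_err(|DispatcherShutdownError(tasks)| JobDispatchError(tasks.len()))
    }

    async fn dispatch_many_boxed(
        &self,
        boxed_tasks: impl IntoIterator<Item = Box<dyn Task<E>>> + Send,
    ) -> Result<Vec<TaskHandle<E>>, Self::DispatchError> {
        self.0
            .dispatch_many_boxed(boxed_tasks)
            .await
            .map_err(|DispatcherShutdownError(tasks)| JobDispatchError(tasks.len()))
    }
}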
fn dispatch_many> + Send>( &self, into_tasks: I, - ) -> impl Future>> + Send + ) -> impl Future>, Self::DispatchError>> + Send where - ::IntoIter: Send, + I::IntoIter: Send, { self.dispatch_many_boxed(into_tasks.into_iter().map(IntoTask::into_task)) } @@ -429,7 +603,7 @@ pub trait Dispatcher: fmt::Debug + Clone + Send + Sync + 'static { fn dispatch_many_boxed( &self, boxed_tasks: impl IntoIterator>> + Send, - ) -> impl Future>> + Send; + ) -> impl Future>, Self::DispatchError>> + Send; } impl Clone for BaseDispatcher { @@ -438,17 +612,23 @@ impl Clone for BaseDispatcher { workers: Arc::clone(&self.workers), idle_workers: Arc::clone(&self.idle_workers), last_worker_id: Arc::clone(&self.last_worker_id), + has_shutdown: Arc::clone(&self.has_shutdown), } } } impl Dispatcher for BaseDispatcher { - async fn dispatch(&self, into_task: impl IntoTask) -> TaskHandle { - self.dispatch_boxed(into_task.into_task()).await - } + type DispatchError = DispatcherShutdownError; #[allow(clippy::missing_panics_doc)] - async fn dispatch_boxed(&self, task: Box>) -> TaskHandle { + async fn dispatch_boxed( + &self, + task: Box>, + ) -> Result, Self::DispatchError> { + if self.has_shutdown.load(Ordering::Acquire) { + return Err(DispatcherShutdownError(vec![task])); + } + let worker_id = self .last_worker_id .fetch_update(Ordering::Release, Ordering::Acquire, |last_worker_id| { @@ -456,35 +636,27 @@ impl Dispatcher for BaseDispatcher { }) .expect("we hardcoded the update function to always return Some(next_worker_id) through dispatcher"); - trace!( - "Dispatching task to worker: ", - task.id() - ); + trace!(%worker_id, task_id = %task.id(), "Dispatching task to worker"); + let handle = self.workers[worker_id].add_task(task).await; self.idle_workers[worker_id].store(false, Ordering::Relaxed); - handle + Ok(handle) } async fn dispatch_many_boxed( &self, into_tasks: impl IntoIterator>> + Send, - ) -> Vec> { - let mut workers_task_count = self - .workers - .iter() - .map(|worker| async move { (worker.id, worker.task_count().await) }) - .collect::>() - .join() - .await; - - workers_task_count.sort_by_key(|(_id, count)| *count); + ) -> Result>, Self::DispatchError> { + if self.has_shutdown.load(Ordering::Acquire) { + return Err(DispatcherShutdownError(into_tasks.into_iter().collect())); + } let (handles, workers_ids_set) = into_tasks .into_iter() - .zip(workers_task_count.into_iter().cycle()) - .map(|(task, (worker_id, _))| async move { + .zip((0..self.workers.len()).cycle()) + .map(|(task, worker_id)| async move { (self.workers[worker_id].add_task(task).await, worker_id) }) .collect::>() @@ -497,7 +669,7 @@ impl Dispatcher for BaseDispatcher { self.idle_workers[worker_id].store(false, Ordering::Relaxed); } - handles + Ok(handles) } } diff --git a/crates/task-system/src/task.rs b/crates/task-system/src/task.rs index 7804a01ca..7c5125983 100644 --- a/crates/task-system/src/task.rs +++ b/crates/task-system/src/task.rs @@ -1,9 +1,9 @@ use std::{ fmt, future::{Future, IntoFuture}, - pin::Pin, + pin::{pin, Pin}, sync::{ - atomic::{AtomicBool, AtomicU8, Ordering}, + atomic::{AtomicBool, Ordering}, Arc, }, task::{Context, Poll}, @@ -12,10 +12,10 @@ use std::{ use async_channel as chan; use async_trait::async_trait; -use chan::{Recv, RecvError}; use downcast_rs::{impl_downcast, Downcast}; -use tokio::{runtime::Handle, sync::oneshot}; -use tracing::{trace, warn}; +use futures::StreamExt; +use tokio::{spawn, sync::oneshot}; +use tracing::{error, instrument, trace, warn, Instrument}; use uuid::Uuid; use super::{ @@ -32,12 +32,18 @@ 
pub type TaskId = Uuid; /// The user will downcast it to the concrete type that the task returns. Most of the time, /// tasks will not return anything, so it isn't a costly abstraction, as only a heap allocation /// is needed when the user wants to return a [`Box`]. -pub trait AnyTaskOutput: Send + fmt::Debug + Downcast + 'static {} +pub trait AnyTaskOutput: Send + Downcast + 'static {} + +impl fmt::Debug for Box { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "") + } +} impl_downcast!(AnyTaskOutput); -/// Blanket implementation for all types that implements `std::fmt::Debug + Send + 'static` -impl AnyTaskOutput for T {} +/// Blanket implementation for all types that implements `Send + 'static` +impl AnyTaskOutput for T {} /// A helper trait to convert any type that implements [`AnyTaskOutput`] into a [`TaskOutput`], boxing it. pub trait IntoAnyTaskOutput { @@ -130,7 +136,7 @@ impl + 'static, E: RunError> IntoTask for T { /// We're currently using the [`async_trait`](https://docs.rs/async-trait) crate to allow dyn async traits, /// due to a limitation in the Rust language. #[async_trait] -pub trait Task: fmt::Debug + Downcast + Send + Sync + 'static { +pub trait Task: Downcast + Send + Sync + 'static { /// An unique identifier for the task, it will be used to identify the task on the system and also to the user. fn id(&self) -> TaskId; @@ -161,6 +167,12 @@ pub trait Task: fmt::Debug + Downcast + Send + Sync + 'static { impl_downcast!(Task where E: RunError); +impl fmt::Debug for Box> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "") + } +} + pub trait SerializableTask: Task where Self: Sized, @@ -181,8 +193,7 @@ where #[pin_project::pin_project] pub struct InterrupterFuture<'recv> { #[pin] - fut: Recv<'recv, InterruptionRequest>, - has_interrupted: &'recv AtomicU8, + fut: chan::Recv<'recv, InterruptionRequest>, } impl Future for InterrupterFuture<'_> { @@ -193,13 +204,19 @@ impl Future for InterrupterFuture<'_> { match this.fut.poll(cx) { Poll::Ready(Ok(InterruptionRequest { kind, ack })) => { + trace!(?kind, "Running task received interruption request"); if ack.send(()).is_err() { warn!("TaskInterrupter ack channel closed"); } - this.has_interrupted.store(kind as u8, Ordering::Relaxed); + if let InternalInterruptionKind::Suspend(has_suspended) = &kind { + has_suspended.store(true, Ordering::SeqCst); + } + + let kind = kind.into(); + Poll::Ready(kind) } - Poll::Ready(Err(RecvError)) => { + Poll::Ready(Err(chan::RecvError)) => { // In case the task handle was dropped, we can't receive any more interrupt messages // so we will never interrupt and the task will run freely until ended warn!("Task interrupter channel closed, will run task until it finishes!"); @@ -220,7 +237,6 @@ impl<'recv> IntoFuture for &'recv Interrupter { fn into_future(self) -> Self::IntoFuture { InterrupterFuture { fut: self.interrupt_rx.recv(), - has_interrupted: &self.has_interrupted, } } } @@ -230,47 +246,68 @@ impl<'recv> IntoFuture for &'recv Interrupter { #[derive(Debug)] pub struct Interrupter { interrupt_rx: chan::Receiver, - has_interrupted: AtomicU8, +} + +impl Drop for Interrupter { + fn drop(&mut self) { + if !self.interrupt_rx.is_closed() { + self.close(); + } + } } impl Interrupter { pub(crate) fn new(interrupt_tx: chan::Receiver) -> Self { Self { interrupt_rx: interrupt_tx, - has_interrupted: AtomicU8::new(0), } } /// Check if the user requested a pause or a cancel, returning the kind of interruption that was requested /// in a non-blocking manner. 
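`try_check_interrupt` (and the `check_interruption!` macro further down) is the cooperative half of the pause/cancel/suspend machinery: task bodies are expected to poll it at safe points and translate the answer into an `ExecStatus`. A hedged sketch of that pattern inside a task's run loop; `Chunk`, `process_chunk` and the error type are invented, and `ExecStatus::Done(TaskOutput::Empty)` assumes the crate's existing output variants:

// Illustrative task body only, not code from this diff.
async fn run_chunks(
    interrupter: &Interrupter,
    chunks: Vec<Chunk>,
) -> Result<ExecStatus, ChunkError> {
    for chunk in chunks {
        // Safe point between units of work: a user pause, an internal suspend,
        // or a cancel all come back as an InterruptionKind, already acked.
        match interrupter.try_check_interrupt() {
            Some(InterruptionKind::Cancel) => return Ok(ExecStatus::Canceled),
            Some(InterruptionKind::Pause) => return Ok(ExecStatus::Paused),
            None => { /* keep going */ }
        }
        // `check_interruption!(interrupter);` expands to the same match, so
        // real task code usually just uses the macro shown below.
        process_chunk(chunk).await?;
    }
    Ok(ExecStatus::Done(TaskOutput::Empty))
}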
pub fn try_check_interrupt(&self) -> Option { - InterruptionKind::load(&self.has_interrupted).map_or_else( - || { - if let Ok(InterruptionRequest { kind, ack }) = self.interrupt_rx.try_recv() { - if ack.send(()).is_err() { - warn!("TaskInterrupter ack channel closed"); - } + if let Ok(InterruptionRequest { kind, ack }) = self.interrupt_rx.try_recv() { + trace!(?kind, "Interrupter received interruption request"); - self.has_interrupted.store(kind as u8, Ordering::Relaxed); + if let InternalInterruptionKind::Suspend(has_suspended) = &kind { + has_suspended.store(true, Ordering::SeqCst); + } - Some(kind) - } else { - None - } - }, - Some, - ) + let kind = kind.into(); + + if ack.send(()).is_err() { + warn!("TaskInterrupter ack channel closed"); + } + + Some(kind) + } else { + None + } } - pub(super) fn reset(&self) { - self.has_interrupted - .compare_exchange( - InterruptionKind::Pause as u8, - 0, - Ordering::Release, - Ordering::Relaxed, - ) - .expect("we must only reset paused tasks"); + pub(super) fn close(&self) { + self.interrupt_rx.close(); + if !self.interrupt_rx.is_empty() { + trace!("Pending interruption requests were not handled"); + spawn({ + let interrupt_rx = self.interrupt_rx.clone(); + + async move { + let mut interrupt_stream = pin!(interrupt_rx); + + while let Some(InterruptionRequest { kind, ack }) = + interrupt_stream.next().await + { + trace!( + ?kind, + "Interrupter received interruption request after task was completed" + ); + ack.send(()).expect("Interrupter ack channel closed"); + } + } + .in_current_span() + }); + } } } @@ -280,8 +317,14 @@ macro_rules! check_interruption { let interrupter: &Interrupter = $interrupter; match interrupter.try_check_interrupt() { - Some($crate::InterruptionKind::Cancel) => return Ok($crate::ExecStatus::Canceled), - Some($crate::InterruptionKind::Pause) => return Ok($crate::ExecStatus::Paused), + Some($crate::InterruptionKind::Cancel) => { + ::tracing::trace!("Task was canceled by the user"); + return Ok($crate::ExecStatus::Canceled); + } + Some($crate::InterruptionKind::Pause) => { + ::tracing::trace!("Task was paused by the user or suspended by the task system"); + return Ok($crate::ExecStatus::Paused); + } None => { /* Everything is Awesome! */ } } }; @@ -294,11 +337,13 @@ macro_rules! check_interruption { match interrupter.try_check_interrupt() { Some($crate::InterruptionKind::Cancel) => { *duration_accumulator += instant.elapsed(); + ::tracing::trace!("Task was canceled by the user"); return Ok($crate::ExecStatus::Canceled); } Some($crate::InterruptionKind::Pause) => { *duration_accumulator += instant.elapsed(); + ::tracing::trace!("Task was paused by the user or suspended by the task system"); return Ok($crate::ExecStatus::Paused); } @@ -309,25 +354,30 @@ macro_rules! 
check_interruption { /// The kind of interruption that can be requested by the user, a pause or a cancel #[derive(Debug, Clone, Copy)] -#[repr(u8)] pub enum InterruptionKind { - Pause = 1, - Cancel = 2, + Pause, + Cancel, } -impl InterruptionKind { - fn load(kind: &AtomicU8) -> Option { - match kind.load(Ordering::Relaxed) { - 1 => Some(Self::Pause), - 2 => Some(Self::Cancel), - _ => None, +#[derive(Debug, Clone)] +enum InternalInterruptionKind { + Pause, + Suspend(Arc), + Cancel, +} + +impl From for InterruptionKind { + fn from(kind: InternalInterruptionKind) -> Self { + match kind { + InternalInterruptionKind::Pause | InternalInterruptionKind::Suspend(_) => Self::Pause, + InternalInterruptionKind::Cancel => Self::Cancel, } } } #[derive(Debug)] pub struct InterruptionRequest { - kind: InterruptionKind, + kind: InternalInterruptionKind, ack: oneshot::Sender<()>, } @@ -351,32 +401,43 @@ impl TaskRemoteController { /// # Panics /// /// Will panic if the worker failed to ack the pause request + #[instrument(skip(self), fields(task_id = %self.task_id), err)] pub async fn pause(&self) -> Result<(), SystemError> { - let is_paused = self.worktable.is_paused.load(Ordering::Relaxed); - let is_canceled = self.worktable.is_canceled.load(Ordering::Relaxed); - let is_done = self.worktable.is_done.load(Ordering::Relaxed); + if self.worktable.is_finalized() { + trace!("Task is finalized, will not pause"); + return Ok(()); + } - trace!("Received pause command task: "); + let is_paused = self.worktable.is_paused.load(Ordering::Acquire); + let is_canceled = self.worktable.has_canceled.load(Ordering::Acquire); + let is_done = self.worktable.is_done.load(Ordering::Acquire); + + trace!(%is_canceled, %is_done, "Received pause command task"); if !is_paused && !is_canceled && !is_done { - if self.worktable.is_running.load(Ordering::Relaxed) { + if self.worktable.is_running.load(Ordering::Acquire) { let (tx, rx) = oneshot::channel(); trace!("Task is running, sending pause request"); - self.worktable.pause(tx).await; + self.worktable.pause(tx); rx.await.expect("Worker failed to ack pause request"); } else { - trace!("Task is not running, setting is_paused flag"); - self.worktable.is_paused.store(true, Ordering::Relaxed); - return self - .system_comm - .pause_not_running_task( - self.task_id, - self.worktable.current_worker_id.load(Ordering::Relaxed), - ) - .await; + trace!("Task is not running, setting is_paused flag and communicating with system"); + self.worktable.is_paused.store(true, Ordering::Release); + + let (tx, rx) = oneshot::channel(); + + self.system_comm.pause_not_running_task( + self.task_id, + Arc::clone(&self.worktable), + tx, + ); + + return rx + .await + .expect("Worker failed to ack pause not running task request"); } } @@ -388,60 +449,103 @@ impl TaskRemoteController { /// # Panics /// /// Will panic if the worker failed to ack the cancel request - pub async fn cancel(&self) { - let is_canceled = self.worktable.is_canceled.load(Ordering::Relaxed); - let is_done = self.worktable.is_done.load(Ordering::Relaxed); + #[instrument(skip(self), fields(task_id = %self.task_id))] + pub async fn cancel(&self) -> Result<(), SystemError> { + if self.worktable.is_finalized() { + trace!("Task is finalized, will not cancel"); + return Ok(()); + } - trace!("Received cancel command task: "); + let is_canceled = self.worktable.has_canceled(); + let is_done = self.worktable.is_done(); + + trace!(%is_canceled, %is_done, "Received cancel command task"); if !is_canceled && !is_done { - if 
self.worktable.is_running.load(Ordering::Relaxed) { + if self.worktable.is_running() { let (tx, rx) = oneshot::channel(); trace!("Task is running, sending cancel request"); - self.worktable.cancel(tx).await; + self.worktable.cancel(tx); rx.await.expect("Worker failed to ack cancel request"); } else { - trace!("Task is not running, setting is_canceled flag"); - self.worktable.is_canceled.store(true, Ordering::Relaxed); - self.system_comm - .cancel_not_running_task( - self.task_id, - self.worktable.current_worker_id.load(Ordering::Relaxed), - ) - .await; + trace!( + "Task is not running, setting is_canceled flag and communicating with system" + ); + self.worktable.has_canceled.store(true, Ordering::Release); + + let (tx, rx) = oneshot::channel(); + + self.system_comm.cancel_not_running_task( + self.task_id, + Arc::clone(&self.worktable), + tx, + ); + + return rx + .await + .expect("Worker failed to ack cancel not running task request"); } } + + Ok(()) } /// Forcefully abort the task, this can lead to corrupted data or inconsistent states, so use it with caution. + /// + /// # Panics + /// + /// Will panic if the worker failed to ack the forced abortion request + #[instrument(skip(self), fields(task_id = %self.task_id), err)] pub async fn force_abortion(&self) -> Result<(), SystemError> { + if self.worktable.is_finalized() { + trace!("Task is finalized, will not force abortion"); + return Ok(()); + } + trace!("Received force abortion command task"); self.worktable.set_aborted(); + + let (tx, rx) = oneshot::channel(); + self.system_comm - .force_abortion( - self.task_id, - self.worktable.current_worker_id.load(Ordering::Relaxed), - ) - .await + .force_abortion(self.task_id, Arc::clone(&self.worktable), tx); + + rx.await + .expect("Worker failed to ack force abortion request") } /// Marks the task to be resumed by the task system, the worker will start processing it if there is a slot /// available or will be enqueued otherwise. 
+ /// + /// # Panics + /// + /// Will panic if the worker failed to ack the resume request + #[instrument(skip(self), fields(task_id = %self.task_id), err)] pub async fn resume(&self) -> Result<(), SystemError> { + if self.worktable.is_finalized() { + trace!("Task is finalized, will not resume"); + return Ok(()); + } + trace!("Received resume command task"); + + let (tx, rx) = oneshot::channel(); + self.system_comm - .resume_task( - self.task_id, - self.worktable.current_worker_id.load(Ordering::Relaxed), - ) - .await + .resume_task(self.task_id, Arc::clone(&self.worktable), tx); + + rx.await.expect("Worker failed to ack resume request") } /// Verify if the task was already completed #[must_use] pub fn is_done(&self) -> bool { - self.worktable.is_done.load(Ordering::Relaxed) + self.worktable.is_done() + | self.worktable.has_shutdown() + | self.worktable.has_aborted() + | self.worktable.has_canceled() + | self.worktable.has_failed() } } @@ -471,21 +575,13 @@ impl TaskHandle { } /// Gracefully pause the task at a safe point defined by the user using the [`Interrupter`] - /// - /// # Panics - /// - /// Will panic if the worker failed to ack the pause request pub async fn pause(&self) -> Result<(), SystemError> { self.controller.pause().await } /// Gracefully cancel the task at a safe point defined by the user using the [`Interrupter`] - /// - /// # Panics - /// - /// Will panic if the worker failed to ack the cancel request - pub async fn cancel(&self) { - self.controller.cancel().await; + pub async fn cancel(&self) -> Result<(), SystemError> { + self.controller.cancel().await } /// Forcefully abort the task, this can lead to corrupted data or inconsistent states, so use it with caution. @@ -508,20 +604,41 @@ impl TaskHandle { } /// A helper struct when you just want to cancel a task if its `TaskHandle` gets dropped. -pub struct CancelTaskOnDrop(pub TaskHandle); +pub struct CancelTaskOnDrop(Option>); + +impl CancelTaskOnDrop { + /// Create a new `CancelTaskOnDrop` object with the given `TaskHandle`. 
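With this change `CancelTaskOnDrop` wraps the handle in an `Option` and, when dropped before completion, spawns `handle.cancel()` instead of calling `Handle::current().block_on`. A hedged usage sketch; the re-export paths and the `shutdown_signal` future are assumptions, while `TaskStatus`/`TaskSystemError` follow the `Output` type visible in the `poll` implementation above:

use sd_task_system::{CancelTaskOnDrop, RunError, TaskHandle, TaskStatus, TaskSystemError};

// Await a task, but cancel it in the background if the caller bails out first.
async fn await_or_cancel<E: RunError>(
    handle: TaskHandle<E>,
    shutdown_signal: impl std::future::Future<Output = ()>,
) -> Result<Option<TaskStatus<E>>, TaskSystemError> {
    tokio::select! {
        // Normal completion path: behaves just like awaiting the handle.
        status = CancelTaskOnDrop::new(handle) => status.map(Some),
        // Early-exit path: the wrapper is dropped here, and its Drop impl
        // spawns `handle.cancel()` rather than blocking the runtime.
        () = shutdown_signal => Ok(None),
    }
}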
+ #[must_use] + pub const fn new(handle: TaskHandle) -> Self { + Self(Some(handle)) + } +} impl Future for CancelTaskOnDrop { type Output = Result, SystemError>; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - Pin::new(&mut self.0).poll(cx) + if let Some(handle) = self.0.as_mut() { + match Pin::new(handle).poll(cx) { + Poll::Ready(res) => { + self.0 = None; + Poll::Ready(res) + } + Poll::Pending => Poll::Pending, + } + } else { + error!("tried to poll an already completed CancelTaskOnDrop future"); + Poll::Pending + } } } impl Drop for CancelTaskOnDrop { fn drop(&mut self) { // FIXME: We should use async drop when it becomes stable - Handle::current().block_on(self.0.cancel()); + if let Some(handle) = self.0.take() { + spawn(async move { handle.cancel().await }.in_current_span()); + } } } @@ -531,9 +648,12 @@ pub struct TaskWorktable { is_running: AtomicBool, is_done: AtomicBool, is_paused: AtomicBool, - is_canceled: AtomicBool, - is_aborted: AtomicBool, + has_canceled: AtomicBool, + has_aborted: AtomicBool, + has_shutdown: AtomicBool, + has_failed: AtomicBool, interrupt_tx: chan::Sender, + finalized: AtomicBool, current_worker_id: AtomicWorkerId, } @@ -544,13 +664,27 @@ impl TaskWorktable { is_running: AtomicBool::new(false), is_done: AtomicBool::new(false), is_paused: AtomicBool::new(false), - is_canceled: AtomicBool::new(false), - is_aborted: AtomicBool::new(false), + has_canceled: AtomicBool::new(false), + has_aborted: AtomicBool::new(false), + has_shutdown: AtomicBool::new(false), + has_failed: AtomicBool::new(false), + finalized: AtomicBool::new(false), interrupt_tx, current_worker_id: AtomicWorkerId::new(worker_id), } } + #[inline] + pub fn worker_id(&self) -> WorkerId { + self.current_worker_id.load(Ordering::Acquire) + } + + #[inline] + pub fn change_worker(&self, new_worker_id: WorkerId) { + self.current_worker_id + .store(new_worker_id, Ordering::Release); + } + pub fn set_started(&self) { self.started.store(true, Ordering::Relaxed); self.is_running.store(true, Ordering::Relaxed); @@ -561,67 +695,241 @@ impl TaskWorktable { self.is_running.store(false, Ordering::Relaxed); } + pub fn set_canceled(&self) { + self.has_canceled.store(true, Ordering::Relaxed); + self.is_running.store(false, Ordering::Relaxed); + } + pub fn set_unpause(&self) { self.is_paused.store(false, Ordering::Relaxed); } pub fn set_aborted(&self) { - self.is_aborted.store(true, Ordering::Relaxed); + self.has_aborted.store(true, Ordering::Relaxed); + self.is_running.store(false, Ordering::Relaxed); } - pub async fn pause(&self, tx: oneshot::Sender<()>) { - self.is_paused.store(true, Ordering::Relaxed); + pub fn set_failed(&self) { + self.has_failed.store(true, Ordering::Relaxed); self.is_running.store(false, Ordering::Relaxed); - - trace!("Sending pause signal to Interrupter object on task"); - - self.interrupt_tx - .send(InterruptionRequest { - kind: InterruptionKind::Pause, - ack: tx, - }) - .await - .expect("Worker channel closed trying to pause task"); } - pub async fn cancel(&self, tx: oneshot::Sender<()>) { - self.is_canceled.store(true, Ordering::Relaxed); + pub fn set_shutdown(&self) { + self.has_shutdown.store(true, Ordering::Relaxed); self.is_running.store(false, Ordering::Relaxed); + } - self.interrupt_tx - .send(InterruptionRequest { - kind: InterruptionKind::Cancel, - ack: tx, - }) - .await - .expect("Worker channel closed trying to pause task"); + pub fn set_finalized(&self) { + self.finalized.store(true, Ordering::Release); + } + + pub fn pause(self: &Arc, outer_tx: 
oneshot::Sender<()>) { + spawn({ + let this = Arc::clone(self); + + trace!("Sending pause signal to Interrupter object on task"); + + async move { + let (tx, rx) = oneshot::channel(); + + if this + .interrupt_tx + .send(InterruptionRequest { + kind: InternalInterruptionKind::Pause, + ack: tx, + }) + .await + .is_ok() + { + rx.await.expect("Task failed to ack pause request"); + + this.is_paused.store(true, Ordering::Release); + this.is_running.store(false, Ordering::Release); + } + + trace!("Sent pause signal to Interrupter object on task"); + + outer_tx + .send(()) + .expect("Worker channel closed trying to pause task"); + } + .in_current_span() + }); + } + + pub fn suspend( + self: &Arc, + outer_tx: oneshot::Sender<()>, + has_suspended: Arc, + ) { + trace!("Sending suspend signal to Interrupter object on task"); + spawn({ + let this = Arc::clone(self); + + async move { + let (tx, rx) = oneshot::channel(); + + if this + .interrupt_tx + .send(InterruptionRequest { + kind: InternalInterruptionKind::Suspend(has_suspended), + ack: tx, + }) + .await + .is_ok() + { + rx.await.expect("Task failed to ack suspend request"); + + this.is_paused.store(true, Ordering::Release); + this.is_running.store(false, Ordering::Release); + } + + if outer_tx.send(()).is_err() { + trace!("Task suspend channel closed trying to suspend task, maybe task manage to be completed"); + } + } + .in_current_span() + }); + } + + pub fn cancel(self: &Arc, outer_tx: oneshot::Sender<()>) { + trace!("Sending cancel signal to Interrupter object on task"); + spawn({ + let this = Arc::clone(self); + async move { + let (tx, rx) = oneshot::channel(); + + if this + .interrupt_tx + .send(InterruptionRequest { + kind: InternalInterruptionKind::Cancel, + ack: tx, + }) + .await + .is_ok() + { + rx.await.expect("Task failed to ack cancel request"); + + this.has_canceled.store(true, Ordering::Release); + this.is_running.store(false, Ordering::Release); + } + + outer_tx + .send(()) + .expect("Worker channel closed trying to cancel task"); + } + .in_current_span() + }); + } + + pub fn is_done(&self) -> bool { + self.is_done.load(Ordering::Acquire) + } + + pub fn is_running(&self) -> bool { + self.is_running.load(Ordering::Acquire) } pub fn is_paused(&self) -> bool { - self.is_paused.load(Ordering::Relaxed) + self.is_paused.load(Ordering::Acquire) } - pub fn is_canceled(&self) -> bool { - self.is_canceled.load(Ordering::Relaxed) + pub fn has_canceled(&self) -> bool { + self.has_canceled.load(Ordering::Acquire) } - pub fn is_aborted(&self) -> bool { - self.is_aborted.load(Ordering::Relaxed) + pub fn has_failed(&self) -> bool { + self.has_failed.load(Ordering::Acquire) + } + + pub fn has_aborted(&self) -> bool { + self.has_aborted.load(Ordering::Acquire) + } + + pub fn has_shutdown(&self) -> bool { + self.has_shutdown.load(Ordering::Acquire) + } + + pub fn is_finalized(&self) -> bool { + self.finalized.load(Ordering::Acquire) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PendingTaskKind { + Normal, + Priority, + Suspended, +} + +impl PendingTaskKind { + const fn with_priority(has_priority: bool) -> Self { + if has_priority { + Self::Priority + } else { + Self::Normal + } } } -#[derive(Debug)] pub struct TaskWorkState { pub(crate) task: Box>, pub(crate) worktable: Arc, - pub(crate) done_tx: oneshot::Sender, SystemError>>, + pub(crate) done_tx: PanicOnSenderDrop, pub(crate) interrupter: Arc, } impl TaskWorkState { - pub fn change_worker(&self, new_worker_id: WorkerId) { - self.worktable - .current_worker_id - 
.store(new_worker_id, Ordering::Relaxed); + #[inline] + pub fn id(&self) -> TaskId { + self.task.id() + } + + #[inline] + pub fn kind(&self) -> PendingTaskKind { + PendingTaskKind::with_priority(self.task.with_priority()) + } +} + +#[derive(Debug)] +pub struct PanicOnSenderDrop { + task_id: TaskId, + maybe_done_tx: Option, SystemError>>>, +} + +impl PanicOnSenderDrop { + pub fn new( + task_id: TaskId, + done_tx: oneshot::Sender, SystemError>>, + ) -> Self { + Self { + task_id, + maybe_done_tx: Some(done_tx), + } + } + + pub fn send( + mut self, + res: Result, SystemError>, + ) -> Result<(), Result, SystemError>> { + self.maybe_done_tx + .take() + .expect("tried to send a task output twice to the same task handle") + .send(res) + } +} + +impl Drop for PanicOnSenderDrop { + #[track_caller] + fn drop(&mut self) { + trace!(task_id = %self.task_id, "Dropping TaskWorkState"); + assert!( + self.maybe_done_tx.is_none(), + "TaskHandle done channel dropped before sending a result: {}", + std::panic::Location::caller() + ); + trace!(task_id = %self.task_id, + "TaskWorkState successfully dropped" + ); } } diff --git a/crates/task-system/src/worker/mod.rs b/crates/task-system/src/worker/mod.rs index cdeae4ddc..070bbdc81 100644 --- a/crates/task-system/src/worker/mod.rs +++ b/crates/task-system/src/worker/mod.rs @@ -6,16 +6,15 @@ use std::{ use async_channel as chan; use tokio::{spawn, sync::oneshot, task::JoinHandle}; -use tracing::{error, info, trace, warn}; - -use crate::task::TaskRemoteController; +use tracing::{error, info, instrument, trace, warn, Instrument}; use super::{ error::{RunError, SystemError}, - message::WorkerMessage, + message::{StoleTaskMessage, TaskRunnerOutput, WorkerMessage}, system::SystemComm, task::{ - InternalTaskExecStatus, Interrupter, Task, TaskHandle, TaskId, TaskWorkState, TaskWorktable, + Interrupter, PanicOnSenderDrop, Task, TaskHandle, TaskId, TaskRemoteController, + TaskWorkState, TaskWorktable, }, }; @@ -54,6 +53,7 @@ impl WorkerBuilder { ) } + #[instrument(name = "task_system_worker", skip(self, system_comm, task_stealer), fields(worker_id = self.id))] pub fn build(self, system_comm: SystemComm, task_stealer: WorkStealer) -> Worker { let Self { id, @@ -65,7 +65,7 @@ impl WorkerBuilder { let system_comm = system_comm.clone(); async move { - trace!("Worker message processing task starting..."); + trace!("Worker message processing task starting..."); while let Err(e) = spawn(run( id, system_comm.clone(), @@ -75,20 +75,16 @@ impl WorkerBuilder { .await { if e.is_panic() { - error!( - "Worker critically failed and will restart: \ - {e:#?}" - ); + error!(?e, "Worker critically failed and will restart;"); } else { - trace!( - "Worker received shutdown signal and will exit..." 
- ); + trace!("Worker received shutdown signal and will exit..."); break; } } - info!("Worker gracefully shutdown"); + info!("Worker gracefully shutdown"); } + .in_current_span() }); Worker { @@ -123,7 +119,7 @@ impl Worker { task: new_task, worktable: Arc::clone(&worktable), interrupter: Arc::new(Interrupter::new(interrupt_rx)), - done_tx, + done_tx: PanicOnSenderDrop::new(task_id, done_tx), })) .await .expect("Worker channel closed trying to add task"); @@ -138,18 +134,6 @@ impl Worker { } } - pub async fn task_count(&self) -> usize { - let (tx, rx) = oneshot::channel(); - - self.msgs_tx - .send(WorkerMessage::TaskCountRequest(tx)) - .await - .expect("Worker channel closed trying to get task count"); - - rx.await - .expect("Worker channel closed trying to receive task count response") - } - pub async fn resume_task( &self, task_id: TaskId, @@ -172,7 +156,11 @@ impl Worker { .expect("Worker channel closed trying to pause a not running task"); } - pub async fn cancel_not_running_task(&self, task_id: TaskId, ack: oneshot::Sender<()>) { + pub async fn cancel_not_running_task( + &self, + task_id: TaskId, + ack: oneshot::Sender>, + ) { self.msgs_tx .send(WorkerMessage::CancelNotRunningTask { task_id, ack }) .await @@ -190,6 +178,7 @@ impl Worker { .expect("Worker channel closed trying to force task abortion"); } + #[instrument(skip(self), fields(worker_id = self.id))] pub async fn shutdown(&self) { if let Some(handle) = self .handle @@ -215,13 +204,6 @@ impl Worker { warn!("Trying to shutdown a worker that was already shutdown"); } } - - pub async fn wake(&self) { - self.msgs_tx - .send(WorkerMessage::WakeUp) - .await - .expect("Worker channel closed trying to wake up"); - } } /// SAFETY: Due to usage of refcell we lost `Sync` impl, but we only use it to have a shutdown method @@ -235,26 +217,24 @@ pub struct WorkerComm { } impl WorkerComm { - pub async fn steal_task(&self, worker_id: WorkerId) -> Option> { + pub async fn steal_task( + &self, + stealer_id: WorkerId, + stolen_task_tx: chan::Sender>>, + ) -> bool { let (tx, rx) = oneshot::channel(); self.msgs_tx - .send(WorkerMessage::StealRequest(tx)) + .send(WorkerMessage::StealRequest { + stealer_id, + ack: tx, + stolen_task_tx, + }) .await .expect("Worker channel closed trying to steal task"); rx.await .expect("Worker channel closed trying to steal task") - .map(|task_work_state| { - trace!( - "Worker stole task: \ - ", - self.worker_id, - task_work_state.task.id() - ); - task_work_state.change_worker(worker_id); - task_work_state - }) } } @@ -277,7 +257,12 @@ impl WorkStealer { } } - pub async fn steal(&self, worker_id: WorkerId) -> Option> { + #[instrument(skip(self, stolen_task_tx))] + pub async fn steal( + &self, + stealer_id: WorkerId, + stolen_task_tx: &chan::Sender>>, + ) { let total_workers = self.worker_comms.len(); for worker_comm in self @@ -286,41 +271,24 @@ impl WorkStealer { // Cycling over the workers .cycle() // Starting from the next worker id - .skip(worker_id) + .skip(stealer_id) // Taking the total amount of workers .take(total_workers) // Removing the current worker as we can't steal from ourselves - .filter(|worker_comm| worker_comm.worker_id != worker_id) + .filter(|worker_comm| worker_comm.worker_id != stealer_id) { - trace!( - "Trying to steal from worker ", - worker_comm.worker_id - ); - - if let Some(task) = worker_comm.steal_task(worker_id).await { - return Some(task); + if worker_comm + .steal_task(stealer_id, stolen_task_tx.clone()) + .await + { + trace!(stolen_worker_id = worker_comm.worker_id, "Stole a task"); 
+ return; } - - trace!( - "Worker has no tasks to steal", - worker_comm.worker_id - ); } - None - } - - pub fn workers_count(&self) -> usize { - self.worker_comms.len() + stolen_task_tx + .send(None) + .await + .expect("Stolen task channel closed"); } } - -struct TaskRunnerOutput { - task_work_state: TaskWorkState, - status: InternalTaskExecStatus, -} - -enum RunnerMessage { - TaskOutput(TaskId, Result, ()>), - StoleTask(Option>), -} diff --git a/crates/task-system/src/worker/run.rs b/crates/task-system/src/worker/run.rs index 70de8c65c..9b2ed3c46 100644 --- a/crates/task-system/src/worker/run.rs +++ b/crates/task-system/src/worker/run.rs @@ -5,34 +5,42 @@ use futures::StreamExt; use futures_concurrency::stream::Merge; use tokio::time::{interval_at, Instant}; use tokio_stream::wrappers::IntervalStream; -use tracing::{error, warn}; +use tracing::{debug, error, instrument, trace, warn}; use super::{ - super::{error::RunError, message::WorkerMessage, system::SystemComm}, + super::{ + error::RunError, + message::{StoleTaskMessage, TaskOutputMessage, WorkerMessage}, + system::SystemComm, + }, runner::Runner, - RunnerMessage, WorkStealer, WorkerId, ONE_SECOND, + WorkStealer, WorkerId, ONE_SECOND, }; +enum StreamMessage { + Commands(WorkerMessage), + Steal(Option>), + TaskOutput(TaskOutputMessage), + IdleCheck, +} + +#[instrument(skip(system_comm, work_stealer, msgs_rx))] pub(super) async fn run( - id: WorkerId, + worker_id: WorkerId, system_comm: SystemComm, work_stealer: WorkStealer, msgs_rx: chan::Receiver>, ) { - enum StreamMessage { - Commands(WorkerMessage), - RunnerMsg(RunnerMessage), - IdleCheck, - } - - let (mut runner, runner_rx) = Runner::new(id, work_stealer, system_comm); + let (mut runner, stole_task_rx, task_output_rx) = + Runner::new(worker_id, work_stealer, system_comm); let mut idle_checker_interval = interval_at(Instant::now(), ONE_SECOND); idle_checker_interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); let mut msg_stream = pin!(( msgs_rx.map(StreamMessage::Commands), - runner_rx.map(StreamMessage::RunnerMsg), + stole_task_rx.map(StreamMessage::Steal), + task_output_rx.map(StreamMessage::TaskOutput), IntervalStream::new(idle_checker_interval).map(|_| StreamMessage::IdleCheck), ) .merge()); @@ -41,20 +49,19 @@ pub(super) async fn run( match msg { // Worker messages StreamMessage::Commands(WorkerMessage::NewTask(task_work_state)) => { + let task_id = task_work_state.id(); runner.abort_steal_task(); - runner.new_task(task_work_state).await; - } - - StreamMessage::Commands(WorkerMessage::TaskCountRequest(tx)) => { - if tx.send(runner.total_tasks()).is_err() { - warn!("Task count request channel closed before sending task count"); - } + trace!(%task_id, "New task received"); + runner.new_task(task_id, task_work_state.kind(), task_work_state); + trace!(%task_id, "New task added"); } StreamMessage::Commands(WorkerMessage::ResumeTask { task_id, ack }) => { - if ack.send(runner.resume_task(task_id).await).is_err() { + trace!(%task_id, "Resume task request received"); + if ack.send(runner.resume_task(task_id)).is_err() { warn!("Resume task channel closed before sending ack"); } + trace!(%task_id, "Resumed task"); } StreamMessage::Commands(WorkerMessage::PauseNotRunningTask { task_id, ack }) => { @@ -64,41 +71,53 @@ pub(super) async fn run( } StreamMessage::Commands(WorkerMessage::CancelNotRunningTask { task_id, ack }) => { - runner.cancel_not_running_task(task_id); - if ack.send(()).is_err() { + if ack.send(runner.cancel_not_running_task(&task_id)).is_err() { 
warn!("Resume task channel closed before sending ack"); } } StreamMessage::Commands(WorkerMessage::ForceAbortion { task_id, ack }) => { - if ack.send(runner.force_task_abortion(task_id).await).is_err() { + trace!(%task_id, "Force abortion task request received"); + if ack + .send(runner.force_task_abortion(&task_id).await) + .is_err() + { warn!("Force abortion channel closed before sending ack"); } + trace!(%task_id, "Force aborted task response sent"); } StreamMessage::Commands(WorkerMessage::ShutdownRequest(tx)) => { return runner.shutdown(tx).await; } - StreamMessage::Commands(WorkerMessage::StealRequest(tx)) => runner.steal_request(tx), - - StreamMessage::Commands(WorkerMessage::WakeUp) => runner.wake_up(), + StreamMessage::Commands(WorkerMessage::StealRequest { + stealer_id, + ack, + stolen_task_tx, + }) => { + if ack + .send(runner.steal_request(stealer_id, stolen_task_tx).await) + .is_err() + { + debug!("Steal request attempt aborted before sending ack"); + } + } // Runner messages - StreamMessage::RunnerMsg(RunnerMessage::TaskOutput(task_id, Ok(output))) => { - runner.process_task_output(task_id, output).await; + StreamMessage::TaskOutput(TaskOutputMessage(task_id, Ok(output))) => { + runner.process_task_output(&task_id, output).await; } - StreamMessage::RunnerMsg(RunnerMessage::TaskOutput(task_id, Err(()))) => { - error!("Task failed "); + StreamMessage::TaskOutput(TaskOutputMessage(task_id, Err(()))) => { + error!(%task_id, "Task failed"); - runner.clean_suspended_task(task_id); - - runner.dispatch_next_task(task_id).await; + runner.clear_errored_task(task_id).await; + trace!(%task_id, "Failed task cleared"); } - StreamMessage::RunnerMsg(RunnerMessage::StoleTask(maybe_new_task)) => { - runner.process_stolen_task(maybe_new_task).await; + StreamMessage::Steal(maybe_stolen_task) => { + runner.process_stolen_task(maybe_stolen_task).await; } // Idle checking to steal some work diff --git a/crates/task-system/src/worker/runner.rs b/crates/task-system/src/worker/runner.rs index ac4788266..d99981558 100644 --- a/crates/task-system/src/worker/runner.rs +++ b/crates/task-system/src/worker/runner.rs @@ -11,25 +11,26 @@ use std::{ use async_channel as chan; use futures::{FutureExt, StreamExt}; -use futures_concurrency::future::Race; +use futures_concurrency::{future::Race, stream::Merge}; use tokio::{ spawn, sync::oneshot, task::{JoinError, JoinHandle}, time::{sleep, timeout, Instant}, }; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, error, instrument, trace, warn, Instrument}; use super::{ super::{ error::{RunError, SystemError}, + message::{StoleTaskMessage, TaskOutputMessage}, system::SystemComm, task::{ - ExecStatus, InternalTaskExecStatus, Interrupter, Task, TaskId, TaskOutput, TaskStatus, - TaskWorkState, TaskWorktable, + ExecStatus, InternalTaskExecStatus, Interrupter, PanicOnSenderDrop, PendingTaskKind, + Task, TaskId, TaskOutput, TaskStatus, TaskWorkState, TaskWorktable, }, }, - RunnerMessage, TaskRunnerOutput, WorkStealer, WorkerId, ONE_SECOND, + TaskRunnerOutput, WorkStealer, WorkerId, ONE_SECOND, }; const TEN_SECONDS: Duration = Duration::from_secs(10); @@ -49,44 +50,12 @@ struct AbortAndSuspendSignalers { suspend_tx: oneshot::Sender<()>, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(super) enum PendingTaskKind { - Normal, - Priority, - Suspended, -} - -impl PendingTaskKind { - const fn with_priority(has_priority: bool) -> Self { - if has_priority { - Self::Priority - } else { - Self::Normal - } - } -} - struct RunningTask { - task_id: TaskId, - 
task_kind: PendingTaskKind, + id: TaskId, + kind: PendingTaskKind, handle: JoinHandle<()>, } -fn dispatch_steal_request( - worker_id: WorkerId, - work_stealer: WorkStealer, - runner_tx: chan::Sender>, -) -> JoinHandle<()> { - spawn(async move { - runner_tx - .send(RunnerMessage::StoleTask( - work_stealer.steal(worker_id).await, - )) - .await - .expect("runner channel closed before send stolen task"); - }) -} - enum WaitingSuspendedTask { Task(TaskId), None, @@ -107,25 +76,33 @@ pub(super) struct Runner { paused_tasks: HashMap>, suspended_task: Option>, priority_tasks: VecDeque>, - last_requested_help: Instant, is_idle: bool, waiting_suspension: WaitingSuspendedTask, abort_and_suspend_map: HashMap, - msgs_tx: chan::Sender>, + stole_task_tx: chan::Sender>>, + task_output_tx: chan::Sender>, current_task_handle: Option, - suspend_on_shutdown_rx: chan::Receiver>, + suspend_on_shutdown_stole_task_rx: chan::Receiver>>, + suspend_on_shutdown_task_output_rx: chan::Receiver>, current_steal_task_handle: Option>, last_steal_attempt_at: Instant, steal_attempts_count: u32, } +type RunnerCreate = ( + Runner, + chan::Receiver>>, + chan::Receiver>, +); + impl Runner { pub(super) fn new( worker_id: WorkerId, work_stealer: WorkStealer, system_comm: SystemComm, - ) -> (Self, chan::Receiver>) { - let (runner_tx, runner_rx) = chan::bounded(8); + ) -> RunnerCreate { + let (stolen_task_tx, stolen_task_rx) = chan::bounded(2); + let (task_output_tx, task_output_rx) = chan::bounded(8); ( Self { @@ -137,40 +114,38 @@ impl Runner { paused_tasks: HashMap::new(), suspended_task: None, priority_tasks: VecDeque::with_capacity(PRIORITY_TASK_QUEUE_INITIAL_SIZE), - last_requested_help: Instant::now(), is_idle: true, waiting_suspension: WaitingSuspendedTask::None, abort_and_suspend_map: HashMap::with_capacity(ABORT_AND_SUSPEND_MAP_INITIAL_SIZE), - msgs_tx: runner_tx, + stole_task_tx: stolen_task_tx, + task_output_tx, current_task_handle: None, - suspend_on_shutdown_rx: runner_rx.clone(), + suspend_on_shutdown_stole_task_rx: stolen_task_rx.clone(), + suspend_on_shutdown_task_output_rx: task_output_rx.clone(), current_steal_task_handle: None, last_steal_attempt_at: Instant::now(), steal_attempts_count: 0, }, - runner_rx, + stolen_task_rx, + task_output_rx, ) } + #[instrument(skip(self))] pub(super) fn total_tasks(&self) -> usize { let priority_tasks_count = self.priority_tasks.len(); let current_task_count = usize::from(self.current_task_handle.is_some()); let suspended_task_count = usize::from(self.suspended_task.is_some()); let tasks_count = self.tasks.len(); - trace!( - "Task count: \ - ", - self.worker_id + trace!(%priority_tasks_count, %current_task_count, %suspended_task_count, %tasks_count, + "Tasks count" ); priority_tasks_count + current_task_count + suspended_task_count + tasks_count } + #[instrument(skip(self, task_work_state))] pub(super) fn spawn_task_runner( &mut self, task_id: TaskId, @@ -187,107 +162,79 @@ impl Runner { }, ); - let handle = spawn(run_single_task( - self.worker_id, - task_work_state, - self.msgs_tx.clone(), - suspend_rx, - abort_rx, - )); - - trace!( - "Task runner spawned: ", - self.worker_id + let handle = spawn( + run_single_task( + task_work_state, + self.task_output_tx.clone(), + suspend_rx, + abort_rx, + ) + .in_current_span(), ); + trace!("Task runner spawned"); + handle } - pub(super) async fn new_task(&mut self, task_work_state: TaskWorkState) { - let task_id = task_work_state.task.id(); - let new_kind = PendingTaskKind::with_priority(task_work_state.task.with_priority()); + 
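The runner changes above lean on `#[instrument]` spans plus `in_current_span()` instead of formatting `worker_id` into every message. A small sketch of that tracing pattern with made-up function and payload names (only `tracing`, `tracing-subscriber` and `tokio` are assumed):

use tracing::{info, instrument, Instrument};

// The #[instrument] span records `worker_id`; wrapping the spawned future with
// `in_current_span()` lets its events inherit that field automatically.
#[instrument(skip(payload))]
async fn handle(worker_id: usize, payload: String) {
    tokio::spawn(
        async move {
            info!(%payload, "processing inside the worker's span");
        }
        .in_current_span(),
    )
    .await
    .expect("spawned task panicked");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();
    handle(7, "demo".to_owned()).await;
}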
#[instrument(skip(self, task_work_state))] + pub(super) fn new_task( + &mut self, + task_id: TaskId, + task_kind: PendingTaskKind, + task_work_state: TaskWorkState, + ) { + trace!("Received new task"); - trace!( - "Received new task: ", - self.worker_id - ); + self.task_kinds.insert(task_id, task_kind); - self.task_kinds.insert(task_id, new_kind); - - match self - .inner_add_task(task_id, new_kind, task_work_state) - .await - { - TaskAddStatus::Running => trace!( - "Task running: ", - self.worker_id - ), - TaskAddStatus::Enqueued => trace!( - "Task enqueued: ", - self.worker_id - ), + match self.inner_add_task(task_id, task_kind, task_work_state) { + TaskAddStatus::Running => trace!("New task is running"), + TaskAddStatus::Enqueued => { + trace!( + total_tasks = self.total_tasks(), + "Task enqueued with other tasks" + ); + } } } - pub(super) async fn resume_task(&mut self, task_id: TaskId) -> Result<(), SystemError> { - trace!( - "Resume task request: ", - self.worker_id - ); + #[instrument(skip(self))] + pub(super) fn resume_task(&mut self, task_id: TaskId) -> Result<(), SystemError> { + trace!("Resume task request"); if let Some(task_work_state) = self.paused_tasks.remove(&task_id) { task_work_state.worktable.set_unpause(); - match self - .inner_add_task( - task_id, - *self - .task_kinds - .get(&task_id) - .expect("we added the task kind before pausing it"), - task_work_state, - ) - .await - { - TaskAddStatus::Running => trace!( - "Resumed task is running: ", - self.worker_id - ), - TaskAddStatus::Enqueued => trace!( - "Resumed task was enqueued: ", - self.worker_id - ), + match self.inner_add_task( + task_id, + *self + .task_kinds + .get(&task_id) + .expect("we added the task kind before pausing it"), + task_work_state, + ) { + TaskAddStatus::Running => trace!("Resumed task is running"), + TaskAddStatus::Enqueued => trace!("Resumed task was enqueued"), } - Ok(()) - } else { - trace!( - "Task not found: ", - self.worker_id - ); - Err(SystemError::TaskNotFound(task_id)) + return Ok(()); } + + trace!("Task not found"); + Err(SystemError::TaskNotFound(task_id)) } + #[instrument(skip(self))] pub(super) fn pause_not_running_task(&mut self, task_id: TaskId) -> Result<(), SystemError> { - trace!( - "Pause not running task request: ", - self.worker_id - ); - if self.paused_tasks.contains_key(&task_id) { - trace!( - "Task is already paused: ", - self.worker_id - ); + trace!("Task is already paused"); return Ok(()); } if let Some(current_task) = &self.current_task_handle { - if current_task.task_id == task_id { + if current_task.id == task_id { trace!( - "Task began to run before we managed to pause it, run function will pause it: \ - ", - self.worker_id + "Task began to run before we managed to pause it, run function will pause it" ); return Ok(()); // The task will pause itself } @@ -300,13 +247,11 @@ impl Runner { Err(SystemError::TaskNotFound(task_id)) } + #[instrument(skip(self))] fn pause_suspended_task(&mut self, task_id: TaskId) -> bool { if let Some(suspended_task) = &self.suspended_task { - if suspended_task.task.id() == task_id { - trace!( - "Task is already suspended but will be paused: ", - self.worker_id - ); + if suspended_task.id() == task_id { + trace!("Task is already suspended but will be paused"); self.paused_tasks.insert( task_id, @@ -320,11 +265,12 @@ impl Runner { false } + #[instrument(skip(self))] fn pause_task_from_queues(&mut self, task_id: TaskId) -> bool { if let Some(index) = self .priority_tasks .iter() - .position(|task_work_state| task_work_state.task.id() == 
task_id) + .position(|task_work_state| task_work_state.id() == task_id) { self.paused_tasks.insert( task_id, @@ -339,7 +285,7 @@ impl Runner { if let Some(index) = self .tasks .iter() - .position(|task_work_state| task_work_state.task.id() == task_id) + .position(|task_work_state| task_work_state.id() == task_id) { self.paused_tasks.insert( task_id, @@ -352,75 +298,79 @@ impl Runner { false } - pub(super) fn cancel_not_running_task(&mut self, task_id: TaskId) { - trace!( - "Cancel not running task request: ", - self.worker_id - ); + #[instrument(skip(self))] + pub(super) fn cancel_not_running_task(&mut self, task_id: &TaskId) -> Result<(), SystemError> { + trace!("Cancel not running task request"); if let Some(current_task) = &self.current_task_handle { - if current_task.task_id == task_id { + if current_task.id == *task_id { trace!( - "Task began to run before we managed to cancel it, run function will cancel it: \ - ", - self.worker_id + "Task began to run before we managed to cancel it, run function will cancel it" ); - return; // The task will cancel itself + return Ok(()); // The task will cancel itself } } + // We only remove from task_kinds as if the task is already running, it will be removed when we + // process its cancelled output later + self.task_kinds.remove(task_id); + if let Some(suspended_task) = &self.suspended_task { - if suspended_task.task.id() == task_id { - trace!( - "Task is already suspended but will be paused: ", - self.worker_id - ); + if suspended_task.id() == *task_id { + trace!("Task is already suspended but will be canceled"); - send_cancel_task_response( - self.worker_id, - task_id, - self.suspended_task.take().expect("we just checked it"), - ); + send_cancel_task_response(self.suspended_task.take().expect("we just checked it")); - return; + return Ok(()); } } - self.cancel_task_from_queues(task_id); + if self.cancel_task_from_queues(task_id) { + return Ok(()); + } + + Err(SystemError::TaskNotFound(*task_id)) // If the task is not found, then it's possible that the user already canceled it but still have the handle } - fn cancel_task_from_queues(&mut self, task_id: TaskId) { + #[instrument(skip(self))] + #[inline] + fn cancel_task_from_queues(&mut self, task_id: &TaskId) -> bool { if let Some(index) = self .priority_tasks .iter() - .position(|task_work_state| task_work_state.task.id() == task_id) + .position(|task_work_state| task_work_state.id() == *task_id) { send_cancel_task_response( - self.worker_id, - task_id, self.priority_tasks .remove(index) .expect("we just checked it"), ); - return; + return true; } if let Some(index) = self .tasks .iter() - .position(|task_work_state| task_work_state.task.id() == task_id) + .position(|task_work_state| task_work_state.id() == *task_id) { - send_cancel_task_response( - self.worker_id, - task_id, - self.tasks.remove(index).expect("we just checked it"), - ); + send_cancel_task_response(self.tasks.remove(index).expect("we just checked it")); + + return true; } + + if let Some(task_work_state) = self.paused_tasks.remove(task_id) { + send_cancel_task_response(task_work_state); + + return true; + } + + false } + #[instrument(skip(self, task_work_state))] #[inline] fn add_task_when_idle( &mut self, @@ -428,15 +378,12 @@ impl Runner { task_kind: PendingTaskKind, task_work_state: TaskWorkState, ) { - trace!( - "Idle worker will process the new task: ", - self.worker_id - ); + trace!("Idle worker will process the new task"); let handle = self.spawn_task_runner(task_id, task_work_state); self.current_task_handle = 
Some(RunningTask { - task_id, - task_kind, + id: task_id, + kind: task_kind, handle, }); @@ -445,8 +392,64 @@ impl Runner { self.is_idle = false; } + #[instrument(skip(self, task_work_state))] #[inline] - pub(super) async fn inner_add_task( + fn add_task_when_busy( + &mut self, + new_kind: PendingTaskKind, + task_work_state: TaskWorkState, + old_task_id: TaskId, + old_kind: PendingTaskKind, + ) -> TaskAddStatus { + match (new_kind, old_kind) { + (PendingTaskKind::Priority, PendingTaskKind::Priority) => { + trace!("Old and new tasks have priority, will put new task on priority queue"); + self.priority_tasks.push_front(task_work_state); + TaskAddStatus::Enqueued + } + (PendingTaskKind::Priority, PendingTaskKind::Normal) => { + if self.waiting_suspension.is_waiting() { + trace!( + "Worker is already waiting for a task to be suspended, will enqueue new task" + ); + self.priority_tasks.push_front(task_work_state); + } else { + trace!("Old task will be suspended"); + // We put the query at the top of the priority queue, so it will be + // dispatched by the run function as soon as the current task is suspended + self.priority_tasks.push_front(task_work_state); + + if self + .abort_and_suspend_map + .remove(&old_task_id) + .expect("we always store the abort and suspend signalers") + .suspend_tx + .send(()) + .is_err() + { + warn!(%old_task_id, + "Suspend channel closed before receiving suspend signal. \ + This probably happened because the task finished before we could suspend it." + ); + } + + self.waiting_suspension = WaitingSuspendedTask::Task(old_task_id); + } + + TaskAddStatus::Running + } + (_, _) => { + trace!("New task doesn't have priority and will be enqueued"); + self.tasks.push_back(task_work_state); + + TaskAddStatus::Enqueued + } + } + } + + #[instrument(skip(self, task_work_state))] + #[inline] + pub(super) fn inner_add_task( &mut self, task_id: TaskId, task_kind: PendingTaskKind, @@ -456,125 +459,35 @@ impl Runner { self.add_task_when_idle(task_id, task_kind, task_work_state); TaskAddStatus::Running } else { + trace!("Worker is busy"); + let RunningTask { - task_id: old_task_id, - task_kind: old_kind, + id: old_task_id, + kind: old_kind, .. } = self .current_task_handle .as_ref() .expect("Worker isn't idle, but no task is running"); - trace!( - "Worker is busy: \ - ", - self.worker_id, - ); - - let add_status = match (task_kind, old_kind) { - (PendingTaskKind::Priority, PendingTaskKind::Priority) => { - trace!( - "Old and new tasks have priority, will put new task on priority queue: \ - ", - self.worker_id - ); - self.priority_tasks.push_front(task_work_state); - - TaskAddStatus::Enqueued - } - (PendingTaskKind::Priority, PendingTaskKind::Normal) => { - if self.waiting_suspension.is_waiting() { - trace!( - "Worker is already waiting for a task to be suspended, will enqueue new task: \ - ", - self.worker_id - ); - - self.priority_tasks.push_front(task_work_state); - } else { - trace!( - "Old task will be suspended: \ - ", - self.worker_id - ); - - // We put the query at the top of the priority queue, so it will be - // dispatched by the run function as soon as the current task is suspended - self.priority_tasks.push_front(task_work_state); - - if self - .abort_and_suspend_map - .remove(old_task_id) - .expect("we always store the abort and suspend signalers") - .suspend_tx - .send(()) - .is_err() - { - warn!( - "Task suspend channel closed before receiving suspend signal. \ - This probably happened because the task finished before we could suspend it." 
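A rough sketch of the preemption rule that `add_task_when_busy` above implements for the Priority-over-Normal case: the new work goes to the front of the priority queue and the running task is asked to suspend through its oneshot signaler. The types here (string task names, a tiny `Queues` struct) are stand-ins for `TaskWorkState` and the abort/suspend map, not the crate's API:

use std::collections::VecDeque;

use tokio::sync::oneshot;

#[derive(Clone, Copy, PartialEq, Eq)]
enum Kind {
    Normal,
    Priority,
}

struct Queues {
    priority: VecDeque<&'static str>,
    suspend_current: Option<oneshot::Sender<()>>,
    running_kind: Kind,
}

impl Queues {
    /// Returns true when the currently running task was asked to suspend.
    fn add_priority_task(&mut self, name: &'static str) -> bool {
        // Priority work always jumps to the front of the queue.
        self.priority.push_front(name);

        if self.running_kind == Kind::Normal {
            if let Some(suspend_tx) = self.suspend_current.take() {
                // Ignore the error: the task may have finished already.
                let _ = suspend_tx.send(());
                return true;
            }
        }

        false
    }
}

fn main() {
    let (suspend_tx, mut suspend_rx) = oneshot::channel();
    let mut queues = Queues {
        priority: VecDeque::new(),
        suspend_current: Some(suspend_tx),
        running_kind: Kind::Normal,
    };

    // A normal task is running: the new priority task preempts it.
    assert!(queues.add_priority_task("urgent"));
    assert!(suspend_rx.try_recv().is_ok());
    assert_eq!(queues.priority.front(), Some(&"urgent"));

    // A priority task is running: the new one just waits its turn.
    queues.running_kind = Kind::Priority;
    assert!(!queues.add_priority_task("second"));
}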
- ); - } - - self.waiting_suspension = WaitingSuspendedTask::Task(*old_task_id); - } - - TaskAddStatus::Running - } - (_, _) => { - trace!( - "New task doesn't have priority and will be enqueued: \ - ", - self.worker_id, - ); - - self.tasks.push_back(task_work_state); - - TaskAddStatus::Enqueued - } - }; - - let task_count = self.total_tasks(); - - trace!( - "Worker with {task_count} pending tasks: ", - self.worker_id - ); - - if task_count > self.work_stealer.workers_count() - && self.last_requested_help.elapsed() > ONE_SECOND - { - trace!( - "Worker requesting help from the system: \ - ", - self.worker_id - ); - - self.system_comm - .request_help(self.worker_id, task_count) - .await; - - self.last_requested_help = Instant::now(); - } - - add_status + self.add_task_when_busy(task_kind, task_work_state, *old_task_id, *old_kind) } } + #[instrument(skip(self))] pub(super) async fn force_task_abortion( &mut self, - task_id: uuid::Uuid, + task_id: &TaskId, ) -> Result<(), SystemError> { if let Some(AbortAndSuspendSignalers { abort_tx, .. }) = - self.abort_and_suspend_map.remove(&task_id) + self.abort_and_suspend_map.remove(task_id) { let (tx, rx) = oneshot::channel(); if abort_tx.send(tx).is_err() { debug!( - "Failed to send force abortion request, the task probably finished before we could abort it: \ - ", - self.worker_id + "Failed to send force abortion request, \ + the task probably finished before we could abort it" ); Ok(()) @@ -584,36 +497,29 @@ impl Runner { // If the sender was dropped, then the task finished before we could // abort it which is fine Ok(Err(_)) => Ok(()), - Err(_) => Err(SystemError::TaskForcedAbortTimeout(task_id)), + Err(_) => Err(SystemError::TaskForcedAbortTimeout(*task_id)), } } } else { - trace!( - "Forced abortion of a not running task request: ", - self.worker_id - ); + trace!("Forced abortion of a not running task request"); if let Some(current_task) = &self.current_task_handle { - if current_task.task_id == task_id { + if current_task.id == *task_id { trace!( - "Task began to run before we managed to abort it, run function will abort it: \ - ", - self.worker_id - ); + "Task began to run before we managed to abort it, \ + run function will abort it" + ); return Ok(()); // The task will abort itself } } + self.task_kinds.remove(task_id); + if let Some(suspended_task) = &self.suspended_task { - if suspended_task.task.id() == task_id { - trace!( - "Task is already suspended but will be paused: ", - self.worker_id - ); + if suspended_task.id() == *task_id { + trace!("Task is already suspended but will be force aborted"); send_forced_abortion_task_response( - self.worker_id, - task_id, self.suspended_task.take().expect("we just checked it"), ); @@ -624,11 +530,9 @@ impl Runner { if let Some(index) = self .priority_tasks .iter() - .position(|task_work_state| task_work_state.task.id() == task_id) + .position(|task_work_state| task_work_state.id() == *task_id) { send_forced_abortion_task_response( - self.worker_id, - task_id, self.priority_tasks .remove(index) .expect("we just checked it"), @@ -640,96 +544,109 @@ impl Runner { if let Some(index) = self .tasks .iter() - .position(|task_work_state| task_work_state.task.id() == task_id) + .position(|task_work_state| task_work_state.id() == *task_id) { send_forced_abortion_task_response( - self.worker_id, - task_id, self.tasks.remove(index).expect("we just checked it"), ); return Ok(()); } - // If the task is not found, then it's possible that the user already aborted it but still have the handle + // If the task is not 
found, then it's possible that + // the user already aborted it but still have the handle Ok(()) } } + #[instrument(skip(self, tx))] pub(super) async fn shutdown(mut self, tx: oneshot::Sender<()>) { - trace!( - "Worker beginning shutdown process: ", - self.worker_id - ); + trace!("Worker beginning shutdown process"); - trace!( - "Aborting steal task for shutdown if there is one running: ", - self.worker_id - ); + trace!("Aborting steal task for shutdown if there is one running"); self.abort_steal_task(); let Self { - worker_id, tasks, + suspended_task, paused_tasks, priority_tasks, is_idle, abort_and_suspend_map, - msgs_tx: runner_tx, + stole_task_tx: stolen_task_tx, + task_output_tx, mut current_task_handle, - suspend_on_shutdown_rx, + suspend_on_shutdown_stole_task_rx, + suspend_on_shutdown_task_output_rx, .. } = self; if is_idle { - trace!("Worker is idle, no tasks to shutdown: "); + trace!("Worker is idle, no tasks to shutdown"); + assert!( + current_task_handle.is_none(), + "can't shutdown with a running task if we're idle" + ); + assert!( + tasks.is_empty(), + "can't shutdown with pending tasks if we're idle" + ); + assert!( + priority_tasks.is_empty(), + "can't shutdown with priority tasks if we're idle" + ); + assert!( + suspended_task.is_none(), + "can't shutdown with a suspended task if we're idle" + ); + + paused_tasks + .into_values() + .for_each(send_shutdown_task_response); } else { - trace!("Worker is busy, will shutdown tasks: "); + trace!("Worker is busy, will shutdown tasks"); if let Some(RunningTask { - task_id, handle, .. + id: task_id, + handle, + .. }) = current_task_handle.take() { for (task_id, AbortAndSuspendSignalers { suspend_tx, .. }) in abort_and_suspend_map { if suspend_tx.send(()).is_err() { - warn!( - "Shutdown request channel closed before sending abort signal: \ - " + warn!(%task_id, + "Shutdown request channel closed before sending abort signal" ); } else { - trace!( - "Sent suspend signal for task on shutdown: \ - " - ); + trace!(%task_id, "Sent suspend signal for task on shutdown"); } } if let Err(e) = handle.await { - error!("Task failed to join: {e:#?}"); + error!(%task_id, ?e, "Task failed to join"); } - runner_tx.close(); + stolen_task_tx.close(); + task_output_tx.close(); - Self::process_tasks_being_suspended_on_shutdown(worker_id, suspend_on_shutdown_rx) - .await; + Self::process_tasks_being_suspended_on_shutdown( + suspend_on_shutdown_stole_task_rx, + suspend_on_shutdown_task_output_rx, + ) + .await; } priority_tasks .into_iter() + .chain(suspended_task.into_iter()) .chain(paused_tasks.into_values()) .chain(tasks.into_iter()) - .for_each(|task_work_state| { - send_shutdown_task_response( - worker_id, - task_work_state.task.id(), - task_work_state, - ); - }); + .for_each(send_shutdown_task_response); } - trace!("Worker shutdown process completed: "); + trace!("Worker shutdown process completed"); if tx.send(()).is_err() { warn!("Shutdown request channel closed before sending ack"); @@ -737,50 +654,58 @@ impl Runner { } async fn process_tasks_being_suspended_on_shutdown( - worker_id: WorkerId, - suspend_on_shutdown_rx: chan::Receiver>, + suspend_on_shutdown_stole_task_rx: chan::Receiver>>, + suspend_on_shutdown_task_output_rx: chan::Receiver>, ) { - let mut suspend_on_shutdown_rx = pin!(suspend_on_shutdown_rx); + enum StreamMessage { + Output(TaskOutputMessage), + Steal(Option>), + } - while let Some(runner_msg) = suspend_on_shutdown_rx.next().await { - match runner_msg { - RunnerMessage::TaskOutput(task_id, res) => match res { + let mut msg_stream 
= pin!(( + suspend_on_shutdown_stole_task_rx.map(StreamMessage::Steal), + suspend_on_shutdown_task_output_rx.map(StreamMessage::Output), + ) + .merge()); + + while let Some(msg) = msg_stream.next().await { + match msg { + StreamMessage::Output(TaskOutputMessage(task_id, res)) => match res { Ok(TaskRunnerOutput { task_work_state, status, }) => match status { InternalTaskExecStatus::Done(out) => { - send_complete_task_response(worker_id, task_id, task_work_state, out); + send_complete_task_response(task_work_state, out); } InternalTaskExecStatus::Canceled => { - send_cancel_task_response(worker_id, task_id, task_work_state); + send_cancel_task_response(task_work_state); } InternalTaskExecStatus::Suspend | InternalTaskExecStatus::Paused => { - send_shutdown_task_response(worker_id, task_id, task_work_state); + send_shutdown_task_response(task_work_state); } InternalTaskExecStatus::Error(e) => { - send_error_task_response(worker_id, task_id, task_work_state, e); + send_error_task_response(task_work_state, e); } }, Err(()) => { - error!( - "Task failed to suspend on shutdown" - ); + error!(%task_id, "Task failed to suspend on shutdown"); } }, - RunnerMessage::StoleTask(Some(task_work_state)) => { - send_shutdown_task_response( - worker_id, - task_work_state.task.id(), - task_work_state, + StreamMessage::Steal(Some(StoleTaskMessage(task_work_state))) => { + trace!( + task_id = %task_work_state.id(), + "Stole task", ); + + send_shutdown_task_response(task_work_state); } - RunnerMessage::StoleTask(None) => {} + StreamMessage::Steal(None) => {} } } } @@ -791,7 +716,6 @@ impl Runner { } if let Some(task) = self.suspended_task.take() { - task.interrupter.reset(); task.worktable.set_unpause(); return Some((PendingTaskKind::Suspended, task)); } @@ -801,99 +725,77 @@ impl Runner { .map(|task| (PendingTaskKind::Normal, task)) } - pub(super) fn steal_request(&mut self, tx: oneshot::Sender>>) { - trace!("Steal request: ", self.worker_id); - if let Some((kind, task_work_state)) = self.get_next_task() { - self.proceed_with_task_to_be_stolen(kind, task_work_state, tx); - } else { - trace!("No task to steal: ", self.worker_id); - if tx.send(None).is_err() { - warn!( - "Steal request channel closed before sending no task response: \ - ", - self.worker_id - ); - } - } - } - - fn proceed_with_task_to_be_stolen( + #[instrument(skip_all)] + pub(super) async fn steal_request( &mut self, - kind: PendingTaskKind, - task_work_state: TaskWorkState, - tx: oneshot::Sender>>, - ) { - let task_id = task_work_state.task.id(); - self.task_kinds.remove(&task_id); + stealer_id: WorkerId, + stolen_task_tx: chan::Sender>>, + ) -> bool { + while let Some((kind, task_work_state)) = self.get_next_task() { + let task_id = task_work_state.id(); + self.task_kinds.remove(&task_id); - trace!( - "Stealing task: ", - self.worker_id - ); + trace!(%task_id, ?kind, "Task being stolen"); - if let Err(Some(task_work_state)) = tx.send(Some(task_work_state)) { - self.put_back_failed_to_stole_task(task_id, kind, task_work_state); - } - } - - fn put_back_failed_to_stole_task( - &mut self, - id: TaskId, - kind: PendingTaskKind, - task_work_state: TaskWorkState, - ) { - warn!( - "Steal request channel closed before sending task: ", - self.worker_id - ); - match kind { - PendingTaskKind::Normal => self.tasks.push_front(task_work_state), - PendingTaskKind::Priority => self.priority_tasks.push_front(task_work_state), - PendingTaskKind::Suspended => self.suspended_task = Some(task_work_state), - } - - self.task_kinds.insert(id, kind); - } - - pub(super) 
fn wake_up(&mut self) { - if self.is_idle { - trace!( - "Worker is idle, waking up: ", - self.worker_id - ); - - if self.current_steal_task_handle.is_none() { - self.current_steal_task_handle = Some(dispatch_steal_request( - self.worker_id, - self.work_stealer.clone(), - self.msgs_tx.clone(), - )); - } else { - trace!( - "Steal task already running, ignoring wake up request: ", - self.worker_id - ); + if task_work_state.worktable.has_canceled() { + trace!(%task_id, "Task was canceled before we could steal it"); + send_cancel_task_response(task_work_state); + continue; } - } else { - trace!( - "Worker already working, ignoring wake up request: ", - self.worker_id - ); + + if task_work_state.worktable.has_aborted() { + trace!(%task_id, "Task was force aborted before we could steal it"); + send_forced_abortion_task_response(task_work_state); + continue; + } + + if task_work_state.worktable.is_paused() { + trace!(%task_id, "Task was paused before we could steal it"); + self.task_kinds.insert(task_id, kind); + self.paused_tasks.insert(task_id, task_work_state); + continue; + } + + trace!(%task_id, ?kind, "Task being stolen"); + + task_work_state.worktable.change_worker(stealer_id); + + if let Err(chan::SendError(Some(StoleTaskMessage(task_work_state)))) = stolen_task_tx + .send(Some(StoleTaskMessage(task_work_state))) + .await + { + warn!("Steal request channel closed before sending task"); + task_work_state.worktable.change_worker(self.worker_id); + match kind { + PendingTaskKind::Normal => self.tasks.push_front(task_work_state), + PendingTaskKind::Priority => self.priority_tasks.push_front(task_work_state), + PendingTaskKind::Suspended => { + assert!( + self.suspended_task.is_none(), + "tried to suspend a task when we already have a suspended task" + ); + self.suspended_task = Some(task_work_state); + } + } + + self.task_kinds.insert(task_id, kind); + + return false; + } + + return true; // Successfully stole the task } + + false // No task to steal } + #[instrument(skip(self))] #[inline] - pub(super) async fn dispatch_next_task(&mut self, finished_task_id: TaskId) { - trace!( - "Task finished and will try to process a new task: \ - ", - self.worker_id - ); - - self.abort_and_suspend_map.remove(&finished_task_id); + pub(super) async fn dispatch_next_task(&mut self, finished_task_id: &TaskId) { + self.abort_and_suspend_map.remove(finished_task_id); let RunningTask { - task_id: old_task_id, + id: old_task_id, handle, .. 
@@ -902,62 +804,42 @@ impl Runner { .take() .expect("Task handle missing, but task output received"); - assert_eq!(finished_task_id, old_task_id, "Task output id mismatch"); + assert_eq!(*finished_task_id, old_task_id, "Task output id mismatch"); // Sanity check - trace!( - "Waiting task handle: ", - self.worker_id - ); if let Err(e) = handle.await { - error!("Task failed to join: {e:#?}"); + error!(?e, "Task failed to join"); } - trace!( - "Waited task handle: ", - self.worker_id - ); - if let Some((task_kind, task_work_state)) = self.get_next_task() { - let task_id = task_work_state.task.id(); + if let Some((next_task_kind, task_work_state)) = self.get_next_task() { + let next_task_id = task_work_state.id(); - trace!( - "Dispatching next task: ", - self.worker_id - ); + trace!(%next_task_id, ?next_task_kind, "Dispatching next task"); - let handle = self.spawn_task_runner(task_id, task_work_state); + let handle = self.spawn_task_runner(next_task_id, task_work_state); self.current_task_handle = Some(RunningTask { - task_id, - task_kind, + id: next_task_id, + kind: next_task_kind, handle, }); } else { - trace!( - "No task to dispatch, worker is now idle and will dispatch a steal request: ", - self.worker_id - ); - self.is_idle = true; - self.system_comm.idle_report(self.worker_id).await; + self.system_comm.idle_report(self.worker_id); if self.current_steal_task_handle.is_none() { self.current_steal_task_handle = Some(dispatch_steal_request( self.worker_id, self.work_stealer.clone(), - self.msgs_tx.clone(), + self.stole_task_tx.clone(), )); - } else { - trace!( - "Steal task already running: ", - self.worker_id - ); } } } + #[instrument(skip(self, task_work_state, status))] pub(super) async fn process_task_output( &mut self, - task_id: TaskId, + task_id: &TaskId, TaskRunnerOutput { task_work_state, status, @@ -965,90 +847,63 @@ impl Runner { ) { match status { InternalTaskExecStatus::Done(out) => { - self.task_kinds.remove(&task_id); - send_complete_task_response(self.worker_id, task_id, task_work_state, out); + self.task_kinds.remove(task_id); + send_complete_task_response(task_work_state, out); } InternalTaskExecStatus::Paused => { - self.paused_tasks.insert(task_id, task_work_state); - trace!( - "Task paused: ", - self.worker_id - ); + self.paused_tasks.insert(*task_id, task_work_state); + trace!("Task paused"); } InternalTaskExecStatus::Canceled => { - self.task_kinds.remove(&task_id); - send_cancel_task_response(self.worker_id, task_id, task_work_state); + self.task_kinds.remove(task_id); + send_cancel_task_response(task_work_state); } InternalTaskExecStatus::Error(e) => { - self.task_kinds.remove(&task_id); - send_error_task_response(self.worker_id, task_id, task_work_state, e); + self.task_kinds.remove(task_id); + send_error_task_response(task_work_state, e); } InternalTaskExecStatus::Suspend => { - self.suspended_task = Some(task_work_state); - trace!( - "Task suspended: ", - self.worker_id + assert!( + self.suspended_task.is_none(), + "tried to suspend a task when we already have a suspended task" ); + self.suspended_task = Some(task_work_state); + trace!("Task suspended"); self.clean_suspended_task(task_id); } } - trace!( - "Processing task output completed and will try to dispatch a new task: \ - ", - self.worker_id - ); - self.dispatch_next_task(task_id).await; } + #[instrument(skip(self))] pub(super) fn idle_check(&mut self) { if self.is_idle { - trace!( - "Worker is idle for some time and will try to steal a task: ", - self.worker_id - ); - if 
self.current_steal_task_handle.is_none() { self.steal_attempt(); - } else { - trace!( - "Steal task already running, ignoring on this idle check: ", - self.worker_id - ); } self.idle_memory_cleanup(); } } + #[instrument(skip(self), fields(steal_attempts_count = self.steal_attempts_count))] fn steal_attempt(&mut self) { let elapsed = self.last_steal_attempt_at.elapsed(); let required = (TEN_SECONDS * self.steal_attempts_count).min(ONE_MINUTE); - trace!( - "Steal attempt required cool down: \ - ", - self.worker_id, - self.steal_attempts_count - ); + if elapsed > required { self.current_steal_task_handle = Some(dispatch_steal_request( self.worker_id, self.work_stealer.clone(), - self.msgs_tx.clone(), + self.stole_task_tx.clone(), )); self.last_steal_attempt_at = Instant::now(); - } else { - trace!( - "Steal attempt still cooling down: ", - self.worker_id, - self.steal_attempts_count - ); } } @@ -1060,7 +915,11 @@ impl Runner { } if self.task_kinds.capacity() > TASK_QUEUE_INITIAL_SIZE { - assert_eq!(self.task_kinds.len(), self.paused_tasks.len()); + assert_eq!( + self.task_kinds.len(), + self.paused_tasks.len(), + "If we're idle, the number of task_kinds MUST be equal to the number of paused tasks" + ); self.task_kinds.shrink_to(TASK_QUEUE_INITIAL_SIZE); } @@ -1081,60 +940,77 @@ impl Runner { } } + #[instrument(skip(self))] pub(super) fn abort_steal_task(&mut self) { if let Some(steal_task_handle) = self.current_steal_task_handle.take() { steal_task_handle.abort(); - trace!("Aborted steal task: ", self.worker_id); - } else { - trace!("No steal task to abort: ", self.worker_id); + trace!("Aborted steal task"); } } - pub(super) async fn process_stolen_task(&mut self, maybe_new_task: Option>) { + #[instrument( + skip(self, maybe_new_task), + fields( + maybe_new_task = ?maybe_new_task.as_ref() + .map(|StoleTaskMessage(task_work_state)| task_work_state.id()) + ) + )] + pub(super) async fn process_stolen_task( + &mut self, + maybe_new_task: Option>, + ) { if let Some(steal_task_handle) = self.current_steal_task_handle.take() { if let Err(e) = steal_task_handle.await { - error!("Steal task failed to join: {e:#?}"); + error!(?e, "Steal task failed to join"); } } - if let Some(task_work_state) = maybe_new_task { - self.system_comm.working_report(self.worker_id).await; - trace!( - "Stolen task: ", - self.worker_id, - task_work_state.task.id() - ); + if let Some(StoleTaskMessage(task_work_state)) = maybe_new_task { + self.system_comm.working_report(self.worker_id); + + let stolen_task_id = task_work_state.id(); + + trace!(%stolen_task_id, "Stolen task"); + self.steal_attempts_count = 0; - self.new_task(task_work_state).await; + self.new_task(stolen_task_id, task_work_state.kind(), task_work_state); } else { self.steal_attempts_count += 1; } } - pub(crate) fn clean_suspended_task(&mut self, task_id: uuid::Uuid) { + #[instrument(skip(self))] + pub(crate) fn clean_suspended_task(&mut self, task_id: &TaskId) { match self.waiting_suspension { - WaitingSuspendedTask::Task(waiting_task_id) if waiting_task_id == task_id => { - trace!( - "Task was suspended and will be cleaned: ", - self.worker_id - ); + WaitingSuspendedTask::Task(waiting_task_id) if waiting_task_id == *task_id => { + trace!("Task was suspended and will be cleaned"); self.waiting_suspension = WaitingSuspendedTask::None; } WaitingSuspendedTask::Task(_) => { - trace!( - "Task wasn't suspended, ignoring: ", - self.worker_id - ); + trace!("Task wasn't suspended, ignoring"); + } + WaitingSuspendedTask::None => { + // Everything is Awesome! 
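The cool-down rule in `steal_attempt` above is easy to lose in the diff noise, so here it is in isolation: every failed steal attempt lengthens the wait linearly, capped at one minute. `ONE_MINUTE` is assumed to be 60 seconds (it is defined elsewhere in the crate):

use std::time::Duration;

const TEN_SECONDS: Duration = Duration::from_secs(10);
const ONE_MINUTE: Duration = Duration::from_secs(60);

/// Cool-down required before the next steal attempt, as in `steal_attempt`.
fn required_cooldown(steal_attempts_count: u32) -> Duration {
    (TEN_SECONDS * steal_attempts_count).min(ONE_MINUTE)
}

fn main() {
    assert_eq!(required_cooldown(0), Duration::ZERO); // first attempt is immediate
    assert_eq!(required_cooldown(3), Duration::from_secs(30));
    assert_eq!(required_cooldown(10), ONE_MINUTE); // capped after one minute
}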
} - WaitingSuspendedTask::None => {} } } + + #[instrument(skip(self))] + pub(crate) async fn clear_errored_task(&mut self, task_id: TaskId) { + self.task_kinds.remove(&task_id); + + self.clean_suspended_task(&task_id); + + trace!("Cleansed errored task"); + + self.dispatch_next_task(&task_id).await; + } } type RunTaskOutput = (Box>, Result, SystemError>); +#[instrument(skip(task, worktable, interrupter))] fn handle_run_task_attempt( - worker_id: WorkerId, task_id: TaskId, mut task: Box>, worktable: &TaskWorktable, @@ -1142,28 +1018,30 @@ fn handle_run_task_attempt( ) -> JoinHandle> { spawn({ let already_paused = worktable.is_paused(); - let already_canceled = worktable.is_canceled(); - let already_aborted = worktable.is_aborted(); + let already_canceled = worktable.has_canceled(); + let already_aborted = worktable.has_aborted(); + + let early_result = if already_paused { + trace!("Task was paused before running"); + + Some(Ok(Ok(ExecStatus::Paused))) + } else if already_canceled { + trace!("Task was canceled before running"); + + Some(Ok(Ok(ExecStatus::Canceled))) + } else if already_aborted { + trace!("Task was aborted before running"); + + Some(Err(SystemError::TaskAborted(task_id))) + } else { + // We can mark that the task has actually started now + worktable.set_started(); + None + }; async move { - if already_paused { - trace!( - "Task was paused before running: " - ); - - (task, Ok(Ok(ExecStatus::Paused))) - } else if already_canceled { - trace!( - "Task was canceled before running: " - ); - - (task, Ok(Ok(ExecStatus::Canceled))) - } else if already_aborted { - trace!( - "Task was aborted before running: " - ); - - (task, Err(SystemError::TaskAborted(task_id))) + if let Some(res) = early_result { + (task, res) } else { let run_result = if let Some(timeout_duration) = task.with_timeout() { (task.run(&interrupter).map(Ok), async move { @@ -1179,7 +1057,7 @@ fn handle_run_task_attempt( match run_result { Ok(res) => { - trace!("Ran task: : {res:?}"); + trace!(?res, "Ran task"); (task, Ok(res)) } @@ -1187,74 +1065,75 @@ fn handle_run_task_attempt( } } } + .in_current_span() }) } fn handle_task_suspension( - worker_id: WorkerId, - task_id: TaskId, has_suspended: Arc, worktable: Arc, suspend_rx: oneshot::Receiver<()>, ) -> JoinHandle<()> { - spawn(async move { - if suspend_rx.await.is_ok() { - let (tx, rx) = oneshot::channel(); + spawn( + async move { + if suspend_rx.await.is_ok() { + let (tx, rx) = oneshot::channel(); - trace!("Suspend signal received: "); + trace!("Suspend signal received"); - // The interrupter only knows about Pause and Cancel commands, we use pause as - // the suspend task feature should be invisible to the user - worktable.pause(tx).await; + worktable.suspend(tx, has_suspended); - match rx.await { - Ok(()) => { - trace!("Suspending: "); - has_suspended.store(true, Ordering::Relaxed); - } - Err(_) => { + if rx.await.is_ok() { + trace!("Suspending"); + } else { // The task probably finished before we could suspend it so the channel was dropped - trace!( - "Suspend channel closed: " - ); + trace!("Suspend channel closed"); } + } else { + trace!("Suspend channel closed, task probably finished before we could suspend it"); } - } else { - trace!( - "Suspend channel closed, task probably finished before we could suspend it: \ - " - ); } - }) + .in_current_span(), + ) } type PartialTaskWorkState = ( TaskId, Arc, - oneshot::Sender, SystemError>>, + PanicOnSenderDrop, Arc, ); async fn emit_task_completed_message( - worker_id: WorkerId, run_task_output: RunTaskOutput, 
has_suspended: Arc, (task_id, worktable, done_tx, interrupter): PartialTaskWorkState, - runner_tx: chan::Sender>, + task_output_tx: chan::Sender>, ) { match run_task_output { (task, Ok(res)) => { - trace!( - "Task completed ok: " - ); - runner_tx - .send(RunnerMessage::TaskOutput(task_id, { - let mut internal_status = res.into(); + trace!(?res, "Task completed ok"); - if matches!(internal_status, InternalTaskExecStatus::Paused) - && has_suspended.load(Ordering::Relaxed) - { - internal_status = InternalTaskExecStatus::Suspend; + task_output_tx + .send(TaskOutputMessage(task_id, { + let mut internal_status = res.into(); + let suspended = has_suspended.load(Ordering::SeqCst); + + match internal_status { + InternalTaskExecStatus::Paused if suspended => { + internal_status = InternalTaskExecStatus::Suspend; + } + + InternalTaskExecStatus::Paused | InternalTaskExecStatus::Suspend => { + /* Nothing to do */ + } + + InternalTaskExecStatus::Done(_) + | InternalTaskExecStatus::Canceled + | InternalTaskExecStatus::Error(_) => { + trace!(?internal_status, "Task completed, closing interrupter"); + interrupter.close(); + } } Ok(TaskRunnerOutput { @@ -1272,12 +1151,14 @@ async fn emit_task_completed_message( } (_, Err(e)) => { - trace!("Task had an error: "); + error!(?e, "Task had an error"); if done_tx .send(if matches!(e, SystemError::TaskAborted(_)) { + worktable.set_aborted(); Ok(TaskStatus::ForcedAbortion) } else { + worktable.set_failed(); Err(e) }) .is_err() @@ -1285,23 +1166,23 @@ async fn emit_task_completed_message( error!("Task done channel closed while sending error response"); } - runner_tx - .send(RunnerMessage::TaskOutput(task_id, Err(()))) + task_output_tx + .send(TaskOutputMessage(task_id, Err(()))) .await .expect("Task runner channel closed while sending task output"); } } } +#[instrument(skip_all, fields(task_id = %task.id()))] async fn run_single_task( - worker_id: WorkerId, TaskWorkState { task, worktable, interrupter, done_tx, }: TaskWorkState, - runner_tx: chan::Sender>, + task_output_tx: chan::Sender>, suspend_rx: oneshot::Receiver<()>, abort_rx: oneshot::Receiver>>, ) { @@ -1312,25 +1193,15 @@ async fn run_single_task( let task_id = task.id(); - worktable.set_started(); + trace!("Running task"); - trace!("Running task: "); - - let handle = handle_run_task_attempt( - worker_id, - task_id, - task, - &worktable, - Arc::clone(&interrupter), - ); + let handle = handle_run_task_attempt(task_id, task, &worktable, Arc::clone(&interrupter)); let task_abort_handle = handle.abort_handle(); let has_suspended = Arc::new(AtomicBool::new(false)); let suspender_handle = handle_task_suspension( - worker_id, - task_id, Arc::clone(&has_suspended), Arc::clone(&worktable), suspend_rx, @@ -1338,14 +1209,12 @@ async fn run_single_task( match (async { RaceOutput::Completed(handle.await) }, async move { if let Ok(tx) = abort_rx.await { - trace!("Aborting task: "); + trace!("Aborting task"); RaceOutput::Abort(tx) } else { // If the abort channel is closed, we should just ignore it and keep waiting for the task to finish // as we're being suspended by the worker - trace!( - "Abort channel closed, will wait for task to finish: " - ); + trace!("Abort channel closed, will wait for task to finish"); pending().await } }) @@ -1354,23 +1223,25 @@ async fn run_single_task( { RaceOutput::Completed(Ok(run_task_output)) => { emit_task_completed_message( - worker_id, run_task_output, has_suspended, (task_id, worktable, done_tx, interrupter), - runner_tx, + task_output_tx, ) .await; } 
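A condensed sketch of the race that `run_single_task` above sets up between task completion and a forced abort; the task body here is a stand-in `sleep`, and the enum mirrors the `RaceOutput` idea without the crate's generics:

use std::{future::pending, time::Duration};

use futures_concurrency::future::Race;
use tokio::{sync::oneshot, time::sleep};

enum RaceOutput {
    Completed(u32),
    Abort,
}

#[tokio::main]
async fn main() {
    let (abort_tx, abort_rx) = oneshot::channel::<()>();

    // Stand-in for `task.run(&interrupter)`.
    let task = async {
        sleep(Duration::from_millis(50)).await;
        RaceOutput::Completed(42)
    };

    let abort_listener = async {
        match abort_rx.await {
            Ok(()) => RaceOutput::Abort,
            // Sender dropped: never abort, keep waiting for the task side.
            Err(_) => pending().await,
        }
    };

    // Fire the abort before the task can finish.
    abort_tx.send(()).unwrap();

    match (task, abort_listener).race().await {
        RaceOutput::Completed(out) => println!("task finished with output {out}"),
        RaceOutput::Abort => println!("task was force-aborted"),
    }
}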
RaceOutput::Completed(Err(join_error)) => { - error!("Task failed to join: {join_error:#?}",); + interrupter.close(); + error!(?join_error, "Task failed to join"); if done_tx.send(Err(SystemError::TaskJoin(task_id))).is_err() { error!("Task done channel closed while sending join error response"); } - if runner_tx - .send(RunnerMessage::TaskOutput(task_id, Err(()))) + worktable.set_failed(); + + if task_output_tx + .send(TaskOutputMessage(task_id, Err(()))) .await .is_err() { @@ -1381,14 +1252,16 @@ async fn run_single_task( RaceOutput::Abort(tx) => { task_abort_handle.abort(); - trace!("Task aborted: "); + trace!("Task aborted"); if done_tx.send(Ok(TaskStatus::ForcedAbortion)).is_err() { error!("Task done channel closed while sending abort error response"); } - if runner_tx - .send(RunnerMessage::TaskOutput(task_id, Err(()))) + worktable.set_aborted(); + + if task_output_tx + .send(TaskOutputMessage(task_id, Err(()))) .await .is_err() { @@ -1402,116 +1275,110 @@ async fn run_single_task( } if !suspender_handle.is_finished() { - trace!( - "Aborting suspender handler as it isn't needed anymore: " - ); // if we received a suspend signal this abort will do nothing, as the task finished already suspender_handle.abort(); } - - trace!("Run single task finished: "); } +#[instrument(skip(task, done_tx, worktable, out), fields(task_id = %task.id()))] fn send_complete_task_response( - worker_id: WorkerId, - task_id: TaskId, TaskWorkState { - done_tx, worktable, .. + done_tx, + worktable, + task, + .. }: TaskWorkState, out: TaskOutput, ) { worktable.set_completed(); - if done_tx.send(Ok(TaskStatus::Done((task_id, out)))).is_err() { - warn!( - "Task done channel closed before sending done response for task: \ - " - ); + worktable.set_finalized(); + if done_tx + .send(Ok(TaskStatus::Done((task.id(), out)))) + .is_err() + { + warn!("Task done channel closed before sending done response for task"); } else { - trace!( - "Emitted task done signal on shutdown: \ - " - ); + trace!("Emitted task done signal on task completion"); } } +#[instrument(skip(task, done_tx, worktable), fields(task_id = %task.id()))] fn send_cancel_task_response( - worker_id: WorkerId, - task_id: TaskId, TaskWorkState { - done_tx, worktable, .. + task, + done_tx, + worktable, + .. }: TaskWorkState, ) { - worktable.set_completed(); + worktable.set_canceled(); + worktable.set_finalized(); if done_tx.send(Ok(TaskStatus::Canceled)).is_err() { - warn!( - "Task done channel closed before sending canceled response for task: \ - ", - ); + warn!("Task done channel closed before sending canceled response for task"); } else { - trace!( - "Emitted task canceled signal on cancel not running task: \ - ", - ); + trace!("Emitted task canceled signal on cancel request"); } } +#[instrument(skip(task, done_tx, worktable), fields(task_id = %task.id()))] fn send_shutdown_task_response( - worker_id: WorkerId, - task_id: TaskId, - TaskWorkState { task, done_tx, .. }: TaskWorkState, + TaskWorkState { + task, + done_tx, + worktable, + .. 
+ }: TaskWorkState, ) { + worktable.set_shutdown(); + worktable.set_finalized(); if done_tx.send(Ok(TaskStatus::Shutdown(task))).is_err() { - warn!( - "Task done channel closed before sending shutdown response for task: \ - " - ); + warn!("Task done channel closed before sending shutdown response for task"); } else { - trace!( - "Successfully suspended and sent back DynTask on worker shutdown: \ - " - ); + trace!("Successfully suspended and sent back DynTask on worker shutdown"); } } +#[instrument(skip(task, done_tx, worktable), fields(task_id = %task.id()))] fn send_error_task_response( - worker_id: usize, - task_id: uuid::Uuid, TaskWorkState { - done_tx, worktable, .. + task, + done_tx, + worktable, + .. }: TaskWorkState, e: E, ) { worktable.set_completed(); + worktable.set_finalized(); if done_tx.send(Ok(TaskStatus::Error(e))).is_err() { - warn!( - "Task done channel closed before sending error response for task: \ - " - ); + warn!("Task done channel closed before sending error response for task"); } else { - trace!( - "Emitted task error signal on shutdown: \ - " - ); + trace!("Emitted task error signal"); } } +#[instrument(skip(task, done_tx, worktable), fields(task_id = %task.id()))] fn send_forced_abortion_task_response( - worker_id: WorkerId, - task_id: TaskId, TaskWorkState { - done_tx, worktable, .. + task, + done_tx, + worktable, + .. }: TaskWorkState, ) { - worktable.set_completed(); + worktable.set_aborted(); + worktable.set_finalized(); if done_tx.send(Ok(TaskStatus::ForcedAbortion)).is_err() { - warn!( - "Task done channel closed before sending forced abortion response for task: \ - ", - ); + warn!("Task done channel closed before sending forced abortion response for task"); } else { - trace!( - "Emitted task forced abortion signal on cancel not running task: \ - ", - ); + trace!("Emitted task forced abortion signal"); } } + +fn dispatch_steal_request( + worker_id: WorkerId, + work_stealer: WorkStealer, + stole_task_tx: chan::Sender>>, +) -> JoinHandle<()> { + spawn(async move { work_stealer.steal(worker_id, &stole_task_tx).await }.in_current_span()) +} diff --git a/crates/task-system/tests/common/actors.rs b/crates/task-system/tests/common/actors.rs index 37bcbcefc..46205985f 100644 --- a/crates/task-system/tests/common/actors.rs +++ b/crates/task-system/tests/common/actors.rs @@ -83,6 +83,7 @@ impl SampleActor { paused_count, )) .await + .unwrap() } else { task_dispatcher .dispatch(SampleActorTask::with_id( @@ -92,6 +93,7 @@ impl SampleActor { paused_count, )) .await + .unwrap() }) .await .expect("Task handle receiver dropped"); @@ -121,8 +123,12 @@ impl SampleActor { self.task_dispatcher .dispatch(self.new_priority_task(duration)) .await + .unwrap() } else { - self.task_dispatcher.dispatch(self.new_task(duration)).await + self.task_dispatcher + .dispatch(self.new_task(duration)) + .await + .unwrap() }) .await .expect("Task handle receiver dropped"); diff --git a/crates/task-system/tests/common/jobs.rs b/crates/task-system/tests/common/jobs.rs index 7055c3473..5c7e0f242 100644 --- a/crates/task-system/tests/common/jobs.rs +++ b/crates/task-system/tests/common/jobs.rs @@ -41,6 +41,7 @@ impl SampleJob { task_dispatcher .dispatch_many(initial_steps) .await + .unwrap() .into_iter(), ) .lend_mut(); @@ -108,7 +109,8 @@ impl Task for SampleJobTask { expected_children: self.expected_children - 1, task_dispatcher: self.task_dispatcher.clone(), }) - .await, + .await + .unwrap(), } .into_output(), )) diff --git a/crates/task-system/tests/common/tasks.rs 
b/crates/task-system/tests/common/tasks.rs index 3d556ee07..6f27d7638 100644 --- a/crates/task-system/tests/common/tasks.rs +++ b/crates/task-system/tests/common/tasks.rs @@ -1,17 +1,22 @@ -use std::{future::pending, time::Duration}; +use std::{ + future::{pending, IntoFuture}, + time::Duration, +}; use sd_task_system::{ ExecStatus, Interrupter, InterruptionKind, IntoAnyTaskOutput, Task, TaskId, TaskOutput, }; +use async_channel as chan; use async_trait::async_trait; +use futures::FutureExt; use futures_concurrency::future::Race; use thiserror::Error; use tokio::{ sync::oneshot, time::{sleep, Instant}, }; -use tracing::{error, info}; +use tracing::{error, info, instrument}; #[derive(Debug, Error)] pub enum SampleError { @@ -215,6 +220,7 @@ impl Task for PauseOnceTask { self.id } + #[instrument(skip(self, interrupter), fields(task_id = %self.id))] async fn run(&mut self, interrupter: &Interrupter) -> Result { if let Some(began_tx) = self.began_tx.take() { if began_tx.send(()).is_err() { @@ -224,6 +230,7 @@ impl Task for PauseOnceTask { if !self.has_paused { self.has_paused = true; + info!("waiting for pause"); match interrupter.await { InterruptionKind::Pause => { info!("Pausing PauseOnceTask ", self.id); @@ -276,3 +283,59 @@ impl Task for BrokenTask { pending().await } } + +#[derive(Debug)] +pub struct WaitSignalTask { + id: TaskId, + signal_rx: chan::Receiver<()>, +} + +impl WaitSignalTask { + pub fn new() -> (Self, chan::Sender<()>) { + let (signal_tx, signal_rx) = chan::bounded(1); + ( + Self { + id: TaskId::new_v4(), + signal_rx, + }, + signal_tx, + ) + } +} + +#[async_trait] +impl Task for WaitSignalTask { + fn id(&self) -> TaskId { + self.id + } + + #[instrument(skip(self, interrupter), fields(task_id = %self.id))] + async fn run(&mut self, interrupter: &Interrupter) -> Result { + enum RaceOutput { + Signal, + Interrupt(InterruptionKind), + } + + let race = ( + self.signal_rx.recv().map(|res| { + res.unwrap(); + RaceOutput::Signal + }), + interrupter.into_future().map(RaceOutput::Interrupt), + ); + + match race.race().await { + RaceOutput::Signal => Ok(ExecStatus::Done(TaskOutput::Empty)), + RaceOutput::Interrupt(kind) => match kind { + InterruptionKind::Pause => { + info!("Paused"); + Ok(ExecStatus::Paused) + } + InterruptionKind::Cancel => { + info!("Canceled"); + Ok(ExecStatus::Canceled) + } + }, + } + } +} diff --git a/crates/task-system/tests/integration_test.rs b/crates/task-system/tests/integration_test.rs index db563754d..02c7cf6b7 100644 --- a/crates/task-system/tests/integration_test.rs +++ b/crates/task-system/tests/integration_test.rs @@ -1,4 +1,4 @@ -use sd_task_system::{TaskOutput, TaskStatus, TaskSystem}; +use sd_task_system::{TaskHandle, TaskOutput, TaskStatus, TaskSystem}; use std::{collections::VecDeque, time::Duration}; @@ -6,13 +6,16 @@ use futures_concurrency::future::Join; use rand::Rng; use tempfile::tempdir; use tracing::info; +use tracing_subscriber::EnvFilter; use tracing_test::traced_test; mod common; use common::{ actors::SampleActor, - tasks::{BogusTask, BrokenTask, NeverTask, PauseOnceTask, ReadyTask, SampleError}, + tasks::{ + BogusTask, BrokenTask, NeverTask, PauseOnceTask, ReadyTask, SampleError, WaitSignalTask, + }, }; use crate::common::jobs::SampleJob; @@ -57,7 +60,7 @@ async fn test_actor() { async fn shutdown_test() { let system = TaskSystem::new(); - let handle = system.dispatch(NeverTask::default()).await; + let handle = system.dispatch(NeverTask::default()).await.unwrap(); system.shutdown().await; @@ -69,10 +72,10 @@ async fn shutdown_test() { 
async fn cancel_test() { let system = TaskSystem::new(); - let handle = system.dispatch(NeverTask::default()).await; + let handle = system.dispatch(NeverTask::default()).await.unwrap(); info!("issuing cancel"); - handle.cancel().await; + handle.cancel().await.unwrap(); assert!(matches!(handle.await, Ok(TaskStatus::Canceled))); @@ -84,7 +87,7 @@ async fn cancel_test() { async fn done_test() { let system = TaskSystem::new(); - let handle = system.dispatch(ReadyTask::default()).await; + let handle = system.dispatch(ReadyTask::default()).await.unwrap(); assert!(matches!( handle.await, @@ -101,7 +104,7 @@ async fn abort_test() { let (task, began_rx) = BrokenTask::new(); - let handle = system.dispatch(task).await; + let handle = system.dispatch(task).await.unwrap(); began_rx.await.unwrap(); @@ -117,7 +120,7 @@ async fn abort_test() { async fn error_test() { let system = TaskSystem::new(); - let handle = system.dispatch(BogusTask::default()).await; + let handle = system.dispatch(BogusTask::default()).await.unwrap(); assert!(matches!( handle.await, @@ -134,7 +137,7 @@ async fn pause_test() { let (task, began_rx) = PauseOnceTask::new(); - let handle = system.dispatch(task).await; + let handle = system.dispatch(task).await.unwrap(); info!("Task dispatched, now we wait for it to begin..."); @@ -156,6 +159,83 @@ async fn pause_test() { system.shutdown().await; } +#[test] +fn many_pauses_test() { + std::env::set_var("RUST_LOG", "info,sd_task_system=error"); + + tracing_subscriber::fmt() + .with_file(true) + .with_line_number(true) + .with_env_filter(EnvFilter::from_default_env()) + .init(); + + std::thread::spawn(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async move { + let system = TaskSystem::::new(); + + let (tasks, signalers) = (0..50) + .map(|_| WaitSignalTask::new()) + .unzip::<_, _, Vec<_>, Vec<_>>(); + + info!(total_tasks = %tasks.len()); + + let handles = system.dispatch_many(tasks).await.unwrap(); + + info!("all tasks dispatched"); + + for i in 1..=20 { + handles + .iter() + .map(TaskHandle::pause) + .collect::>() + .join() + .await; + + info!(%i, "all tasks paused"); + + handles + .iter() + .map(TaskHandle::resume) + .collect::>() + .join() + .await; + + info!(%i, "all tasks resumed"); + } + + signalers + .into_iter() + .enumerate() + .map(|(task_idx, signal_tx)| async move { + signal_tx.send(()).await.unwrap_or_else(|e| { + panic!("failed to send signal for task {task_idx}: {e:#?}") + }) + }) + .collect::>() + .join() + .await; + + info!("all tasks signaled for completion"); + + assert!(handles + .join() + .await + .into_iter() + .all(|res| matches!(res, Ok(TaskStatus::Done((_task_id, TaskOutput::Empty)))))); + + info!("all tasks done"); + + system.shutdown().await; + }) + }) + .join() + .unwrap(); +} + #[tokio::test] #[traced_test] async fn jobs_test() { @@ -182,11 +262,12 @@ async fn steal_test() { .unzip::<_, _, Vec<_>, Vec<_>>(); // With this, all workers will be busy - let mut pause_handles = VecDeque::from(system.dispatch_many(pause_tasks).await); + let mut pause_handles = VecDeque::from(system.dispatch_many(pause_tasks).await.unwrap()); let ready_handles = system .dispatch_many((0..100).map(|_| ReadyTask::default())) - .await; + .await + .unwrap(); pause_begans .into_iter() diff --git a/crates/utils/src/db.rs b/crates/utils/src/db.rs index f30a24773..edd406127 100644 --- a/crates/utils/src/db.rs +++ b/crates/utils/src/db.rs @@ -1,8 +1,11 @@ -use prisma_client_rust::{migrations::*, NewClientError}; -use 
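The new `many_pauses_test` above builds its own runtime instead of using `#[tokio::test]`. A bare sketch of that harness shape, with a trivial async body standing in for the real pause/resume loop:

use std::time::Duration;

fn main() {
    std::thread::spawn(|| {
        tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .expect("failed to build the Tokio runtime")
            .block_on(async {
                // The real test dispatches tasks and pauses/resumes them here.
                tokio::time::sleep(Duration::from_millis(10)).await;
            });
    })
    .join()
    .expect("test thread panicked");
}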
diff --git a/crates/utils/src/db.rs b/crates/utils/src/db.rs
index f30a24773..edd406127 100644
--- a/crates/utils/src/db.rs
+++ b/crates/utils/src/db.rs
@@ -1,8 +1,11 @@
-use prisma_client_rust::{migrations::*, NewClientError};
-use sd_prisma::prisma::{self, PrismaClient};
+use prisma_client_rust::{
+	migrations::{DbPushError, MigrateDeployError},
+	NewClientError,
+};
+use sd_prisma::prisma::PrismaClient;
 use thiserror::Error;
 
-/// MigrationError represents an error that occurring while opening a initialising and running migrations on the database.
+/// [`MigrationError`] represents an error that occurred while opening, initialising, or running migrations on the database.
 #[derive(Error, Debug)]
 pub enum MigrationError {
 	#[error("An error occurred while initialising a new database connection: {0}")]
@@ -14,9 +17,9 @@ pub enum MigrationError {
 	DbPushFailed(#[from] DbPushError),
 }
 
-/// load_and_migrate will load the database from the given path and migrate it to the latest version of the schema.
+/// [`load_and_migrate`] will load the database from the given path and migrate it to the latest version of the schema.
 pub async fn load_and_migrate(db_url: &str) -> Result<PrismaClient, MigrationError> {
-	let client = prisma::PrismaClient::_builder()
+	let client = PrismaClient::_builder()
 		.with_url(db_url.to_string())
 		.build()
 		.await
@@ -57,25 +60,41 @@ pub async fn load_and_migrate(db_url: &str) -> Result<PrismaClient, MigrationError> {
 	Ok(client)
 }
 
-pub fn inode_from_db(db_inode: &[u8]) -> u64 {
-	u64::from_le_bytes(db_inode.try_into().expect("corrupted inode in database"))
+/// Construct back an inode after storing it in database
+#[must_use]
+pub const fn inode_from_db(db_inode: &[u8]) -> u64 {
+	u64::from_le_bytes([
+		db_inode[0],
+		db_inode[1],
+		db_inode[2],
+		db_inode[3],
+		db_inode[4],
+		db_inode[5],
+		db_inode[6],
+		db_inode[7],
+	])
 }
 
+/// Constructs a database representation of an inode
+#[must_use]
 pub fn inode_to_db(inode: u64) -> Vec<u8> {
 	inode.to_le_bytes().to_vec()
 }
 
+#[must_use]
 pub fn ffmpeg_data_field_to_db(field: i64) -> Vec<u8> {
 	field.to_be_bytes().to_vec()
 }
 
-pub fn ffmpeg_data_field_from_db(field: &[u8]) -> i64 {
+#[must_use]
+pub const fn ffmpeg_data_field_from_db(field: &[u8]) -> i64 {
 	i64::from_be_bytes([
 		field[0], field[1], field[2], field[3], field[4], field[5], field[6], field[7],
 	])
 }
 
-pub fn size_in_bytes_from_db(db_size_in_bytes: &[u8]) -> u64 {
+#[must_use]
+pub const fn size_in_bytes_from_db(db_size_in_bytes: &[u8]) -> u64 {
 	u64::from_be_bytes([
 		db_size_in_bytes[0],
 		db_size_in_bytes[1],
@@ -88,6 +107,7 @@ pub fn size_in_bytes_from_db(db_size_in_bytes: &[u8]) -> u64 {
 	])
 }
 
+#[must_use]
 pub fn size_in_bytes_to_db(size: u64) -> Vec<u8> {
 	size.to_be_bytes().to_vec()
 }
@@ -105,7 +125,7 @@ impl MissingFieldError {
 
 impl From<MissingFieldError> for rspc::Error {
 	fn from(value: MissingFieldError) -> Self {
-		rspc::Error::with_cause(
+		Self::with_cause(
 			rspc::ErrorCode::InternalServerError,
 			"Missing crucial data in the database".to_string(),
 			value,
@@ -122,7 +142,7 @@ pub trait OptionalField: Sized {
 impl<T> OptionalField for Option<T> {
 	type Out = T;
 
-	fn transform(self) -> Option<T> {
+	fn transform(self) -> Self {
 		self
 	}
 }
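
Note (not part of the patch): the inode helpers store a `u64` as exactly eight little-endian bytes, and the new `const fn` simply spells out the conversion that `try_into` used to perform. A standalone sketch of the round trip, using local copies of the two helpers rather than importing the crate:

```rust
// Local copies of the helpers from the diff, for illustration only.
fn inode_to_db(inode: u64) -> Vec<u8> {
	inode.to_le_bytes().to_vec()
}

const fn inode_from_db(db_inode: &[u8]) -> u64 {
	u64::from_le_bytes([
		db_inode[0], db_inode[1], db_inode[2], db_inode[3],
		db_inode[4], db_inode[5], db_inode[6], db_inode[7],
	])
}

fn main() {
	let inode = 0x1122_3344_5566_7788_u64;
	let stored = inode_to_db(inode);
	assert_eq!(stored.len(), 8); // always eight bytes in the database
	assert_eq!(inode_from_db(&stored), inode); // lossless round trip
}
```
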
diff --git a/crates/utils/src/lib.rs b/crates/utils/src/lib.rs
index d8e2236b7..07a0bd20a 100644
--- a/crates/utils/src/lib.rs
+++ b/crates/utils/src/lib.rs
@@ -1,3 +1,32 @@
+#![warn(
+	clippy::all,
+	clippy::pedantic,
+	clippy::correctness,
+	clippy::perf,
+	clippy::style,
+	clippy::suspicious,
+	clippy::complexity,
+	clippy::nursery,
+	clippy::unwrap_used,
+	unused_qualifications,
+	rust_2018_idioms,
+	trivial_casts,
+	trivial_numeric_casts,
+	unused_allocation,
+	clippy::unnecessary_cast,
+	clippy::cast_lossless,
+	clippy::cast_possible_truncation,
+	clippy::cast_possible_wrap,
+	clippy::cast_precision_loss,
+	clippy::cast_sign_loss,
+	clippy::dbg_macro,
+	clippy::deprecated_cfg_attr,
+	clippy::separated_literal_suffix,
+	deprecated
+)]
+#![forbid(deprecated_in_future)]
+#![allow(clippy::missing_errors_doc, clippy::module_name_repetitions)]
+
 use uuid::Uuid;
 
 pub mod db;
@@ -17,11 +46,36 @@ pub fn chain_optional_iter(
 	.collect()
 }
 
+#[inline]
 #[must_use]
-pub fn uuid_to_bytes(uuid: Uuid) -> Vec<u8> {
+pub const fn u64_to_frontend(num: u64) -> (u32, u32) {
+	#[allow(clippy::cast_possible_truncation)]
+	{
+		// SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation
+		((num >> 32) as u32, num as u32)
+	}
+}
+
+#[inline]
+#[must_use]
+pub const fn i64_to_frontend(num: i64) -> (i32, u32) {
+	#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+	{
+		// SAFETY: We're splitting in (high, low) parts, so we're not going to lose data on truncation
+		((num >> 32) as i32, num as u32)
+	}
+}
+
+#[inline]
+#[must_use]
+pub fn uuid_to_bytes(uuid: &Uuid) -> Vec<u8> {
 	uuid.as_bytes().to_vec()
 }
 
+/// Converts a byte slice to a `Uuid`
+/// # Panics
+/// Panics if the byte slice is not a valid `Uuid`, which means we have a corrupted database
+#[inline]
 #[must_use]
 pub fn from_bytes_to_uuid(bytes: &[u8]) -> Uuid {
 	Uuid::from_slice(bytes).expect("corrupted uuid in database")
@@ -43,6 +97,8 @@ macro_rules! msgpack {
 // Only used for testing purposes. Do not use in production code.
 use std::any::type_name;
 
+#[inline]
+#[must_use]
 pub fn test_type_of<T>(_: T) -> &'static str {
 	type_name::<T>()
 }
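
Note (not part of the patch): `u64_to_frontend` and `i64_to_frontend` split a 64-bit value into `(high, low)` 32-bit halves, presumably so it survives the trip to the JavaScript frontend, which cannot represent every 64-bit integer exactly. A standalone sketch of the split and the reassembly a consumer would perform; `u64_from_frontend` is an illustrative name, not an API in the crate:

```rust
// Same split as the new helper in the diff (local copy for illustration).
const fn u64_to_frontend(num: u64) -> (u32, u32) {
	((num >> 32) as u32, num as u32)
}

// Hypothetical inverse a consumer would apply to recover the original value.
const fn u64_from_frontend((high, low): (u32, u32)) -> u64 {
	((high as u64) << 32) | low as u64
}

fn main() {
	let size = u64::MAX - 42;
	let split = u64_to_frontend(size);
	// Both halves fit in 32 bits...
	assert_eq!(split, (u32::MAX, u32::MAX - 42));
	// ...and nothing is lost when they are recombined.
	assert_eq!(u64_from_frontend(split), size);
}
```
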
diff --git a/interface/app/$libraryId/Explorer/FilePath/Thumb.tsx b/interface/app/$libraryId/Explorer/FilePath/Thumb.tsx
index f533fe7ea..afd71e347 100644
--- a/interface/app/$libraryId/Explorer/FilePath/Thumb.tsx
+++ b/interface/app/$libraryId/Explorer/FilePath/Thumb.tsx
@@ -69,7 +69,7 @@ export const FileThumb = forwardRef((props, ref) =
 		if (
 			loadState.thumbnail !== 'error' &&
 			itemData.hasLocalThumbnail &&
-			itemData.thumbnailKey.length > 0
+			itemData.thumbnailKey
 		)
 			return { variant: 'thumbnail' };
 
@@ -87,7 +87,7 @@ export const FileThumb = forwardRef((props, ref) =
 				break;
 
 			case 'thumbnail':
-				if (itemData.thumbnailKey.length > 0)
+				if (itemData.thumbnailKey)
 					return platform.getThumbnailUrlByThumbKey(itemData.thumbnailKey);
 
 				break;
diff --git a/interface/app/$libraryId/Explorer/store.ts b/interface/app/$libraryId/Explorer/store.ts
index 810dc9f34..5ba8bfb96 100644
--- a/interface/app/$libraryId/Explorer/store.ts
+++ b/interface/app/$libraryId/Explorer/store.ts
@@ -1,4 +1,5 @@
 import {
+	ThumbKey,
 	resetStore,
 	type DoubleClickAction,
 	type ExplorerItem,
@@ -113,14 +114,14 @@ const state = {
 	quickRescanLastRun: Date.now() - 200
 };
 
-export function flattenThumbnailKey(thumbKey: string[]) {
-	return thumbKey.join('/');
+export function flattenThumbnailKey(thumbKey: ThumbKey) {
+	return `${thumbKey.base_directory_str}/${thumbKey.shard_hex}/${thumbKey.cas_id}`;
 }
 
 export const explorerStore = proxy({
 	...state,
 	reset: (_state?: typeof state) => resetStore(explorerStore, _state || state),
-	addNewThumbnail: (thumbKey: string[]) => {
+	addNewThumbnail: (thumbKey: ThumbKey) => {
 		explorerStore.newThumbnails.add(flattenThumbnailKey(thumbKey));
 	},
 	resetCache: () => {
diff --git a/interface/app/$libraryId/Layout/Sidebar/JobManager/Job.tsx b/interface/app/$libraryId/Layout/Sidebar/JobManager/Job.tsx
index 35553bb97..db1b29b3f 100644
--- a/interface/app/$libraryId/Layout/Sidebar/JobManager/Job.tsx
+++ b/interface/app/$libraryId/Layout/Sidebar/JobManager/Job.tsx
@@ -9,7 +9,7 @@ import {
 	Trash
 } from '@phosphor-icons/react';
 import { memo } from 'react';
-import { JobProgressEvent, JobReport, useJobInfo } from '@sd/client';
+import { JobProgressEvent, Report, useJobInfo } from '@sd/client';
 import { ProgressBar } from '@sd/ui';
 import { showAlertDialog } from '~/components';
 import { useLocale } from '~/hooks';
@@ -17,7 +17,7 @@ import { useLocale } from '~/hooks';
 import JobContainer from './JobContainer';
 
 interface JobProps {
-	job: JobReport;
+	job: Report;
 	className?: string;
 	isChild?: boolean;
 	progress: JobProgressEvent | null;
@@ -41,12 +41,13 @@ function Job({ job, className, isChild, progress }: JobProps) {
 	if (job.status === 'CompletedWithErrors') {
 		const JobError = (
-				{job.errors_text.map((error, i) => (
+				{job.non_critical_errors.map((error, i) => (
 					

-						{error.trim()}
+						{/* TODO: Report errors in a nicer way */}
+						{JSON.stringify(error)}

))}
diff --git a/interface/app/$libraryId/Layout/Sidebar/JobManager/JobGroup.tsx b/interface/app/$libraryId/Layout/Sidebar/JobManager/JobGroup.tsx
index d579cef4a..b68449c8e 100644
--- a/interface/app/$libraryId/Layout/Sidebar/JobManager/JobGroup.tsx
+++ b/interface/app/$libraryId/Layout/Sidebar/JobManager/JobGroup.tsx
@@ -10,7 +10,7 @@ import {
 	getTotalTasks,
 	JobGroup,
 	JobProgressEvent,
-	JobReport,
+	Report,
 	useLibraryMutation,
 	useTotalElapsedTimeText
 } from '@sd/client';
@@ -153,7 +153,7 @@ function Options({
 	setShowChildJobs,
 	showChildJobs
 }: {
-	activeJob?: JobReport;
+	activeJob?: Report;
 	group: JobGroup;
 	setShowChildJobs: () => void;
 	showChildJobs: boolean;
@@ -201,7 +201,11 @@ function Options({
 			{(group.status === 'Queued' || group.status === 'Paused' || isJobPaused) && (