Jamie Pine 2022-03-29 01:59:04 -07:00
parent 9fd8097376
commit b1253d78a7
11 changed files with 27839 additions and 15895 deletions

apps/debug/Cargo.lock generated

File diff suppressed because it is too large.


@@ -6,4 +6,8 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
sdcorelib = { path = "../../packages/core" }
anyhow = "1.0.56"
data-encoding = "2.3.2"
ring = "0.16.20"
sha256 = "1.0.3"
# sdcorelib = { path = "../../packages/core" }


@@ -1,13 +1,87 @@
fn main() {
// let mounts = match get() {
// Ok(mounts) => mounts,
// Err(e) => {
// dbg!(e);
// return;
// }
// };
#![allow(dead_code)]
// for mount in mounts {
// dbg!(mount);
// }
use anyhow::Result;
use data_encoding::HEXLOWER;
use ring::digest::{Context, SHA256};
use std::fs::{self, File};
use std::io::{BufReader, Read};
use std::os::unix::prelude::FileExt;
use std::time::Instant;
static BIG_FILE: &str = "/Users/jamie/Movies/2022-03-08 06-08-35.mkv";
static LITTLE_FILE: &str = "/Users/jamie/Movies/client_state.json";
fn main() {
println!("Generating hash from file {:?}", BIG_FILE);
let start = Instant::now();
let checksum = sampled_checksum(BIG_FILE).unwrap();
println!(
"Sampled checksum completed in {:?} {}",
start.elapsed(),
checksum
);
let start = Instant::now();
let checksum = full_checksum(BIG_FILE).unwrap();
println!(
"Full checksum completed in {:?} {}",
start.elapsed(),
checksum
);
}
static SAMPLE_COUNT: u64 = 6;
static SAMPLE_SIZE: u64 = 10000;
pub fn sampled_checksum(path: &str) -> Result<String> {
// get file size
let metadata = fs::metadata(path)?;
let size = metadata.len();
// open file reference
let file = File::open(path)?;
let mut context = Context::new(&SHA256);
// if size is small enough, just read the whole thing
if SAMPLE_COUNT * SAMPLE_SIZE > size {
let mut buf = vec![0u8; size.try_into()?];
file.read_exact_at(&mut buf, 0)?;
context.update(&buf);
} else {
// loop over samples
for i in 0..SAMPLE_COUNT {
let start_point = (size / SAMPLE_COUNT) * i;
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, start_point)?;
context.update(&buf);
}
// sample end of file
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, size - SAMPLE_SIZE)?;
context.update(&buf);
}
let digest = context.finish();
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn full_checksum(path: &str) -> Result<String> {
// read file as buffer and convert to digest
let mut reader = BufReader::new(File::open(path).unwrap());
let mut context = Context::new(&SHA256);
let mut buffer = [0; 1024];
loop {
let count = reader.read(&mut buffer)?;
if count == 0 {
break;
}
context.update(&buffer[..count]);
}
let digest = context.finish();
// encode the digest as a lowercase hex string
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
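With the constants above, the sampled path hashes at most six evenly spaced 10,000-byte reads plus one 10,000-byte tail read, i.e. 6 * 10,000 + 10,000 = 70,000 bytes regardless of file size, while full_checksum streams the entire file in 1,024-byte chunks; that is the difference the two timed runs in main() compare.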


@@ -0,0 +1,23 @@
import React from 'react';
import ReactJson, { ReactJsonViewProps } from 'react-json-view';
export interface CodeBlockProps extends ReactJsonViewProps {}
export default function CodeBlock(props: CodeBlockProps) {
return (
<ReactJson
enableClipboard={false}
displayDataTypes={false}
theme="ocean"
style={{
padding: 20,
borderRadius: 5,
backgroundColor: '#101016',
border: 1,
borderColor: '#1E1E27',
borderStyle: 'solid'
}}
{...props}
/>
);
}

docs/FileIndexing.md Normal file

@@ -0,0 +1,4 @@
- Initial scan: ingest file paths into the database.
- Query file_paths for all paths without a File identifier, in chunks of 100, oldest to newest (a sketch follows this list).
-
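A minimal sketch of the chunked identifier query described above, not part of this commit. It only builds the SQL text; the table and column names mirror the file_paths migration in this commit, while the helper name, the LIMIT/OFFSET pagination, and the filter on file_id IS NULL are assumptions for illustration.

// Hypothetical helper: build the SQL for one chunk of unidentified paths.
// The chunk size of 100 and oldest-to-newest order come from the note above;
// everything else here is an assumption, not the indexer's actual code.
fn identifier_chunk_sql(offset: u64) -> String {
    format!(
        "SELECT id, materialized_path, name, extension \
         FROM file_paths \
         WHERE file_id IS NULL AND is_dir = false \
         ORDER BY date_indexed ASC \
         LIMIT 100 OFFSET {}",
        offset
    )
}

fn main() {
    // Print the first two chunks, oldest to newest.
    for chunk in 0..2u64 {
        println!("{}", identifier_chunk_sql(chunk * 100));
    }
}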


@@ -0,0 +1,52 @@
/*
Warnings:
- You are about to drop the column `extension` on the `files` table. All the data in the column will be lost.
- You are about to drop the column `id_hash` on the `files` table. All the data in the column will be lost.
- You are about to drop the column `name` on the `files` table. All the data in the column will be lost.
- Added the required column `partial_checksum` to the `files` table without a default value. This is not possible if the table is not empty.
- Added the required column `name` to the `file_paths` table without a default value. This is not possible if the table is not empty.
*/
-- RedefineTables
PRAGMA foreign_keys=OFF;
CREATE TABLE "new_files" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"type" INTEGER NOT NULL DEFAULT 0,
"size_in_bytes" TEXT NOT NULL,
"partial_checksum" TEXT NOT NULL,
"checksum" TEXT,
"encryption" INTEGER NOT NULL DEFAULT 0,
"ipfs_id" TEXT,
"date_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_modified" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_indexed" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
);
INSERT INTO "new_files" ("checksum", "date_created", "date_indexed", "date_modified", "encryption", "id", "ipfs_id", "size_in_bytes") SELECT "checksum", "date_created", "date_indexed", "date_modified", "encryption", "id", "ipfs_id", "size_in_bytes" FROM "files";
DROP TABLE "files";
ALTER TABLE "new_files" RENAME TO "files";
CREATE UNIQUE INDEX "files_checksum_key" ON "files"("checksum");
CREATE TABLE "new_file_paths" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"is_dir" BOOLEAN NOT NULL DEFAULT false,
"location_id" INTEGER NOT NULL,
"materialized_path" TEXT NOT NULL,
"name" TEXT NOT NULL,
"extension" TEXT,
"file_id" INTEGER,
"parent_id" INTEGER,
"encryption" INTEGER NOT NULL DEFAULT 0,
"permissions" TEXT,
"date_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_modified" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_indexed" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "file_paths_location_id_fkey" FOREIGN KEY ("location_id") REFERENCES "locations" ("id") ON DELETE NO ACTION ON UPDATE NO ACTION,
CONSTRAINT "file_paths_file_id_fkey" FOREIGN KEY ("file_id") REFERENCES "files" ("id") ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT "file_paths_parent_id_fkey" FOREIGN KEY ("parent_id") REFERENCES "file_paths" ("id") ON DELETE SET NULL ON UPDATE CASCADE
);
INSERT INTO "new_file_paths" ("date_indexed", "file_id", "id", "is_dir", "location_id", "materialized_path", "parent_id", "permissions") SELECT "date_indexed", "file_id", "id", "is_dir", "location_id", "materialized_path", "parent_id", "permissions" FROM "file_paths";
DROP TABLE "file_paths";
ALTER TABLE "new_file_paths" RENAME TO "file_paths";
CREATE UNIQUE INDEX "file_paths_location_id_materialized_path_name_extension_key" ON "file_paths"("location_id", "materialized_path", "name", "extension");
PRAGMA foreign_key_check;
PRAGMA foreign_keys=ON;


@@ -78,17 +78,17 @@ model Location {
}
model File {
id Int @id @default(autoincrement())
id_hash String @unique
name String
extension String?
checksum String?
size_in_bytes String
encryption Int @default(0)
id Int @id @default(autoincrement())
kind Int @default(0)
size_in_bytes String
partial_checksum String
checksum String? @unique
encryption Int @default(0)
ipfs_id String?
date_created DateTime @default(now())
date_modified DateTime @default(now())
date_indexed DateTime @default(now())
ipfs_id String?
file_tags TagOnFile[]
file_labels LabelOnFile[]
@@ -98,20 +98,27 @@ model File {
}
model FilePath {
id Int @id @default(autoincrement())
is_dir Boolean @default(false)
materialized_path String
file_id Int?
file File? @relation(fields: [file_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
parent_id Int?
parent FilePath? @relation("directory_file_paths", fields: [parent_id], references: [id])
children FilePath[] @relation("directory_file_paths")
id Int @id @default(autoincrement())
is_dir Boolean @default(false)
location_id Int
location Location? @relation(fields: [location_id], references: [id], onDelete: NoAction, onUpdate: NoAction)
date_indexed DateTime @default(now())
materialized_path String
name String
extension String?
file_id Int?
parent_id Int?
encryption Int @default(0)
permissions String?
@@unique([location_id, materialized_path, file_id])
date_created DateTime @default(now())
date_modified DateTime @default(now())
date_indexed DateTime @default(now())
file File? @relation(fields: [file_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
location Location? @relation(fields: [location_id], references: [id], onDelete: NoAction, onUpdate: NoAction)
parent FilePath? @relation("directory_file_paths", fields: [parent_id], references: [id])
children FilePath[] @relation("directory_file_paths")
@@unique([location_id, materialized_path, name, extension])
@@map("file_paths")
}


@@ -1,9 +1,62 @@
use anyhow::Result;
use data_encoding::HEXLOWER;
use ring::digest::{Context, Digest, SHA256};
use sha256::digest;
use std::io;
use std::io::{BufReader, Read};
use std::time::Instant;
use std::convert::TryInto;
use std::fs::{self, File};
use std::io::{self, BufReader, Read};
use std::os::unix::prelude::FileExt;
static SAMPLE_COUNT: u64 = 4;
static SAMPLE_SIZE: u64 = 10000;
pub fn partial_checksum(path: &str, size: u64) -> Result<String> {
// open file reference
let file = File::open(path)?;
let mut context = Context::new(&SHA256);
// if size is small enough, just read the whole thing
if SAMPLE_COUNT * SAMPLE_SIZE > size {
let mut buf = vec![0u8; size.try_into()?];
file.read_exact_at(&mut buf, 0)?;
context.update(&buf);
} else {
// loop over samples
for i in 0..SAMPLE_COUNT {
let start_point = (size / SAMPLE_COUNT) * i;
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, start_point)?;
context.update(&buf);
}
// sample end of file
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, size - SAMPLE_SIZE)?;
context.update(&buf);
}
let digest = context.finish();
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn full_checksum(path: &str) -> Result<String> {
// read file as buffer and convert to digest
let mut reader = BufReader::new(File::open(path).unwrap());
let mut context = Context::new(&SHA256);
let mut buffer = [0; 1024];
loop {
let count = reader.read(&mut buffer)?;
if count == 0 {
break;
}
context.update(&buffer[..count]);
}
let digest = context.finish();
// encode the digest as a lowercase hex string
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn sha256_digest<R: Read>(mut reader: R) -> io::Result<Digest> {
let mut context = Context::new(&SHA256);
@@ -17,17 +70,3 @@ pub fn sha256_digest<R: Read>(mut reader: R) -> io::Result<Digest> {
}
Ok(context.finish())
}
pub async fn create_buffer_checksum(path: &str) -> io::Result<String> {
let start = Instant::now();
// read file as buffer and convert to digest
let digest = sha256_digest(BufReader::new(std::fs::File::open(path)?))?;
// create a lowercase hash from
let hex = HEXLOWER.encode(digest.as_ref());
println!("hashing complete in {:?} {}", start.elapsed(), hex);
Ok(hex)
}
pub fn create_meta_integrity_hash(uri: &str) -> io::Result<String> {
Ok(digest(format!("{}", uri)))
}


@@ -5,9 +5,12 @@ use crate::util::time;
use crate::CoreContext;
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::ffi::OsStr;
use std::{collections::HashMap, fs, path::Path, path::PathBuf, time::Instant};
use walkdir::{DirEntry, WalkDir};
use super::checksum::partial_checksum;
#[derive(Debug)]
pub struct IndexerJob {
pub path: String,
@@ -73,6 +76,7 @@ pub async fn scan_path(
}
let dir_path = path.clone();
// spawn a dedicated thread to scan the directory for performance
let (paths, scan_start, on_progress) = tokio::task::spawn_blocking(move || {
// store every valid path discovered
let mut paths: Vec<(PathBuf, i64, Option<i64>)> = Vec::new();
@@ -163,13 +167,13 @@ pub async fn scan_path(
}
let raw_sql = format!(
r#"
INSERT INTO file_paths (id, is_dir, location_id, parent_id, materialized_path, date_indexed)
INSERT INTO file_paths (id, is_dir, location_id, materialized_path, name, extension, parent_id, date_indexed)
VALUES {}
"#,
files.join(", ")
);
let _count = db._execute_raw(&raw_sql).await;
// println!("Inserted {:?} records", count);
let count = db._execute_raw(&raw_sql).await;
println!("Inserted {:?} records", count);
}
println!(
"scan of {:?} completed in {:?}. {:?} files found. db write completed in {:?}",
@@ -191,7 +195,7 @@ fn prepare_values(
let metadata = fs::metadata(&file_path)?;
let location_path = location.path.as_ref().unwrap().as_str();
// let size = metadata.len();
// let name = extract_name(file_path.file_stem());
let name = extract_name(file_path.file_stem());
// let extension = extract_name(file_path.extension());
let materialized_path = match file_path.to_str() {
@@ -203,43 +207,44 @@ fn prepare_values(
None => return Err(anyhow!("{}", file_path.to_str().unwrap_or_default())),
};
Ok(format!(
"({}, {}, {}, {}, \"{}\", \"{}\")",
let partial_checksum = {
if !metadata.is_dir() {
partial_checksum(&file_path.to_str().unwrap(), metadata.len()).unwrap()
} else {
"".to_string()
}
};
let values = format!(
"({}, {}, {}, \"{}\", \"{}\", \"{}\", \"{}\", \"{}\")",
id,
metadata.is_dir(),
location.id,
materialized_path,
name,
partial_checksum,
parent_id
.clone()
.map(|id| id.to_string())
.unwrap_or("NULL".to_string()),
materialized_path,
// &size.to_string(),
&time::system_time_to_date_time(metadata.created())
.unwrap()
.to_string(),
// &time::system_time_to_date_time(metadata.modified()).unwrap().to_string(),
))
}
);
pub async fn test_scan(path: &str) -> Result<()> {
let mut count: u32 = 0;
for entry in WalkDir::new(path).into_iter().filter_map(|e| e.ok()) {
let child_path = entry.path().to_str().unwrap();
count = count + 1;
println!("Reading file from dir {:?}", child_path);
}
println!("files found {}", count);
Ok(())
println!("{}", values);
Ok(values)
}
// extract name from OsStr returned by PathBuff
// fn extract_name(os_string: Option<&OsStr>) -> String {
// os_string
// .unwrap_or_default()
// .to_str()
// .unwrap_or_default()
// .to_owned()
// }
fn extract_name(os_string: Option<&OsStr>) -> String {
os_string
.unwrap_or_default()
.to_str()
.unwrap_or_default()
.to_owned()
}
fn is_hidden(entry: &DirEntry) -> bool {
entry
@@ -284,47 +289,3 @@ fn is_app_bundle(entry: &DirEntry) -> bool {
is_app_bundle
}
// pub async fn scan_loc(core: &mut Core, location: &LocationResource) -> Result<()> {
// // get location by location_id from db and include location_paths
// // let job = core.queue(
// // job::JobResource::new(core.state.client_uuid.clone(), job::JobAction::ScanLoc, 1, |loc| {
// // if let Some(path) = &loc.path {
// // scan_path(core, path).await?;
// // watch_dir(path);
// // }
// // })
// // .await?,
// // );
// Ok(())
// }
// pub async fn scan_loc(core: &mut Core, location: &LocationResource) -> Result<()> {
// // get location by location_id from db and include location_paths
// let job = core.queue(
// job::JobResource::new(
// core.state.client_uuid.clone(),
// job::JobAction::ScanLoc,
// 1,
// Some(vec![location.path.as_ref().unwrap().to_string()]),
// )
// .await?,
// );
// if let Some(path) = &location.path {
// scan_path(core, path).await?;
// watch_dir(path);
// }
// Ok(())
// }
// impl From<ScanProgress> for JobReportUpdate {
// fn from(progress: ScanProgress) -> Self {
// match progress {
// ScanProgress::ChunkCount(count) => JobReportUpdate::TaskCount(count),
// ScanProgress::SavedChunks(count) => {
// JobReportUpdate::CompletedTaskCount(count)
// },
// ScanProgress::Message(message) => JobReportUpdate::Message(message),
// }
// }
// }


@@ -14,13 +14,12 @@ pub mod indexer;
pub mod thumb;
pub mod watcher;
// A unique file
#[derive(Debug, Clone, Serialize, Deserialize, TS)]
#[ts(export)]
pub struct File {
pub id: i64,
pub id_hash: String,
pub name: String,
pub extension: Option<String>,
pub partial_checksum: String,
pub checksum: Option<String>,
pub size_in_bytes: String,
pub encryption: EncryptionAlgorithm,
@@ -35,15 +34,16 @@ pub struct File {
pub file_paths: Vec<FilePath>,
}
// A physical file path
#[derive(Debug, Clone, Serialize, Deserialize, TS)]
#[ts(export)]
pub struct FilePath {
pub id: i64,
pub is_dir: bool,
pub location_id: i64,
pub materialized_path: String,
pub file_id: Option<i64>,
pub parent_id: Option<i64>,
pub location_id: i64,
#[ts(type = "string")]
pub date_indexed: chrono::DateTime<chrono::Utc>,
pub permissions: Option<String>,
@@ -68,9 +68,7 @@ impl Into<File> for FileData {
fn into(self) -> File {
File {
id: self.id,
id_hash: self.id_hash,
name: self.name,
extension: self.extension,
partial_checksum: self.partial_checksum,
checksum: self.checksum,
size_in_bytes: self.size_in_bytes.to_string(),
encryption: EncryptionAlgorithm::from_int(self.encryption).unwrap(),

File diff suppressed because one or more lines are too long