Jamie Pine 2022-03-29 01:59:04 -07:00
parent 9fd8097376
commit b1253d78a7
11 changed files with 27839 additions and 15895 deletions

apps/debug/Cargo.lock generated

File diff suppressed because it is too large.


@@ -6,4 +6,8 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
sdcorelib = { path = "../../packages/core" }
anyhow = "1.0.56"
data-encoding = "2.3.2"
ring = "0.16.20"
sha256 = "1.0.3"
# sdcorelib = { path = "../../packages/core" }


@@ -1,13 +1,87 @@
fn main() {
// let mounts = match get() {
// Ok(mounts) => mounts,
// Err(e) => {
// dbg!(e);
// return;
// }
// };
#![allow(dead_code)]
// for mount in mounts {
// dbg!(mount);
// }
use anyhow::Result;
use data_encoding::HEXLOWER;
use ring::digest::{Context, SHA256};
use std::fs::{self, File};
use std::io::{BufReader, Read};
use std::os::unix::prelude::FileExt;
use std::time::Instant;
static BIG_FILE: &str = "/Users/jamie/Movies/2022-03-08 06-08-35.mkv";
static LITTLE_FILE: &str = "/Users/jamie/Movies/client_state.json";
fn main() {
println!("Generating hash from file {:?}", BIG_FILE);
let start = Instant::now();
let checksum = sampled_checksum(BIG_FILE).unwrap();
println!(
"Sampled checksum completed in {:?} {}",
start.elapsed(),
checksum
);
let start = Instant::now();
let checksum = full_checksum(BIG_FILE).unwrap();
println!(
"Full checksum completed in {:?} {}",
start.elapsed(),
checksum
);
}
static SAMPLE_COUNT: u64 = 6;
static SAMPLE_SIZE: u64 = 10000;
pub fn sampled_checksum(path: &str) -> Result<String> {
// get file size
let metadata = fs::metadata(path)?;
let size = metadata.len();
// open file reference
let file = File::open(path)?;
let mut context = Context::new(&SHA256);
// if size is small enough, just read the whole thing
if SAMPLE_COUNT * SAMPLE_SIZE > size {
let mut buf = vec![0u8; size.try_into()?];
file.read_exact_at(&mut buf, 0)?;
context.update(&buf);
} else {
// loop over samples
for i in 0..SAMPLE_COUNT {
let start_point = (size / SAMPLE_COUNT) * i;
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, start_point)?;
context.update(&buf);
}
// sample end of file
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, size - SAMPLE_SIZE)?;
context.update(&buf);
}
let digest = context.finish();
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn full_checksum(path: &str) -> Result<String> {
// read file as buffer and convert to digest
let mut reader = BufReader::new(File::open(path).unwrap());
let mut context = Context::new(&SHA256);
let mut buffer = [0; 1024];
loop {
let count = reader.read(&mut buffer)?;
if count == 0 {
break;
}
context.update(&buffer[..count]);
}
let digest = context.finish();
// encode the digest as a lowercase hex string
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
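With the constants above, the sampled path hashes at most six evenly spaced 10,000-byte reads plus one 10,000-byte tail read, i.e. 6 * 10,000 + 10,000 = 70,000 bytes regardless of file size, while full_checksum streams the entire file in 1,024-byte chunks; that is the difference the two timed runs in main() compare.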


@@ -0,0 +1,23 @@
import React from 'react';
import ReactJson, { ReactJsonViewProps } from 'react-json-view';
export interface CodeBlockProps extends ReactJsonViewProps {}
export default function CodeBlock(props: CodeBlockProps) {
return (
<ReactJson
enableClipboard={false}
displayDataTypes={false}
theme="ocean"
style={{
padding: 20,
borderRadius: 5,
backgroundColor: '#101016',
border: 1,
borderColor: '#1E1E27',
borderStyle: 'solid'
}}
{...props}
/>
);
}

docs/FileIndexing.md Normal file

@@ -0,0 +1,4 @@
- Initial scan: ingest file paths into the database.
- Query file_paths for all paths without a File identifier, in chunks of 100, oldest to newest (a sketch follows this list).
-
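A minimal sketch of the chunked identifier query described above, not part of this commit. It only builds the SQL text; the table and column names mirror the file_paths migration in this commit, while the helper name, the LIMIT/OFFSET pagination, and the filter on file_id IS NULL are assumptions for illustration.

// Hypothetical helper: build the SQL for one chunk of unidentified paths.
// The chunk size of 100 and oldest-to-newest order come from the note above;
// everything else here is an assumption, not the indexer's actual code.
fn identifier_chunk_sql(offset: u64) -> String {
    format!(
        "SELECT id, materialized_path, name, extension \
         FROM file_paths \
         WHERE file_id IS NULL AND is_dir = false \
         ORDER BY date_indexed ASC \
         LIMIT 100 OFFSET {}",
        offset
    )
}

fn main() {
    // Print the first two chunks, oldest to newest.
    for chunk in 0..2u64 {
        println!("{}", identifier_chunk_sql(chunk * 100));
    }
}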


@@ -0,0 +1,52 @@
/*
Warnings:
- You are about to drop the column `extension` on the `files` table. All the data in the column will be lost.
- You are about to drop the column `id_hash` on the `files` table. All the data in the column will be lost.
- You are about to drop the column `name` on the `files` table. All the data in the column will be lost.
- Added the required column `partial_checksum` to the `files` table without a default value. This is not possible if the table is not empty.
- Added the required column `name` to the `file_paths` table without a default value. This is not possible if the table is not empty.
*/
-- RedefineTables
PRAGMA foreign_keys=OFF;
CREATE TABLE "new_files" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"type" INTEGER NOT NULL DEFAULT 0,
"size_in_bytes" TEXT NOT NULL,
"partial_checksum" TEXT NOT NULL,
"checksum" TEXT,
"encryption" INTEGER NOT NULL DEFAULT 0,
"ipfs_id" TEXT,
"date_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_modified" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_indexed" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
);
INSERT INTO "new_files" ("checksum", "date_created", "date_indexed", "date_modified", "encryption", "id", "ipfs_id", "size_in_bytes") SELECT "checksum", "date_created", "date_indexed", "date_modified", "encryption", "id", "ipfs_id", "size_in_bytes" FROM "files";
DROP TABLE "files";
ALTER TABLE "new_files" RENAME TO "files";
CREATE UNIQUE INDEX "files_checksum_key" ON "files"("checksum");
CREATE TABLE "new_file_paths" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"is_dir" BOOLEAN NOT NULL DEFAULT false,
"location_id" INTEGER NOT NULL,
"materialized_path" TEXT NOT NULL,
"name" TEXT NOT NULL,
"extension" TEXT,
"file_id" INTEGER,
"parent_id" INTEGER,
"encryption" INTEGER NOT NULL DEFAULT 0,
"permissions" TEXT,
"date_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_modified" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"date_indexed" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "file_paths_location_id_fkey" FOREIGN KEY ("location_id") REFERENCES "locations" ("id") ON DELETE NO ACTION ON UPDATE NO ACTION,
CONSTRAINT "file_paths_file_id_fkey" FOREIGN KEY ("file_id") REFERENCES "files" ("id") ON DELETE CASCADE ON UPDATE CASCADE,
CONSTRAINT "file_paths_parent_id_fkey" FOREIGN KEY ("parent_id") REFERENCES "file_paths" ("id") ON DELETE SET NULL ON UPDATE CASCADE
);
INSERT INTO "new_file_paths" ("date_indexed", "file_id", "id", "is_dir", "location_id", "materialized_path", "parent_id", "permissions") SELECT "date_indexed", "file_id", "id", "is_dir", "location_id", "materialized_path", "parent_id", "permissions" FROM "file_paths";
DROP TABLE "file_paths";
ALTER TABLE "new_file_paths" RENAME TO "file_paths";
CREATE UNIQUE INDEX "file_paths_location_id_materialized_path_name_extension_key" ON "file_paths"("location_id", "materialized_path", "name", "extension");
PRAGMA foreign_key_check;
PRAGMA foreign_keys=ON;


@@ -78,17 +78,17 @@ model Location {
}
model File {
id Int @id @default(autoincrement())
id_hash String @unique
name String
extension String?
checksum String?
size_in_bytes String
encryption Int @default(0)
id Int @id @default(autoincrement())
kind Int @default(0)
size_in_bytes String
partial_checksum String
checksum String? @unique
encryption Int @default(0)
ipfs_id String?
date_created DateTime @default(now())
date_modified DateTime @default(now())
date_indexed DateTime @default(now())
ipfs_id String?
file_tags TagOnFile[]
file_labels LabelOnFile[]
@@ -98,20 +98,27 @@ model File {
}
model FilePath {
id Int @id @default(autoincrement())
is_dir Boolean @default(false)
materialized_path String
file_id Int?
file File? @relation(fields: [file_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
parent_id Int?
parent FilePath? @relation("directory_file_paths", fields: [parent_id], references: [id])
children FilePath[] @relation("directory_file_paths")
id Int @id @default(autoincrement())
is_dir Boolean @default(false)
location_id Int
location Location? @relation(fields: [location_id], references: [id], onDelete: NoAction, onUpdate: NoAction)
date_indexed DateTime @default(now())
materialized_path String
name String
extension String?
file_id Int?
parent_id Int?
encryption Int @default(0)
permissions String?
@@unique([location_id, materialized_path, file_id])
date_created DateTime @default(now())
date_modified DateTime @default(now())
date_indexed DateTime @default(now())
file File? @relation(fields: [file_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
location Location? @relation(fields: [location_id], references: [id], onDelete: NoAction, onUpdate: NoAction)
parent FilePath? @relation("directory_file_paths", fields: [parent_id], references: [id])
children FilePath[] @relation("directory_file_paths")
@@unique([location_id, materialized_path, name, extension])
@@map("file_paths")
}


@@ -1,9 +1,62 @@
use anyhow::Result;
use data_encoding::HEXLOWER;
use ring::digest::{Context, Digest, SHA256};
use sha256::digest;
use std::io;
use std::io::{BufReader, Read};
use std::time::Instant;
use std::convert::TryInto;
use std::fs::{self, File};
use std::io::{self, BufReader, Read};
use std::os::unix::prelude::FileExt;
static SAMPLE_COUNT: u64 = 4;
static SAMPLE_SIZE: u64 = 10000;
pub fn partial_checksum(path: &str, size: u64) -> Result<String> {
// open file reference
let file = File::open(path)?;
let mut context = Context::new(&SHA256);
// if size is small enough, just read the whole thing
if SAMPLE_COUNT * SAMPLE_SIZE > size {
let mut buf = vec![0u8; size.try_into()?];
file.read_exact_at(&mut buf, 0)?;
context.update(&buf);
} else {
// loop over samples
for i in 0..SAMPLE_COUNT {
let start_point = (size / SAMPLE_COUNT) * i;
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, start_point)?;
context.update(&buf);
}
// sample end of file
let mut buf = vec![0u8; SAMPLE_SIZE.try_into()?];
file.read_exact_at(&mut buf, size - SAMPLE_SIZE)?;
context.update(&buf);
}
let digest = context.finish();
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn full_checksum(path: &str) -> Result<String> {
// read file as buffer and convert to digest
let mut reader = BufReader::new(File::open(path).unwrap());
let mut context = Context::new(&SHA256);
let mut buffer = [0; 1024];
loop {
let count = reader.read(&mut buffer)?;
if count == 0 {
break;
}
context.update(&buffer[..count]);
}
let digest = context.finish();
// encode the digest as a lowercase hex string
let hex = HEXLOWER.encode(digest.as_ref());
Ok(hex)
}
pub fn sha256_digest<R: Read>(mut reader: R) -> io::Result<Digest> {
let mut context = Context::new(&SHA256);
@@ -17,17 +70,3 @@ pub fn sha256_digest<R: Read>(mut reader: R) -> io::Result<Digest> {
}
Ok(context.finish())
}
pub async fn create_buffer_checksum(path: &str) -> io::Result<String> {
let start = Instant::now();
// read file as buffer and convert to digest
let digest = sha256_digest(BufReader::new(std::fs::File::open(path)?))?;
// create a lowercase hash from
let hex = HEXLOWER.encode(digest.as_ref());
println!("hashing complete in {:?} {}", start.elapsed(), hex);
Ok(hex)
}
pub fn create_meta_integrity_hash(uri: &str) -> io::Result<String> {
Ok(digest(format!("{}", uri)))
}


@@ -5,9 +5,12 @@ use crate::util::time;
use crate::CoreContext;
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::ffi::OsStr;
use std::{collections::HashMap, fs, path::Path, path::PathBuf, time::Instant};
use walkdir::{DirEntry, WalkDir};
use super::checksum::partial_checksum;
#[derive(Debug)]
pub struct IndexerJob {
pub path: String,
@@ -73,6 +76,7 @@ pub async fn scan_path(
}
let dir_path = path.clone();
// spawn a dedicated thread to scan the directory for performance
let (paths, scan_start, on_progress) = tokio::task::spawn_blocking(move || {
// store every valid path discovered
let mut paths: Vec<(PathBuf, i64, Option<i64>)> = Vec::new();
@@ -163,13 +167,13 @@ pub async fn scan_path(
}
let raw_sql = format!(
r#"
INSERT INTO file_paths (id, is_dir, location_id, parent_id, materialized_path, date_indexed)
INSERT INTO file_paths (id, is_dir, location_id, materialized_path, name, extension, parent_id, date_indexed)
VALUES {}
"#,
files.join(", ")
);
let _count = db._execute_raw(&raw_sql).await;
// println!("Inserted {:?} records", count);
let count = db._execute_raw(&raw_sql).await;
println!("Inserted {:?} records", count);
}
println!(
"scan of {:?} completed in {:?}. {:?} files found. db write completed in {:?}",
@@ -191,7 +195,7 @@ fn prepare_values(
let metadata = fs::metadata(&file_path)?;
let location_path = location.path.as_ref().unwrap().as_str();
// let size = metadata.len();
// let name = extract_name(file_path.file_stem());
let name = extract_name(file_path.file_stem());
// let extension = extract_name(file_path.extension());
let materialized_path = match file_path.to_str() {
@@ -203,43 +207,44 @@ fn prepare_values(
None => return Err(anyhow!("{}", file_path.to_str().unwrap_or_default())),
};
Ok(format!(
"({}, {}, {}, {}, \"{}\", \"{}\")",
let partial_checksum = {
if !metadata.is_dir() {
partial_checksum(&file_path.to_str().unwrap(), metadata.len()).unwrap()
} else {
"".to_string()
}
};
let values = format!(
"({}, {}, {}, \"{}\", \"{}\", \"{}\", \"{}\", \"{}\")",
id,
metadata.is_dir(),
location.id,
materialized_path,
name,
partial_checksum,
parent_id
.clone()
.map(|id| id.to_string())
.unwrap_or("NULL".to_string()),
materialized_path,
// &size.to_string(),
&time::system_time_to_date_time(metadata.created())
.unwrap()
.to_string(),
// &time::system_time_to_date_time(metadata.modified()).unwrap().to_string(),
))
}
);
pub async fn test_scan(path: &str) -> Result<()> {
let mut count: u32 = 0;
for entry in WalkDir::new(path).into_iter().filter_map(|e| e.ok()) {
let child_path = entry.path().to_str().unwrap();
count = count + 1;
println!("Reading file from dir {:?}", child_path);
}
println!("files found {}", count);
Ok(())
println!("{}", values);
Ok(values)
}
// extract name from OsStr returned by PathBuff
// fn extract_name(os_string: Option<&OsStr>) -> String {
// os_string
// .unwrap_or_default()
// .to_str()
// .unwrap_or_default()
// .to_owned()
// }
fn extract_name(os_string: Option<&OsStr>) -> String {
os_string
.unwrap_or_default()
.to_str()
.unwrap_or_default()
.to_owned()
}
fn is_hidden(entry: &DirEntry) -> bool {
entry
@@ -284,47 +289,3 @@ fn is_app_bundle(entry: &DirEntry) -> bool {
is_app_bundle
}
// pub async fn scan_loc(core: &mut Core, location: &LocationResource) -> Result<()> {
// // get location by location_id from db and include location_paths
// // let job = core.queue(
// // job::JobResource::new(core.state.client_uuid.clone(), job::JobAction::ScanLoc, 1, |loc| {
// // if let Some(path) = &loc.path {
// // scan_path(core, path).await?;
// // watch_dir(path);
// // }
// // })
// // .await?,
// // );
// Ok(())
// }
// pub async fn scan_loc(core: &mut Core, location: &LocationResource) -> Result<()> {
// // get location by location_id from db and include location_paths
// let job = core.queue(
// job::JobResource::new(
// core.state.client_uuid.clone(),
// job::JobAction::ScanLoc,
// 1,
// Some(vec![location.path.as_ref().unwrap().to_string()]),
// )
// .await?,
// );
// if let Some(path) = &location.path {
// scan_path(core, path).await?;
// watch_dir(path);
// }
// Ok(())
// }
// impl From<ScanProgress> for JobReportUpdate {
// fn from(progress: ScanProgress) -> Self {
// match progress {
// ScanProgress::ChunkCount(count) => JobReportUpdate::TaskCount(count),
// ScanProgress::SavedChunks(count) => {
// JobReportUpdate::CompletedTaskCount(count)
// },
// ScanProgress::Message(message) => JobReportUpdate::Message(message),
// }
// }
// }


@@ -14,13 +14,12 @@ pub mod indexer;
pub mod thumb;
pub mod watcher;
// A unique file
#[derive(Debug, Clone, Serialize, Deserialize, TS)]
#[ts(export)]
pub struct File {
pub id: i64,
pub id_hash: String,
pub name: String,
pub extension: Option<String>,
pub partial_checksum: String,
pub checksum: Option<String>,
pub size_in_bytes: String,
pub encryption: EncryptionAlgorithm,
@@ -35,15 +34,16 @@ pub struct File {
pub file_paths: Vec<FilePath>,
}
// A physical file path
#[derive(Debug, Clone, Serialize, Deserialize, TS)]
#[ts(export)]
pub struct FilePath {
pub id: i64,
pub is_dir: bool,
pub location_id: i64,
pub materialized_path: String,
pub file_id: Option<i64>,
pub parent_id: Option<i64>,
pub location_id: i64,
#[ts(type = "string")]
pub date_indexed: chrono::DateTime<chrono::Utc>,
pub permissions: Option<String>,
@@ -68,9 +68,7 @@ impl Into<File> for FileData {
fn into(self) -> File {
File {
id: self.id,
id_hash: self.id_hash,
name: self.name,
extension: self.extension,
partial_checksum: self.partial_checksum,
checksum: self.checksum,
size_in_bytes: self.size_in_bytes.to_string(),
encryption: EncryptionAlgorithm::from_int(self.encryption).unwrap(),

File diff suppressed because one or more lines are too long