Fix cas_id sample hashing logic (#672)

This commit is contained in:
Vítor Vasconcellos 2023-04-04 10:59:19 -03:00 committed by GitHub
parent ce9be10cdb
commit b711fe8b27
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -15,9 +15,10 @@ const HEADER_OR_FOOTER_SIZE: u64 = 1024 * 8;
const MINIMUM_FILE_SIZE: u64 = 1024 * 100;
// Asserting that nobody messed up our consts
const_assert!(
HEADER_OR_FOOTER_SIZE + SAMPLE_COUNT * SAMPLE_SIZE + HEADER_OR_FOOTER_SIZE < MINIMUM_FILE_SIZE
);
const_assert!((HEADER_OR_FOOTER_SIZE * 2 + SAMPLE_COUNT * SAMPLE_SIZE) < MINIMUM_FILE_SIZE);
// Asserting that the sample size is larger than header/footer size, as the same buffer is used for both
const_assert!(SAMPLE_SIZE > HEADER_OR_FOOTER_SIZE);
pub async fn generate_cas_id(path: impl AsRef<Path>, size: u64) -> Result<String, io::Error> {
let mut hasher = Hasher::new();
@ -25,26 +26,28 @@ pub async fn generate_cas_id(path: impl AsRef<Path>, size: u64) -> Result<String
if size <= MINIMUM_FILE_SIZE {
// For small files, we hash the whole file
fs::read(path).await.map(|buf| {
hasher.update(&buf);
})?;
hasher.update(&fs::read(path).await?);
} else {
let mut file = File::open(path).await?;
let mut buf = vec![0; SAMPLE_SIZE as usize].into_boxed_slice();
// Hashing the header
file.read_exact(&mut buf[..HEADER_OR_FOOTER_SIZE as usize])
.await?;
hasher.update(&buf);
let mut current_pos = file
.read_exact(&mut buf[..HEADER_OR_FOOTER_SIZE as usize])
.await? as u64;
hasher.update(&buf[..HEADER_OR_FOOTER_SIZE as usize]);
// Sample hashing the inner content of the file
for _ in 0..SAMPLE_COUNT {
file.seek(SeekFrom::Current(
((size - HEADER_OR_FOOTER_SIZE * 2) / SAMPLE_COUNT) as i64,
))
.await?;
let seek_jump = (size - HEADER_OR_FOOTER_SIZE * 2) / SAMPLE_COUNT;
loop {
file.read_exact(&mut buf).await?;
hasher.update(&buf);
if current_pos >= (HEADER_OR_FOOTER_SIZE + seek_jump * (SAMPLE_COUNT - 1)) {
break;
}
current_pos = file.seek(SeekFrom::Start(current_pos + seek_jump)).await?;
}
// Hashing the footer
@ -52,7 +55,7 @@ pub async fn generate_cas_id(path: impl AsRef<Path>, size: u64) -> Result<String
.await?;
file.read_exact(&mut buf[..HEADER_OR_FOOTER_SIZE as usize])
.await?;
hasher.update(&buf);
hasher.update(&buf[..HEADER_OR_FOOTER_SIZE as usize]);
}
Ok(hasher.finalize().to_hex()[..16].to_string())