diff options
author | Stefan Kreutz <mail@skreutz.com> | 2024-03-24 15:04:09 +0100 |
---|---|---|
committer | Stefan Kreutz <mail@skreutz.com> | 2024-03-24 15:04:09 +0100 |
commit | c1fa48e9bd617d70e823efef5d6dcea41b1d2087 (patch) | |
tree | 421e69c512ac54bf65495ef23fd7d9ec5a5e67d5 /src/core.rs | |
download | brck-c1fa48e9bd617d70e823efef5d6dcea41b1d2087.tar |
Add initial implementation (brck-0.1.0)
Diffstat (limited to 'src/core.rs')
-rw-r--r-- | src/core.rs | 306 |
1 files changed, 306 insertions, 0 deletions
diff --git a/src/core.rs b/src/core.rs new file mode 100644 index 0000000..6ac3ba5 --- /dev/null +++ b/src/core.rs @@ -0,0 +1,306 @@ +//! Core functionality. +//! +//! This module contains Brck's core types and functions. + +use std::{ + cmp::Ordering, + fs::OpenOptions, + io::{BufReader, Read}, + path::{Path, PathBuf}, + sync::{atomic::AtomicBool, Arc}, +}; + +use chrono::{DateTime, Utc}; +use either::Either; +use flate2::bufread::GzDecoder; +use itertools::{EitherOrBoth, Itertools}; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use walkdir::WalkDir; + +/// Produces an iterator over the regular files below the given path. +/// +/// The files are sorted by their full path. +/// Symlinks are not followed, except for the original `path`. +pub fn find_files<P: AsRef<Path>>( + path: P, +) -> impl Iterator<Item = Result<PathBuf, walkdir::Error>> { + WalkDir::new(path.as_ref()) + .sort_by_path() + .into_iter() + .filter_ok(|entry| entry.file_type().is_file()) + .map_ok(|entry| entry.into_path()) +} + +/// Extension for [WalkDir]. +trait WalkDirExt { + /// Sort directory entries by full path. + /// + /// Yiedls "foo bar/buzz" before "foo/buzz". + fn sort_by_path(self) -> WalkDir; +} + +impl WalkDirExt for WalkDir { + fn sort_by_path(self) -> WalkDir { + self.sort_by_key(|a| { + if a.file_type().is_dir() { + let mut name = a.file_name().to_os_string(); + name.push(std::path::MAIN_SEPARATOR.to_string()); + name + } else { + a.file_name().to_os_string() + } + }) + } +} + +/// Returns an iterator over the records of the given database. +pub fn read_db<P: AsRef<Path>>( + path: P, +) -> Result<impl Iterator<Item = Result<Record, serde_json::Error>>, std::io::Error> { + let file = OpenOptions::new().read(true).open(path)?; + let buf = BufReader::new(file); + let dec = GzDecoder::new(buf); + let iter = serde_json::Deserializer::from_reader(dec).into_iter::<Record>(); + Ok(iter) +} + +/// A record of a file's modification time and hash sum. 
+#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct Record { + pub path: PathBuf, + pub modified: DateTime<Utc>, + #[serde(with = "hex::serde")] + pub sha256: Vec<u8>, +} + +impl Record { + /// Constructs a `Record` of the given file. + /// + /// This function reads the given file in chunks of at most `chunk_size` bytes. It checks the + /// `terminate` flag before reading each chunk, returning the `RecordError::Interrupt` error + /// when set. + pub fn from_path<P: AsRef<Path>>( + path: P, + chunk_size: u64, + terminate: Arc<AtomicBool>, + ) -> Result<Record, RecordError> { + let modified = std::fs::symlink_metadata(&path)?.modified()?; + let mut file = std::fs::File::open(&path)?; + let mut hasher = Sha256::new(); + loop { + if terminate.load(std::sync::atomic::Ordering::SeqCst) { + return Err(RecordError::Interrupt); + } + let mut reader = std::io::Read::by_ref(&mut file).take(chunk_size); + if std::io::copy(&mut reader, &mut hasher)? == 0 { + break; + } + } + let hash = hasher.finalize(); + Ok(Record { + path: path.as_ref().into(), + modified: modified.into(), + sha256: hash.as_slice().into(), + }) + } +} + +/// The possible errors of [Record::from_path]. +#[derive(Debug)] +pub enum RecordError { + IO(std::io::Error), + Interrupt, +} + +impl std::fmt::Display for RecordError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RecordError::IO(err) => write!(f, "{}", err), + RecordError::Interrupt => write!(f, "Received interrupt signal"), + } + } +} + +impl std::error::Error for RecordError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + RecordError::IO(err) => Some(err), + RecordError::Interrupt => None, + } + } +} + +impl From<std::io::Error> for RecordError { + fn from(value: std::io::Error) -> Self { + RecordError::IO(value) + } +} + +/// Returns an iterator over the difference of the given files. 
+pub fn diff<I, J, E, F>(cached: I, found: J) -> impl Iterator<Item = Result<Diff, Either<E, F>>> +where + I: Iterator<Item = Result<Record, E>>, + J: Iterator<Item = Result<Record, F>>, +{ + cached + .merge_join_by(found, |cached, found| match (cached, found) { + (Err(_), _) => Ordering::Less, + (Ok(_), Err(_)) => Ordering::Greater, + (Ok(cached), Ok(found)) => cached.path.cmp(&found.path), + }) + .map(|item| match item { + EitherOrBoth::Left(Err(err)) => Err(Either::Left(err)), + EitherOrBoth::Right(Err(err)) => Err(Either::Right(err)), + EitherOrBoth::Left(Ok(old)) => Ok(Diff::Removed { old }), + EitherOrBoth::Right(Ok(new)) => Ok(Diff::Added { new }), + EitherOrBoth::Both(Ok(old), Ok(new)) => { + if old.modified == new.modified { + if old.sha256 == new.sha256 { + Ok(Diff::Unchanged { old, new }) + } else { + Ok(Diff::Corrupted { old, new }) + } + } else if old.sha256 == new.sha256 { + Ok(Diff::Touched { old, new }) + } else { + Ok(Diff::Changed { old, new }) + } + } + _ => unreachable!(), + }) +} + +/// The list of possible types of differences. +#[derive(Debug, Clone, Hash, Eq, PartialEq, clap::ValueEnum)] +pub enum DiffKind { + Added, + Touched, + Changed, + Unchanged, + Removed, + Corrupted, +} + +/// The difference between two records of a file. +#[derive(Debug, Clone, Eq, PartialEq, Serialize)] +#[serde(tag = "type", rename_all = "lowercase")] +pub enum Diff { + // Modeled variants as structs to improve their JSON representation. + Added { new: Record }, + Touched { old: Record, new: Record }, + Changed { old: Record, new: Record }, + Unchanged { old: Record, new: Record }, + Removed { old: Record }, + Corrupted { old: Record, new: Record }, +} + +impl Diff { + pub fn kind(&self) -> DiffKind { + match self { + Diff::Added { .. } => DiffKind::Added, + Diff::Touched { .. } => DiffKind::Touched, + Diff::Changed { .. } => DiffKind::Changed, + Diff::Unchanged { .. } => DiffKind::Unchanged, + Diff::Removed { .. } => DiffKind::Removed, + Diff::Corrupted { .. 
} => DiffKind::Corrupted, + } + } +} + +impl std::fmt::Display for Diff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Diff::Added { new, .. } => write!(f, "added: {}", new.path.display()), + Diff::Touched { old, .. } => write!(f, "touched: {}", old.path.display()), + Diff::Changed { old, .. } => write!(f, "changed: {}", old.path.display()), + Diff::Unchanged { old, .. } => write!(f, "unchanged: {}", old.path.display()), + Diff::Removed { old, .. } => write!(f, "removed: {}", old.path.display()), + Diff::Corrupted { old, .. } => write!(f, "corrupted: {}", old.path.display()), + } + } +} + +#[cfg(test)] +mod tests { + use testresult::TestResult; + + use super::*; + + fn record(path: &str, modified: &str, sha256: &str) -> Record { + Record { + path: path.into(), + modified: DateTime::parse_from_rfc3339(modified).unwrap().into(), + sha256: sha256.into(), + } + } + + #[test] + fn diff_works() -> TestResult { + let a = record("a", "2024-03-22T00:00:00Z", "apple"); + let b = record("b", "2024-03-22T00:00:00Z", "banana"); + let b_touched = record("b", "2024-03-22T01:00:00Z", "banana"); + let c = record("c", "2024-03-22T00:00:00Z", "cherry"); + let c_changed = record("c", "2024-03-22T01:00:00Z", "cashew"); + let d = record("d", "2024-03-22T00:00:00Z", "date"); + let e = record("e", "2024-03-22T00:00:00Z", "elderberry"); + let f = record("f", "2024-03-22T00:00:00Z", "fig"); + let f_corrupted = record("f", "2024-03-22T00:00:00Z", "feijoa"); + + let db = [ + Err("foo"), + Ok(b.clone()), + Ok(c.clone()), + Ok(d.clone()), + Ok(e.clone()), + Ok(f.clone()), + ]; + + let fs = [ + Ok(a.clone()), + Ok(b_touched.clone()), + Err("bar"), + Ok(c_changed.clone()), + Ok(d.clone()), + Ok(f_corrupted.clone()), + ]; + + let mut result = diff(db.into_iter(), fs.into_iter()); + + assert_eq!(result.next(), Some(Err(Either::Left("foo")))); + assert_eq!(result.next(), Some(Ok(Diff::Added { new: a }))); + assert_eq!( + result.next(), + 
Some(Ok(Diff::Touched { + old: b, + new: b_touched + })) + ); + assert_eq!(result.next(), Some(Err(Either::Right("bar")))); + assert_eq!( + result.next(), + Some(Ok(Diff::Changed { + old: c, + new: c_changed + })) + ); + assert_eq!( + result.next(), + Some(Ok(Diff::Unchanged { + old: d.clone(), + new: d + })) + ); + assert_eq!(result.next(), Some(Ok(Diff::Removed { old: e }))); + assert_eq!( + result.next(), + Some(Ok(Diff::Corrupted { + old: f, + new: f_corrupted + })) + ); + assert_eq!(result.next(), None); + + Ok(()) + } +} |