From fa3473cdc2702cd29ad2169b6bde3eb355a80ff1 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 30 Sep 2025 18:32:03 +0200
Subject: [PATCH] add generation script for file stats

---
 code/files.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 justfile      |  7 +++--
 2 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 code/files.py

diff --git a/code/files.py b/code/files.py
new file mode 100644
index 0000000..578ebeb
--- /dev/null
+++ b/code/files.py
@@ -0,0 +1,87 @@
+"""Generate per-file statistics for the raw JSON input files."""
+
+import csv
+from collections.abc import Generator
+from datetime import datetime
+from pathlib import Path
+
+
+def filesize_csv(input_dir: Path, output_dir: Path) -> None:
+    """Write ``files.csv`` with one row of stats per JSON file in *input_dir*.
+
+    Each row holds the file stem (used as the date), the file name, the
+    modification time as a POSIX timestamp and the size in bytes.
+    """
+    output_file = output_dir / "files.csv"
+    # newline="" is required by the csv module so that it controls line endings
+    # itself; without it every row gains an extra "\r" on Windows.
+    with output_file.open("w", newline="", encoding="utf-8") as fw:
+        writer = csv.writer(fw)
+        writer.writerow(["date", "filename", "mtime", "filesize"])
+
+        # glob() yields files in arbitrary order; sort for reproducible output.
+        for j in sorted(input_dir.glob("*.json")):
+            stat = j.stat()
+            writer.writerow([j.stem, j.name, stat.st_mtime, stat.st_size])
+
+
+def newer_than_last_clean(
+    input_glob: Generator[Path], output_glob: Generator[Path], desc: str | None = None
+) -> bool:
+    """Return True if the cleaned data has to be (re)generated.
+
+    That is the case when no cleaned file exists yet, or when the newest raw
+    file is at least as new as the oldest cleaned file. Otherwise an info
+    message is printed and False is returned.
+    """
+    last_raw_update = max(
+        (datetime.fromtimestamp(f.stat().st_mtime) for f in input_glob),
+        default=datetime.min,
+    )
+    oldest_clean = min(
+        (datetime.fromtimestamp(f.stat().st_mtime) for f in output_glob),
+        default=None,
+    )
+    if oldest_clean is None:
+        # No cleaned output exists yet. Falling back to "now" here would make
+        # the very first run look up to date and skip processing forever.
+        return True
+    if oldest_clean > last_raw_update:
+        print(
+            "INFO: Skip creating cleaned data"
+            + f"{f' for {desc}' if desc else ''}"
+            + f", last processing {oldest_clean} newer than {last_raw_update}"
+        )
+        return False
+    return True
+
+
+def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
+    """Check that *input_dir* exists and create *output_dir* if necessary.
+
+    Raises ValueError if *input_dir* is not an existing directory.
+    """
+    if not input_dir.is_dir():
+        raise ValueError(f"Input directory does not exist: {input_dir}")
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+
+def main(input: str, output: str) -> None:
+    """Write the file statistics for directory *input* into directory *output*."""
+    input_dir = Path(input)
+    output_dir = Path(output)
+    ensure_dirs(input_dir, output_dir)
+
+    filesize_csv(input_dir, output_dir)
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) != 3:
+        print(
+            "Please provide exactly one input directory and one output directory.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
diff --git a/justfile b/justfile
index a40ee33..0c45ee5 100644
--- a/justfile
+++ b/justfile
@@ -1,7 +1,10 @@
-default: kernels unique
+default: files kernels unique
+
+files:
+    python code/files.py input output
 
 kernels:
     python code/kernels.py input output
 
 unique:
-    python ./code/unique.py input output
+    python code/unique.py input output