import csv from collections.abc import Generator from datetime import datetime from pathlib import Path def filesize_csv(input_dir: Path, output_dir: Path) -> None: output_file = output_dir / "files.csv" with output_file.open("w") as fw: writer = csv.writer(fw) writer.writerow(["date", "filename", "mtime", "filesize"]) for j in input_dir.glob("*.json"): p_date = j.stem p_fname = j.name stat = j.stat() p_mtime = stat.st_mtime p_size = stat.st_size writer.writerow([p_date, p_fname, p_mtime, p_size]) def newer_than_last_clean( input_glob: Generator[Path], output_glob: Generator[Path], desc: str | None = None ) -> bool: last_raw_update: datetime = datetime(1, 1, 1) oldest_clean: datetime = datetime.now() for file in input_glob: mdate = datetime.fromtimestamp(file.stat().st_mtime) if mdate > last_raw_update: last_raw_update = mdate for file in output_glob: mdate = datetime.fromtimestamp(file.stat().st_mtime) if mdate < oldest_clean: oldest_clean = mdate if oldest_clean > last_raw_update: print( "INFO: Skip creating cleaned data" + f"{f' for {desc}' if desc else ''}" + f", last processing {oldest_clean} newer than {last_raw_update}" ) return False return True def ensure_dirs(input_dir: Path, output_dir: Path): if not input_dir.is_dir(): raise ValueError output_dir.mkdir(exist_ok=True, parents=True) def main(input: str, output: str) -> None: input_dir = Path(input) output_dir = Path(output) ensure_dirs(input_dir, output_dir) filesize_csv(input_dir, output_dir) if __name__ == "__main__": import sys if not len(sys.argv) == 3: print("Please provide exactly one input directory and one output directory.") sys.exit(1) inp = sys.argv[1] out = sys.argv[2] main(inp, out)