From 28eb0a39ae0b387fb5a786fbd5ad8a54abdd8f31 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 30 Sep 2025 18:48:45 +0200
Subject: [PATCH] add generation script for package stats

---
 code/files.py    | 23 --------------
 code/packages.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 justfile         |  5 +++-
 3 files changed, 85 insertions(+), 24 deletions(-)
 create mode 100644 code/packages.py

diff --git a/code/files.py b/code/files.py
index 578ebeb..8a6776b 100644
--- a/code/files.py
+++ b/code/files.py
@@ -19,29 +19,6 @@ def filesize_csv(input_dir: Path, output_dir: Path) -> None:
             writer.writerow([p_date, p_fname, p_mtime, p_size])
 
 
-def newer_than_last_clean(
-    input_glob: Generator[Path], output_glob: Generator[Path], desc: str | None = None
-) -> bool:
-    last_raw_update: datetime = datetime(1, 1, 1)
-    oldest_clean: datetime = datetime.now()
-    for file in input_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate > last_raw_update:
-            last_raw_update = mdate
-    for file in output_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate < oldest_clean:
-            oldest_clean = mdate
-    if oldest_clean > last_raw_update:
-        print(
-            "INFO: Skip creating cleaned data"
-            + f"{f' for {desc}' if desc else ''}"
-            + f", last processing {oldest_clean} newer than {last_raw_update}"
-        )
-        return False
-    return True
-
-
 def ensure_dirs(input_dir: Path, output_dir: Path):
     if not input_dir.is_dir():
         raise ValueError
diff --git a/code/packages.py b/code/packages.py
new file mode 100644
index 0000000..06faf94
--- /dev/null
+++ b/code/packages.py
@@ -0,0 +1,81 @@
+"""Generate a CSV of per-package version counts from JSON input files."""
+
+import csv
+import json
+from pathlib import Path
+from typing import cast
+
+
+def packages_csv(input_dir: Path, output_dir: Path) -> None:
+    """Write ``packages.csv`` from all ``*.json`` files found in *input_dir*.
+
+    Each input file is expected to contain an object whose ``Versions`` field
+    maps package names to ``{version: count}`` objects. One CSV row is written
+    per (package, version) pair; the ``date`` column holds the input file's
+    stem. Files that cannot be decoded or lack this structure are skipped
+    with a warning rather than aborting the whole run.
+    """
+    output_file = output_dir / "packages.csv"
+    # The csv module requires newline="" so it can control line endings itself.
+    with output_file.open("w", newline="", encoding="utf-8") as fw:
+        writer = csv.writer(fw)
+        writer.writerow(["date", "package", "version", "count"])
+
+        # glob() order depends on the filesystem; sort for reproducible output.
+        for j in sorted(input_dir.glob("*.json")):
+            date = j.stem
+            try:
+                with j.open(encoding="utf-8") as fr:
+                    data: object = json.load(fr)
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                print(f"WARN: Could not decode JSON data for file {j}")
+                continue
+
+            # Valid JSON may still be a list or scalar, so check the top-level
+            # type before looking up the 'Versions' field.
+            versions = data.get("Versions") if isinstance(data, dict) else None
+            if not isinstance(versions, dict):
+                print(
+                    f"WARN: No correct json structure containing 'Versions' field in file {j}"
+                )
+                continue
+
+            data_versions = cast(dict[str, dict[str, int]], versions)
+            for package_name, package_versions in data_versions.items():
+                if not isinstance(package_versions, dict):
+                    print(
+                        f"WARN: No correct json version structure containing versions in the Version field in file {j}"
+                    )
+                    continue
+                for version, count in package_versions.items():
+                    writer.writerow([date, package_name, version, count])
+
+
+def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
+    """Check that *input_dir* exists and create *output_dir* if necessary.
+
+    Raises:
+        ValueError: If *input_dir* is not an existing directory.
+    """
+    if not input_dir.is_dir():
+        raise ValueError(f"Input directory does not exist: {input_dir}")
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+
+def main(input: str, output: str) -> None:
+    """Generate the package statistics CSV for the given directory paths."""
+    input_dir = Path(input)
+    output_dir = Path(output)
+    ensure_dirs(input_dir, output_dir)
+    packages_csv(input_dir, output_dir)
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) != 3:
+        print("Please provide exactly one input directory and one output directory.")
+        sys.exit(1)
+    inp = sys.argv[1]
+    out = sys.argv[2]
+    main(inp, out)
diff --git a/justfile b/justfile
index 0c45ee5..2f88a67 100644
--- a/justfile
+++ b/justfile
@@ -1,4 +1,4 @@
-default: files kernels unique
+default: files kernels unique packages
 
 files:
     python code/files.py input output
@@ -8,3 +8,6 @@ kernels:
 
 unique:
     python code/unique.py input output
+
+packages:
+    python code/packages.py input output