From 91d64f428ca5a3f256d39d0e3d08bc3fb9931bcd Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Mon, 29 Sep 2025 16:17:27 +0200 Subject: [PATCH] Add data cleaning script --- clean.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 clean.py diff --git a/clean.py b/clean.py new file mode 100644 index 0000000..1cbecbf --- /dev/null +++ b/clean.py @@ -0,0 +1,70 @@ +import csv +import json +from datetime import datetime +from pathlib import Path + + +def json_to_daily_pkg( + input_dir: Path, output_dir: Path, force: bool = False +) -> int | None: + if not input_dir.is_dir(): + raise ValueError + output_dir.mkdir(exist_ok=True, parents=True) + + # only clean if raw files have been updated since last time + if not force: + last_raw_update: datetime = datetime(1, 1, 1) + oldest_clean: datetime = datetime.now() + for file in input_dir.glob("*.json"): + mdate = datetime.fromtimestamp(file.stat().st_mtime) + if mdate > last_raw_update: + last_raw_update = mdate + for file in output_dir.glob("*.csv"): + mdate = datetime.fromtimestamp(file.stat().st_mtime) + if mdate < oldest_clean: + oldest_clean = mdate + if oldest_clean > last_raw_update: + print( + "INFO: Skip creating cleaned data," + f" last processing {oldest_clean} newer than {last_raw_update}" + ) + return + + for j in input_dir.glob("*.json"): + with open(j) as fr: + date = j.stem + output_file = output_dir / f"{date}.csv" + try: + data = json.load(fr) + except json.JSONDecodeError: + print(f"WARN: Could not decode JSON data for file {j}") + + if "Packages" not in data: + print( + f"WARN: No correct json structure containing 'Packages' field in file {j}" + ) + + with open(output_file, "w") as fw: + writer = csv.writer(fw) + writer.writerow(["date", "package", "downloads"]) + for entry in data["Packages"]: + p_name = entry + p_count = data["Packages"][entry] + p_date = date + writer.writerow([p_date, p_name, p_count]) + # print(output_file, p_date, p_name, p_count) + + +def main(input: str, output: str) -> None: + json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True) + + +if __name__ == "__main__": + import sys + + if not len(sys.argv) == 3: + print("Please provide exactly one input directory and one output directory.") + sys.exit(1) + inp = sys.argv[1] + out = sys.argv[2] + main(inp, out)