import csv import json from datetime import datetime from pathlib import Path from typing import Generator def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None: if not input_dir.is_dir(): raise ValueError output_dir.mkdir(exist_ok=True, parents=True) # only clean if raw files have been updated since last time if not force and not newer_than_last_clean( input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages" ): return for j in input_dir.glob("*.json"): with open(j) as fr: date = j.stem output_file = output_dir / f"{date}.csv" try: data = json.load(fr) except json.JSONDecodeError: print(f"WARN: Could not decode JSON data for file {j}") if "Packages" not in data: print( f"WARN: No correct json structure containing 'Packages' field in file {j}" ) with open(output_file, "w") as fw: writer = csv.writer(fw) writer.writerow(["date", "package", "downloads"]) for entry in data["Packages"]: p_name = entry p_count = data["Packages"][entry] p_date = date writer.writerow([p_date, p_name, p_count]) # print(output_file, p_date, p_name, p_count) def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None: if not input_dir.is_dir(): raise ValueError output_dir.mkdir(exist_ok=True, parents=True) # only clean if raw files have been updated since last time if not force and not newer_than_last_clean( input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs" ): return output_file = output_dir / "unique_installs.csv" with open(output_file, "w") as fw: writer = csv.writer(fw) writer.writerow(["date", "unique"]) for j in input_dir.glob("*.json"): with open(j) as fr: date = j.stem try: data = json.load(fr) except json.JSONDecodeError: print(f"WARN: Could not decode JSON data for file {j}") if "UniqueInstalls" not in data: print( f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}" ) p_date = date p_count = data["UniqueInstalls"] writer.writerow([p_date, p_count]) # print(output_file, p_date, p_count) def json_to_daily_kernel_csv( input_dir: Path, output_dir: Path, force: bool = False ) -> None: if not input_dir.is_dir(): raise ValueError output_dir.mkdir(exist_ok=True, parents=True) # only clean if raw files have been updated since last time if not force and not newer_than_last_clean( input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels" ): return for j in input_dir.glob("*.json"): with open(j) as fr: date = j.stem output_file = output_dir / f"{date}.csv" try: data = json.load(fr) except json.JSONDecodeError: print(f"WARN: Could not decode JSON data for file {j}") if "Packages" not in data: print( f"WARN: No correct json structure containing 'XuKernel' field in file {j}" ) with open(output_file, "w") as fw: writer = csv.writer(fw) writer.writerow(["date", "kernel", "downloads"]) for entry in data["XuKernel"]: p_name = entry p_count = data["XuKernel"][entry] p_date = date writer.writerow([p_date, p_name, p_count]) # print(output_file, p_date, p_name, p_count) def newer_than_last_clean( input_glob: Generator, output_glob: Generator, desc: str | None = None ) -> bool: last_raw_update: datetime = datetime(1, 1, 1) oldest_clean: datetime = datetime.now() for file in input_glob: mdate = datetime.fromtimestamp(file.stat().st_mtime) if mdate > last_raw_update: last_raw_update = mdate for file in output_glob: mdate = datetime.fromtimestamp(file.stat().st_mtime) if mdate < oldest_clean: oldest_clean = mdate if oldest_clean > last_raw_update: print( "INFO: Skip creating cleaned data" f"{f' for {desc}' if desc else ''}" f", last processing {oldest_clean} newer than {last_raw_update}" ) return False return True def main(input: str, output: str) -> None: json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True) json_to_unique_csv(Path(input) / "daily", Path(output), force=True) json_to_daily_kernel_csv( Path(input) / "daily", Path(output) / "kernels", force=True ) if __name__ == "__main__": import sys if not len(sys.argv) == 3: print("Please provide exactly one input directory and one output directory.") sys.exit(1) inp = sys.argv[1] out = sys.argv[2] main(inp, out)