diff --git a/clean.py b/clean.py
index 3cb6707..5b13ac3 100644
--- a/clean.py
+++ b/clean.py
@@ -41,6 +41,83 @@ def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) ->
             # print(output_file, p_date, p_name, p_count)
 
 
+def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None:
+    if not input_dir.is_dir():
+        raise ValueError
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # only clean if raw files have been updated since last time
+    if not force and not newer_than_last_clean(
+        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs"
+    ):
+        return
+
+    output_file = output_dir / "unique_installs.csv"
+    with open(output_file, "w") as fw:
+        writer = csv.writer(fw)
+        writer.writerow(["date", "unique"])
+
+        for j in input_dir.glob("*.json"):
+            with open(j) as fr:
+                date = j.stem
+                try:
+                    data = json.load(fr)
+                except json.JSONDecodeError:
+                    print(f"WARN: Could not decode JSON data for file {j}")
+                    continue  # skip: 'data' would be undefined or stale here
+
+                if "UniqueInstalls" not in data:
+                    print(
+                        f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
+                    )
+                    continue  # skip: accessing the missing key would raise KeyError
+
+                p_date = date
+                p_count = data["UniqueInstalls"]
+                writer.writerow([p_date, p_count])
+                # print(output_file, p_date, p_count)
+
+
+def json_to_daily_kernel_csv(
+    input_dir: Path, output_dir: Path, force: bool = False
+) -> None:
+    if not input_dir.is_dir():
+        raise ValueError
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # only clean if raw files have been updated since last time
+    if not force and not newer_than_last_clean(
+        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels"
+    ):
+        return
+
+    for j in input_dir.glob("*.json"):
+        with open(j) as fr:
+            date = j.stem
+            output_file = output_dir / f"{date}.csv"
+            try:
+                data = json.load(fr)
+            except json.JSONDecodeError:
+                print(f"WARN: Could not decode JSON data for file {j}")
+                continue  # skip: 'data' would be undefined or stale here
+
+            if "XuKernel" not in data:
+                print(
+                    f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
+                )
+                continue  # skip: the loop below reads data["XuKernel"]
+
+            with open(output_file, "w") as fw:
+                writer = csv.writer(fw)
+                writer.writerow(["date", "kernel", "downloads"])
+                for entry in data["XuKernel"]:
+                    p_name = entry
+                    p_count = data["XuKernel"][entry]
+                    p_date = date
+                    writer.writerow([p_date, p_name, p_count])
+                    # print(output_file, p_date, p_name, p_count)
+
+
 def newer_than_last_clean(
     input_glob: Generator, output_glob: Generator, desc: str | None = None
 ) -> bool:
@@ -66,6 +143,10 @@ def newer_than_last_clean(
 
 def main(input: str, output: str) -> None:
     json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True)
+    json_to_unique_csv(Path(input) / "daily", Path(output), force=True)
+    json_to_daily_kernel_csv(
+        Path(input) / "daily", Path(output) / "kernels", force=True
+    )
 
 
 if __name__ == "__main__":
diff --git a/popcorn.py b/popcorn.py
index 00d8d1a..a55100b 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -36,6 +36,12 @@ def _():
     clean.json_to_daily_pkg(
         Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
     )
+    clean.json_to_unique_csv(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
+    )
+    clean.json_to_daily_kernel_csv(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
+    )
 
 
 @app.cell