From 66b04648097ccd0f15dc49ae35a85237e891ed63 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Wed, 1 Oct 2025 14:42:07 +0200
Subject: [PATCH] Remove redundant cleaning files

---
 clean.py                           | 156 -----------------------------
 grab_all.sh                        |  13 ---
 popcorn.py => notebooks/popcorn.py |   0
 3 files changed, 169 deletions(-)
 delete mode 100644 clean.py
 delete mode 100755 grab_all.sh
 rename popcorn.py => notebooks/popcorn.py (100%)

diff --git a/clean.py b/clean.py
deleted file mode 100644
index 5b13ac3..0000000
--- a/clean.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import csv
-import json
-from datetime import datetime
-from pathlib import Path
-from typing import Generator
-
-
-def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages"
-    ):
-        return
-
-    for j in input_dir.glob("*.json"):
-        with open(j) as fr:
-            date = j.stem
-            output_file = output_dir / f"{date}.csv"
-            try:
-                data = json.load(fr)
-            except json.JSONDecodeError:
-                print(f"WARN: Could not decode JSON data for file {j}")
-
-            if "Packages" not in data:
-                print(
-                    f"WARN: No correct json structure containing 'Packages' field in file {j}"
-                )
-
-            with open(output_file, "w") as fw:
-                writer = csv.writer(fw)
-                writer.writerow(["date", "package", "downloads"])
-                for entry in data["Packages"]:
-                    p_name = entry
-                    p_count = data["Packages"][entry]
-                    p_date = date
-                    writer.writerow([p_date, p_name, p_count])
-                    # print(output_file, p_date, p_name, p_count)
-
-
-def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs"
-    ):
-        return
-
-    output_file = output_dir / "unique_installs.csv"
-    with open(output_file, "w") as fw:
-        writer = csv.writer(fw)
-        writer.writerow(["date", "unique"])
-
-        for j in input_dir.glob("*.json"):
-            with open(j) as fr:
-                date = j.stem
-                try:
-                    data = json.load(fr)
-                except json.JSONDecodeError:
-                    print(f"WARN: Could not decode JSON data for file {j}")
-
-                if "UniqueInstalls" not in data:
-                    print(
-                        f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
-                    )
-
-                p_date = date
-                p_count = data["UniqueInstalls"]
-                writer.writerow([p_date, p_count])
-                # print(output_file, p_date, p_count)
-
-
-def json_to_daily_kernel_csv(
-    input_dir: Path, output_dir: Path, force: bool = False
-) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels"
-    ):
-        return
-
-    for j in input_dir.glob("*.json"):
-        with open(j) as fr:
-            date = j.stem
-            output_file = output_dir / f"{date}.csv"
-            try:
-                data = json.load(fr)
-            except json.JSONDecodeError:
-                print(f"WARN: Could not decode JSON data for file {j}")
-
-            if "Packages" not in data:
-                print(
-                    f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
-                )
-
-            with open(output_file, "w") as fw:
-                writer = csv.writer(fw)
-                writer.writerow(["date", "kernel", "downloads"])
-                for entry in data["XuKernel"]:
-                    p_name = entry
-                    p_count = data["XuKernel"][entry]
-                    p_date = date
-                    writer.writerow([p_date, p_name, p_count])
-                    # print(output_file, p_date, p_name, p_count)
-
-
-def newer_than_last_clean(
-    input_glob: Generator, output_glob: Generator, desc: str | None = None
-) -> bool:
-    last_raw_update: datetime = datetime(1, 1, 1)
-    oldest_clean: datetime = datetime.now()
-    for file in input_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate > last_raw_update:
-            last_raw_update = mdate
-    for file in output_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate < oldest_clean:
-            oldest_clean = mdate
-    if oldest_clean > last_raw_update:
-        print(
-            "INFO: Skip creating cleaned data"
-            f"{f' for {desc}' if desc else ''}"
-            f", last processing {oldest_clean} newer than {last_raw_update}"
-        )
-        return False
-    return True
-
-
-def main(input: str, output: str) -> None:
-    json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True)
-    json_to_unique_csv(Path(input) / "daily", Path(output), force=True)
-    json_to_daily_kernel_csv(
-        Path(input) / "daily", Path(output) / "kernels", force=True
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if not len(sys.argv) == 3:
-        print("Please provide exactly one input directory and one output directory.")
-        sys.exit(1)
-    inp = sys.argv[1]
-    out = sys.argv[2]
-    main(inp, out)
diff --git a/grab_all.sh b/grab_all.sh
deleted file mode 100755
index 24e2c1c..0000000
--- a/grab_all.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-base_url="https://popcorn.voidlinux.org/popcorn_"
-ft=".json"
-
-start_date="2018-05-08"
-end_date="2025-09-25"
-while [[ "$start_date" != "$end_date" ]]; do
-    echo "$start_date"
-    start_date=$(date --date "$start_date + 1 day" +"%Y-%m-%d")
-    wget -O "${start_date}.json" "${base_url}${start_date}${ft}"
-done
-Copy
diff --git a/popcorn.py b/notebooks/popcorn.py
similarity index 100%
rename from popcorn.py
rename to notebooks/popcorn.py