From 66b04648097ccd0f15dc49ae35a85237e891ed63 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Wed, 1 Oct 2025 14:42:07 +0200
Subject: [PATCH] Remove redundant cleaning files

---
 clean.py                           | 156 -----------------------------
 grab_all.sh                        |  13 ---
 popcorn.py => notebooks/popcorn.py |   0
 3 files changed, 169 deletions(-)
 delete mode 100644 clean.py
 delete mode 100755 grab_all.sh
 rename popcorn.py => notebooks/popcorn.py (100%)

diff --git a/clean.py b/clean.py
deleted file mode 100644
index 5b13ac3..0000000
--- a/clean.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import csv
-import json
-from datetime import datetime
-from pathlib import Path
-from typing import Generator
-
-
-def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages"
-    ):
-        return
-
-    for j in input_dir.glob("*.json"):
-        with open(j) as fr:
-            date = j.stem
-            output_file = output_dir / f"{date}.csv"
-            try:
-                data = json.load(fr)
-            except json.JSONDecodeError:
-                print(f"WARN: Could not decode JSON data for file {j}")
-
-            if "Packages" not in data:
-                print(
-                    f"WARN: No correct json structure containing 'Packages' field in file {j}"
-                )
-
-            with open(output_file, "w") as fw:
-                writer = csv.writer(fw)
-                writer.writerow(["date", "package", "downloads"])
-                for entry in data["Packages"]:
-                    p_name = entry
-                    p_count = data["Packages"][entry]
-                    p_date = date
-                    writer.writerow([p_date, p_name, p_count])
-                    # print(output_file, p_date, p_name, p_count)
-
-
-def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs"
-    ):
-        return
-
-    output_file = output_dir / "unique_installs.csv"
-    with open(output_file, "w") as fw:
-        writer = csv.writer(fw)
-        writer.writerow(["date", "unique"])
-
-        for j in input_dir.glob("*.json"):
-            with open(j) as fr:
-                date = j.stem
-                try:
-                    data = json.load(fr)
-                except json.JSONDecodeError:
-                    print(f"WARN: Could not decode JSON data for file {j}")
-
-                if "UniqueInstalls" not in data:
-                    print(
-                        f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
-                    )
-
-                p_date = date
-                p_count = data["UniqueInstalls"]
-                writer.writerow([p_date, p_count])
-                # print(output_file, p_date, p_count)
-
-
-def json_to_daily_kernel_csv(
-    input_dir: Path, output_dir: Path, force: bool = False
-) -> None:
-    if not input_dir.is_dir():
-        raise ValueError
-    output_dir.mkdir(exist_ok=True, parents=True)
-
-    # only clean if raw files have been updated since last time
-    if not force and not newer_than_last_clean(
-        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels"
-    ):
-        return
-
-    for j in input_dir.glob("*.json"):
-        with open(j) as fr:
-            date = j.stem
-            output_file = output_dir / f"{date}.csv"
-            try:
-                data = json.load(fr)
-            except json.JSONDecodeError:
-                print(f"WARN: Could not decode JSON data for file {j}")
-
-            if "Packages" not in data:
-                print(
-                    f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
-                )
-
-            with open(output_file, "w") as fw:
-                writer = csv.writer(fw)
-                writer.writerow(["date", "kernel", "downloads"])
-                for entry in data["XuKernel"]:
-                    p_name = entry
-                    p_count = data["XuKernel"][entry]
-                    p_date = date
-                    writer.writerow([p_date, p_name, p_count])
-                    # print(output_file, p_date, p_name, p_count)
-
-
-def newer_than_last_clean(
-    input_glob: Generator, output_glob: Generator, desc: str | None = None
-) -> bool:
-    last_raw_update: datetime = datetime(1, 1, 1)
-    oldest_clean: datetime = datetime.now()
-    for file in input_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate > last_raw_update:
-            last_raw_update = mdate
-    for file in output_glob:
-        mdate = datetime.fromtimestamp(file.stat().st_mtime)
-        if mdate < oldest_clean:
-            oldest_clean = mdate
-    if oldest_clean > last_raw_update:
-        print(
-            "INFO: Skip creating cleaned data"
-            f"{f' for {desc}' if desc else ''}"
-            f", last processing {oldest_clean} newer than {last_raw_update}"
-        )
-        return False
-    return True
-
-
-def main(input: str, output: str) -> None:
-    json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True)
-    json_to_unique_csv(Path(input) / "daily", Path(output), force=True)
-    json_to_daily_kernel_csv(
-        Path(input) / "daily", Path(output) / "kernels", force=True
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if not len(sys.argv) == 3:
-        print("Please provide exactly one input directory and one output directory.")
-        sys.exit(1)
-    inp = sys.argv[1]
-    out = sys.argv[2]
-    main(inp, out)
diff --git a/grab_all.sh b/grab_all.sh
deleted file mode 100755
index 24e2c1c..0000000
--- a/grab_all.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-base_url="https://popcorn.voidlinux.org/popcorn_"
-ft=".json"
-
-start_date="2018-05-08"
-end_date="2025-09-25"
-while [[ "$start_date" != "$end_date" ]]; do
-    echo "$start_date"
-    start_date=$(date --date "$start_date + 1 day" +"%Y-%m-%d")
-    wget -O "${start_date}.json" "${base_url}${start_date}${ft}"
-done
-Copy
diff --git a/popcorn.py b/notebooks/popcorn.py
similarity index 100%
rename from popcorn.py
rename to notebooks/popcorn.py