From 728ec37bdab0d70bd4b3de1f2093aa23d7923ce4 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 1 Oct 2025 15:05:36 +0200 Subject: [PATCH] Modify popcorn notebook to run with new data structure --- notebooks/popcorn.py | 178 +++++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 91 deletions(-) diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py index 2361179..214b4ea 100644 --- a/notebooks/popcorn.py +++ b/notebooks/popcorn.py @@ -12,10 +12,8 @@ with app.setup: import marimo as mo import polars as pl - LIMIT_ROWS = 50_000 - DATA_RAW_DIR = "data/raw" - DATA_CLEAN_DIR = "data/cleaned" - DATA_PARQUET_DIR = "data/parquet" + LIMIT_ROWS = 500_000 + DATA_DIR = "input/popcorn/output" @app.cell(hide_code=True) @@ -29,45 +27,25 @@ def _(): return -# run data prep @app.cell def _(): - import clean - - clean.json_to_daily_pkg( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False - ) - clean.json_to_unique_csv( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False - ) - clean.json_to_daily_kernel_csv( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False - ) - - -@app.cell -def _(): - def parse_size(size_str): - try: - return float(re.search(r"(\d+.?\d+) kB", size_str).group(1)) # pyright: ignore[reportOptionalMemberAccess] - except AttributeError: - return None - - sizes_df_raw = ( - pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv") - .with_columns( - pl.col("name") - .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}") - .str.to_date() - .alias("date"), - pl.col("size") - .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32) - .alias("size_num"), + sizes_df = ( + pl.read_csv( + f"{DATA_DIR}/files.csv", + schema={ + "date": pl.Date, + "filename": pl.String, + "mtime": pl.Float32, + "filesize": pl.UInt32, + }, ) - .select(["date", "size_num", "size", "modified"]) + .with_columns( + (pl.col("filesize") / 1024).alias("filesize_kb"), + pl.from_epoch("mtime").alias("modified"), + ) 
+ .select(["date", "filesize", "filesize_kb", "modified"]) ) - sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null()) - return sizes_df, sizes_df_raw + return sizes_df @app.cell(hide_code=True) @@ -104,13 +82,14 @@ def _(): @app.cell def _(sizes_df): ( - lp.ggplot(sizes_df, lp.aes(x="date", y="size")) + lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb")) + lp.geom_point() + lp.geom_smooth(method="lm") + lp.labs( title="Size growth", subtitle="Size of daily popcorn statistics files over time", caption="Raw json file size, without any formatting, removal of markers, characters or newlines.", + y="filesize in kB", ) ) return @@ -141,20 +120,16 @@ def _(): @app.cell def _(): - df_pkg_lazy = ( - pl.scan_csv( - f"{DATA_CLEAN_DIR}/daily/*.csv", - include_file_paths="file", - schema={ - "date": pl.Date, - "package": pl.String, - "downloads": pl.UInt16, - }, - ) - .drop("file") - .fill_null(0) + df_pkg_lazy = pl.scan_csv( + f"{DATA_DIR}/packages.csv", + schema={ + "date": pl.Date, + "package": pl.String, + "version": pl.String, + "count": pl.UInt16, + }, ) - if LIMIT_ROWS: # NOTE: this is only for debugging purposes + if LIMIT_ROWS: # NOTE: this is only for debugging purposes df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS) # give small df preview df_pkg_lazy.head(100).collect(engine="streaming") @@ -183,18 +158,22 @@ def _(): def _(df_pkg_lazy: pl.LazyFrame): def _(): weekly_packages = ( - df_pkg_lazy.sort("date") + df_pkg_lazy + # .sort("date") .group_by_dynamic("date", every="1w") - .agg(pl.col("downloads").sum()) + .agg(pl.col("count").sum()) .sort("date") ) return ( - lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads")) + lp.ggplot( + weekly_packages.collect(engine="streaming"), lp.aes("date", "count") + ) + lp.geom_line() + lp.geom_smooth(method="loess") + lp.labs( title="Weekly package ownership", - caption="Count of all installed packages aggregated for each week", + subtitle="Count of all installed packages aggregated for each 
week", + y="number of packages", ) ) @@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame): .alias("weekday") ) return ( - lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads")) + lp.ggplot( + weekday_downloads.collect(engine="streaming"), + lp.aes("weekday", "count"), + ) + lp.geom_bar() + lp.labs( - title="Weekday downloads", - caption="Downloads aggregated per day of the week they took place.", + title="Ownership per weekday", + caption="Package ownership per day of the week over all time", ) ) @@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame): pl.col("date").dt.month().alias("month") ) return ( - lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads")) + lp.ggplot( + month_agg_downloads.collect(engine="streaming"), + lp.aes("month", "count"), + ) + lp.geom_bar() + lp.labs( - title="Monthwise downloads", - caption="Downloads aggregated per month of the year.", + title="Monthwise ownership", + caption="Package ownership per month of the year over all time", ) ) @@ -271,7 +256,7 @@ def _(): ( lp.ggplot( pl.read_csv( - f"{DATA_CLEAN_DIR}/unique_installs.csv", + f"{DATA_DIR}/unique_installs.csv", schema={"date": pl.Date, "unique": pl.UInt16}, ), lp.aes("date", "unique"), @@ -288,7 +273,7 @@ def _(): @app.cell def _(df_pkg_lazy: pl.LazyFrame): - df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()) + df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum()) def _(): DISPLAY_LIMIT = 20 @@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame): return lp.gggrid( [ lp.ggplot( - df_pkg_dl.sort("downloads", descending=True) + df_pkg_dl.sort("count", descending=True) .head(DISPLAY_LIMIT) .collect(engine="streaming"), - lp.aes("package", "downloads"), + lp.aes("package", "count"), ) + lp.geom_bar(stat="identity") + lp.labs( @@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame): caption="Most installed packages over all time", ), lp.ggplot( - df_pkg_dl.sort("downloads", 
descending=False) + df_pkg_dl.sort("count", descending=False) # this seems arbitrary but gives a better result? .head(DISPLAY_LIMIT) .collect(engine="streaming"), - lp.aes("package", "downloads"), + lp.aes("package", "count"), ) + lp.geom_bar(stat="identity") + lp.labs( @@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame): def _(df_pkg_dl: pl.LazyFrame): def _(): return ( - lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads")) + lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count")) + lp.geom_freqpoly(stat="bin") + lp.labs( title="Package installation count distribution", @@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame): def get_num(df: pl.LazyFrame) -> int: return df.count().collect(engine="streaming").item(0, 0) - one_install = df_pkg_dl.sort("downloads", descending=False).filter( - pl.col("downloads") == 1 + one_ten_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 1) & (pl.col("count") < 10) ) - two_installs = df_pkg_dl.sort("downloads", descending=False).filter( - (pl.col("downloads") >= 2) & (pl.col("downloads") < 10) + ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 10) & (pl.col("count") < 20) ) - three_installs = df_pkg_dl.sort("downloads", descending=False).filter( - (pl.col("downloads") >= 10) & (pl.col("downloads") < 20) + twenty_thirty = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 20) & (pl.col("count") < 30) + ) + thirty_plus = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 30) ) # TODO: Fix for new filters above return mo.md(rf""" - There are {get_num(one_install)} packages which have exactly a single - installation in the data, {get_num(two_installs)} packages with exactly - two installations, and {get_num(three_installs)} packages with exactly - three. 
+ There are {get_num(one_ten_installs):,} packages which have between one + and nine installations in the data, {get_num(ten_twenty_installs):,} + packages between ten and 19 installations, and + {get_num(twenty_thirty):,} packages between 20 and 29 installations. + {get_num(thirty_plus):,} packages have 30 or more installations. """) @@ -381,7 +370,7 @@ def _(): def _(): kernel_df_lazy = ( pl.scan_csv( - f"{DATA_CLEAN_DIR}/kernels/*.csv", + f"{DATA_DIR}/kernels.csv", schema={ "date": pl.Date, "kernel": pl.String, @@ -400,7 +389,9 @@ def _(): ) kernel_df_v99 = ( - kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date") + kernel_df_lazy.filter(pl.col("major_ver") == 99) + .collect(engine="streaming") + .select("date") ) kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99) @@ -509,9 +500,14 @@ def _(): @app.cell -def _(sizes_df_raw): - sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null()) - sizes_df_null.select(["date", "size"]).style.tab_header( +def _(sizes_df: pl.DataFrame): + date_range = pl.date_range( + sizes_df.select("date").min().item(), sizes_df.select("date").max().item() + ) + + pl.DataFrame().select(date_range).filter( + ~date_range.is_in(sizes_df["date"].implode()) + ).style.tab_header( title="Missing Days", subtitle="Days with 0B size due to missing on the popcorn server.", ) @@ -521,18 +517,16 @@ def _(sizes_df_raw): @app.cell def _(sizes_df): def _(): - different_modification_date = sizes_df.with_columns( - pl.col("modified") - .str.to_datetime(format="%F %T %:z", strict=False) - .alias("modified_dt"), - ).filter(pl.col("date") != pl.col("modified_dt").dt.date()) + different_modification_date = sizes_df.filter( + pl.col("date") != pl.col("modified").dt.date() + ) # This does not work well what are we showing? # 'true' capture date on X but then what on Y - the # same date for each? the difference in dt? 
return ( lp.ggplot( different_modification_date, - lp.aes("date", "modified_dt"), + lp.aes("date", "modified"), ) + lp.geom_freqpoly() ) @@ -543,6 +537,8 @@ def _(sizes_df): # further ideas: # +# _relative_ package amounts: absolute packages counts / absolute unique installs +# # - daily download habits: # - are we downloading further spread of versions on specific days # - are there 'update' days, where things converge? specific weekday/on holidays/etc?