Modify popcorn notebook to run with new data structure
parent
9628d7a4d8
commit
728ec37bda
1 changed file with 87 additions and 91 deletions
@@ -12,10 +12,8 @@ with app.setup:
     import marimo as mo
     import polars as pl

-    LIMIT_ROWS = 50_000
-    DATA_RAW_DIR = "data/raw"
-    DATA_CLEAN_DIR = "data/cleaned"
-    DATA_PARQUET_DIR = "data/parquet"
+    LIMIT_ROWS = 500_000
+    DATA_DIR = "input/popcorn/output"


 @app.cell(hide_code=True)
@@ -29,45 +27,25 @@ def _():
     return


-# run data prep
-@app.cell
-def _():
-    import clean
-
-    clean.json_to_daily_pkg(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
-    )
-    clean.json_to_unique_csv(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
-    )
-    clean.json_to_daily_kernel_csv(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
-    )
-
-
 @app.cell
 def _():
-    def parse_size(size_str):
-        try:
-            return float(re.search(r"(\d+.?\d+) kB", size_str).group(1))  # pyright: ignore[reportOptionalMemberAccess]
-        except AttributeError:
-            return None
-
-    sizes_df_raw = (
-        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
-        .with_columns(
-            pl.col("name")
-            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date"),
-            pl.col("size")
-            .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
-            .alias("size_num"),
-        )
-        .select(["date", "size_num", "size", "modified"])
-    )
-    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
-    return sizes_df, sizes_df_raw
+    sizes_df = (
+        pl.read_csv(
+            f"{DATA_DIR}/files.csv",
+            schema={
+                "date": pl.Date,
+                "filename": pl.String,
+                "mtime": pl.Float32,
+                "filesize": pl.UInt32,
+            },
+        )
+        .with_columns(
+            (pl.col("filesize") / 1024).alias("filesize_kb"),
+            pl.from_epoch("mtime").alias("modified"),
+        )
+        .select(["date", "filesize", "filesize_kb", "modified"])
+    )
+    return sizes_df


 @app.cell(hide_code=True)
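A note on the replacement cell above: pl.from_epoch interprets the numeric mtime column as Unix epoch seconds by default and yields a datetime column, and the / 1024 division gives kilobytes as floats. A minimal sketch with made-up values (this frame is illustrative, not from the data):

    import polars as pl

    df = pl.DataFrame({"mtime": [1_700_000_000], "filesize": [2048]})
    out = df.with_columns(
        pl.from_epoch("mtime").alias("modified"),          # epoch seconds -> pl.Datetime
        (pl.col("filesize") / 1024).alias("filesize_kb"),  # bytes -> kB, here 2.0
    )
    print(out)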
@@ -104,13 +82,14 @@ def _():
 @app.cell
 def _(sizes_df):
     (
-        lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
+        lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb"))
         + lp.geom_point()
         + lp.geom_smooth(method="lm")
         + lp.labs(
             title="Size growth",
             subtitle="Size of daily popcorn statistics files over time",
             caption="Raw JSON file size, before any formatting or removal of markers, characters, or newlines.",
+            y="filesize in kB",
         )
     )
     return
@@ -141,20 +120,16 @@ def _():

 @app.cell
 def _():
-    df_pkg_lazy = (
-        pl.scan_csv(
-            f"{DATA_CLEAN_DIR}/daily/*.csv",
-            include_file_paths="file",
-            schema={
-                "date": pl.Date,
-                "package": pl.String,
-                "downloads": pl.UInt16,
-            },
-        )
-        .drop("file")
-        .fill_null(0)
-    )
+    df_pkg_lazy = pl.scan_csv(
+        f"{DATA_DIR}/packages.csv",
+        schema={
+            "date": pl.Date,
+            "package": pl.String,
+            "version": pl.String,
+            "count": pl.UInt16,
+        },
+    )
     if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
         df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
     # give small df preview
     df_pkg_lazy.head(100).collect(engine="streaming")
@@ -183,18 +158,22 @@ def _():
 def _(df_pkg_lazy: pl.LazyFrame):
     def _():
         weekly_packages = (
-            df_pkg_lazy.sort("date")
-            .group_by_dynamic("date", every="1w")
-            .agg(pl.col("downloads").sum())
+            df_pkg_lazy
+            # .sort("date")
+            .group_by_dynamic("date", every="1w")
+            .agg(pl.col("count").sum())
+            .sort("date")
         )
         return (
-            lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads"))
+            lp.ggplot(
+                weekly_packages.collect(engine="streaming"), lp.aes("date", "count")
+            )
             + lp.geom_line()
             + lp.geom_smooth(method="loess")
             + lp.labs(
                 title="Weekly package ownership",
-                caption="Count of all installed packages aggregated for each week",
+                subtitle="Count of all installed packages aggregated for each week",
                 y="number of packages",
             )
         )
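One caveat with this hunk: polars' group_by_dynamic assumes its index column is sorted ascending, so commenting out the leading .sort("date") and sorting only after the aggregation is safe only if packages.csv is already written in date order. A minimal sketch of the safe pattern, on a hypothetical frame:

    import polars as pl
    from datetime import date

    lf = pl.LazyFrame({
        "date": [date(2024, 1, 8), date(2024, 1, 1), date(2024, 1, 2)],
        "count": [3, 1, 2],
    })
    weekly = (
        lf.sort("date")  # group_by_dynamic expects a sorted index column
        .group_by_dynamic("date", every="1w")
        .agg(pl.col("count").sum())
        .collect()
    )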
@@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
             .alias("weekday")
         )
         return (
-            lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
+            lp.ggplot(
+                weekday_downloads.collect(engine="streaming"),
+                lp.aes("weekday", "count"),
+            )
             + lp.geom_bar()
             + lp.labs(
-                title="Weekday downloads",
-                caption="Downloads aggregated per day of the week they took place.",
+                title="Ownership per weekday",
+                caption="Package ownership per day of the week over all time",
             )
         )
@@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
             pl.col("date").dt.month().alias("month")
         )
         return (
-            lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
+            lp.ggplot(
+                month_agg_downloads.collect(engine="streaming"),
+                lp.aes("month", "count"),
+            )
             + lp.geom_bar()
             + lp.labs(
-                title="Monthwise downloads",
-                caption="Downloads aggregated per month of the year.",
+                title="Monthwise ownership",
+                caption="Package ownership per month of the year over all time",
             )
         )
@@ -271,7 +256,7 @@ def _():
     (
         lp.ggplot(
             pl.read_csv(
-                f"{DATA_CLEAN_DIR}/unique_installs.csv",
+                f"{DATA_DIR}/unique_installs.csv",
                 schema={"date": pl.Date, "unique": pl.UInt16},
             ),
             lp.aes("date", "unique"),
@@ -288,7 +273,7 @@ def _():

 @app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
-    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
+    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())

     def _():
         DISPLAY_LIMIT = 20
@@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame):
         return lp.gggrid(
             [
                 lp.ggplot(
-                    df_pkg_dl.sort("downloads", descending=True)
+                    df_pkg_dl.sort("count", descending=True)
                     .head(DISPLAY_LIMIT)
                     .collect(engine="streaming"),
-                    lp.aes("package", "downloads"),
+                    lp.aes("package", "count"),
                 )
                 + lp.geom_bar(stat="identity")
                 + lp.labs(
@@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame):
                     caption="Most installed packages over all time",
                 ),
                 lp.ggplot(
-                    df_pkg_dl.sort("downloads", descending=False)
+                    df_pkg_dl.sort("count", descending=False)
                     # this seems arbitrary but gives a better result?
                     .head(DISPLAY_LIMIT)
                     .collect(engine="streaming"),
-                    lp.aes("package", "downloads"),
+                    lp.aes("package", "count"),
                 )
                 + lp.geom_bar(stat="identity")
                 + lp.labs(
@@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
 def _(df_pkg_dl: pl.LazyFrame):
     def _():
         return (
-            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads"))
+            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
             + lp.geom_freqpoly(stat="bin")
             + lp.labs(
                 title="Package installation count distribution",
@@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame):
     def get_num(df: pl.LazyFrame) -> int:
         return df.count().collect(engine="streaming").item(0, 0)

-    one_install = df_pkg_dl.sort("downloads", descending=False).filter(
-        pl.col("downloads") == 1
-    )
-    two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
-        (pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
-    )
-    three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
-        (pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
-    )
+    one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 1) & (pl.col("count") < 10)
+    )
+    ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 10) & (pl.col("count") < 20)
+    )
+    twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 20) & (pl.col("count") < 30)
+    )
+    thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
+        pl.col("count") >= 30
+    )
     return mo.md(rf"""

-    There are {get_num(one_install)} packages which have exactly a single
-    installation in the data, {get_num(two_installs)} packages with exactly
-    two installations, and {get_num(three_installs)} packages with exactly
-    three.
+    There are {get_num(one_ten_installs):,} packages with between 1 and 9
+    installations in the data, {get_num(ten_twenty_installs):,} packages
+    with between 10 and 19 installations, and {get_num(twenty_thirty):,}
+    packages with between 20 and 29 installations.
+    {get_num(thirty_plus):,} packages have 30 or more installations.

     """)
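Each get_num call above collects separately, so the four buckets cost four passes over df_pkg_dl. If that ever matters, a single-pass alternative is Expr.cut with the same bucket edges; a sketch assuming the count column from this cell:

    bucket_counts = (
        df_pkg_dl.with_columns(
            pl.col("count")
            .cut(breaks=[10, 20, 30], labels=["1-9", "10-19", "20-29", "30+"], left_closed=True)
            .alias("bucket")
        )
        .group_by("bucket")
        .agg(pl.len())
        .collect(engine="streaming")
    )

left_closed=True makes each interval closed on the left ([10, 20) and so on), matching the >=/< filters above.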
@@ -381,7 +370,7 @@ def _():
     def _():
         kernel_df_lazy = (
             pl.scan_csv(
-                f"{DATA_CLEAN_DIR}/kernels/*.csv",
+                f"{DATA_DIR}/kernels.csv",
                 schema={
                     "date": pl.Date,
                     "kernel": pl.String,
@@ -400,7 +389,9 @@ def _():
         )

         kernel_df_v99 = (
-            kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date")
+            kernel_df_lazy.filter(pl.col("major_ver") == 99)
+            .collect(engine="streaming")
+            .select("date")
         )
         kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
@@ -509,9 +500,14 @@ def _():


 @app.cell
-def _(sizes_df_raw):
-    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
-    sizes_df_null.select(["date", "size"]).style.tab_header(
+def _(sizes_df: pl.DataFrame):
+    date_range = pl.date_range(
+        sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
+    )
+
+    pl.DataFrame().select(date_range).filter(
+        ~date_range.is_in(sizes_df["date"].implode())
+    ).style.tab_header(
         title="Missing Days",
-        subtitle="Days with 0B size due to missing on the popcorn server.",
+        subtitle="Days missing on the popcorn server (0 B file size).",
     )
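The same missing-days set can also be written as an anti-join, avoiding the implode/is_in detour; a sketch assuming sizes_df has one row per present date:

    all_days = pl.DataFrame().select(
        pl.date_range(sizes_df["date"].min(), sizes_df["date"].max()).alias("date")
    )
    missing = all_days.join(sizes_df.select("date"), on="date", how="anti")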
@@ -521,18 +517,16 @@ def _(sizes_df):
 @app.cell
 def _(sizes_df):
     def _():
-        different_modification_date = sizes_df.with_columns(
-            pl.col("modified")
-            .str.to_datetime(format="%F %T %:z", strict=False)
-            .alias("modified_dt"),
-        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
+        different_modification_date = sizes_df.filter(
+            pl.col("date") != pl.col("modified").dt.date()
+        )
         # This does not work well; what are we showing? The 'true' capture
         # date on X, but then what on Y: the same date for each? The
         # difference in dt?
         return (
             lp.ggplot(
                 different_modification_date,
-                lp.aes("date", "modified_dt"),
+                lp.aes("date", "modified"),
             )
             + lp.geom_freqpoly()
         )
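One possible answer to the question in those comments: plot the lag between the capture date and the modification time, which gives one number per file for the Y axis. A hedged sketch (lag_df and lag_days are hypothetical names):

    lag_df = different_modification_date.with_columns(
        (pl.col("modified").dt.date() - pl.col("date")).dt.total_days().alias("lag_days")
    )
    plot = lp.ggplot(lag_df, lp.aes("date", "lag_days")) + lp.geom_point()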
@@ -543,6 +537,8 @@ def _(sizes_df):

 # further ideas:
 #
+# _relative_ package amounts: absolute package counts / absolute unique installs
+#
 # - daily download habits:
 #   - are we downloading further spread of versions on specific days
 #   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
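A sketch of the relative-package-amounts idea, wiring together pieces that already exist in this commit (the daily_totals and relative names are hypothetical): divide the daily package totals from df_pkg_lazy by the daily unique installs.

    daily_totals = df_pkg_lazy.group_by("date").agg(pl.col("count").sum())
    uniques = pl.scan_csv(
        f"{DATA_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
    )
    relative = daily_totals.join(uniques, on="date").with_columns(
        (pl.col("count") / pl.col("unique")).alias("relative_count")
    )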