Adapt functions to new csv data layout

2025-09-29 16:17:27 +02:00 · 2025-09-29 16:17:27 +02:00 · 4c9518cf67
commit 4c9518cf67
parent 91d64f428c
1 changed file with 41 additions and 21 deletions
--- a/popcorn.py
+++ b/popcorn.py
@ -6,12 +6,15 @@ app = marimo.App(width="medium")
 with app.setup:
    # Initialization code that runs before all other cells
    import re
+    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl

-    LIMIT_ROWS = 200
+    LIMIT_ROWS = 500_000
+    DATA_RAW_DIR = "data/raw"
+    DATA_CLEAN_DIR = "data/cleaned"


@app.cell(hide_code=True)
@ -25,6 +28,16 @@ def _():
    return


+# run data prep
+@app.cell
+def _():
+    import clean
+
+    clean.json_to_daily_pkg(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
+    )
+
+
@app.cell
 def _():
    def parse_size(size_str):
@ -34,7 +47,7 @@ def _():
            return None

    sizes_df_raw = (
-        pl.read_csv("data/file_sizes.csv")
+        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
        .with_columns(
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
@ -118,21 +131,21 @@ def _():

@app.cell
 def _():
-    df_lazy = (
-        pl.scan_ndjson("data/daily/*", include_file_paths="file")
-        .head(LIMIT_ROWS)  # FIXME: take out after debug
-        .with_columns(
-            pl.col("file")
-            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date")
-        )
-    )
    df_pkg_lazy = (
-        df_lazy.select("date", pl.col("Packages").struct.unnest())
+        pl.scan_csv(
+            f"{DATA_CLEAN_DIR}/daily/*.csv",
+            include_file_paths="file",
+            schema={
+                "date": pl.Date,
+                "package": pl.String,
+                "downloads": pl.UInt16,
+            },
+        )
+        .drop("file")
        .fill_null(0)
-        .unpivot(index="date", variable_name="package", value_name="downloads")
+        .head(LIMIT_ROWS)  # FIXME: take out after debug
    )
+    df_pkg_lazy.collect()
    return


@ -281,6 +294,20 @@ def _(df_pkg_lazy: pl.LazyFrame):
    return


+# Which kernels have been downloaded, and when? (simplified for semver)
+@app.cell
+def _(df_lazy):
+    kernel_df_lazy = df_lazy.select("date", "XuKernel")
+    kernel_df = (
+        kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest())
+        .fill_null(0)
+        # .unpivot(index="date", variable_name="kernel", value_name="downloads")
+        .collect()
+    )
+
+    df_lazy.collect()
+
+
@app.cell
 def _():
    mo.md(
@ -325,19 +352,12 @@ def _(sizes_df):
    return


-@app.cell
-def _(df_lazy):
-    kernel_df = df_lazy.select("date", pl.col("Kernels").struct.unnest())
-    kernel_df
-
-
 # further ideas:
 #
 # - daily download habits:
 #   - are we downloading further spread of versions on specific days
 #   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
 #
-# - which kernels have been DL when? (simplified for semver)
 # - when did specific kernels enter the repos?
 #
 # - which arches are/were most prevalent over time?