diff --git a/popcorn.py b/popcorn.py
index d095f4d..cb09ba5 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -6,12 +6,15 @@ app = marimo.App(width="medium")
 with app.setup:
     # Initialization code that runs beimpofore all other cells
     import re
+    from pathlib import Path
 
     import lets_plot as lp
     import marimo as mo
     import polars as pl
 
-    LIMIT_ROWS = 200
+    LIMIT_ROWS = 500_000
+    DATA_RAW_DIR = "data/raw"
+    DATA_CLEAN_DIR = "data/cleaned"
 
 
 @app.cell(hide_code=True)
@@ -25,6 +28,16 @@ def _():
     return
 
 
+# run data prep
+@app.cell
+def _():
+    import clean
+
+    clean.json_to_daily_pkg(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
+    )
+
+
 @app.cell
 def _():
     def parse_size(size_str):
@@ -34,7 +47,7 @@ def _():
             return None
 
     sizes_df_raw = (
-        pl.read_csv("data/file_sizes.csv")
+        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
         .with_columns(
             pl.col("name")
             .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
@@ -118,21 +131,21 @@ def _():
 
 @app.cell
 def _():
-    df_lazy = (
-        pl.scan_ndjson("data/daily/*", include_file_paths="file")
-        .head(LIMIT_ROWS)  # FIXME: take out after debug
-        .with_columns(
-            pl.col("file")
-            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date")
-        )
-    )
     df_pkg_lazy = (
-        df_lazy.select("date", pl.col("Packages").struct.unnest())
+        pl.scan_csv(
+            f"{DATA_CLEAN_DIR}/daily/*.csv",
+            include_file_paths="file",
+            schema={
+                "date": pl.Date,
+                "package": pl.String,
+                "downloads": pl.UInt16,
+            },
+        )
+        .drop("file")
         .fill_null(0)
-        .unpivot(index="date", variable_name="package", value_name="downloads")
+        .head(LIMIT_ROWS)  # FIXME: take out after debug
     )
+    df_pkg_lazy.collect()
     return
 
 
@@ -281,6 +294,20 @@ def _(df_pkg_lazy: pl.LazyFrame):
     return
 
 
+# - which kernels have been DL when? (simplified for semver)
+@app.cell
+def _(df_lazy):
+    kernel_df_lazy = df_lazy.select("date", "XuKernel")
+    kernel_df = (
+        kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest())
+        .fill_null(0)
+        # .unpivot(index="date", variable_name="kernel", value_name="downloads")
+        .collect()
+    )
+
+    df_lazy.collect()
+
+
 @app.cell
 def _():
     mo.md(
@@ -325,19 +352,12 @@ def _(sizes_df):
     return
 
 
-@app.cell
-def _(df_lazy):
-    kernel_df = df_lazy.select("date", pl.col("Kernels").struct.unnest())
-    kernel_df
-
-
 # further ideas:
 #
 # - daily download habits:
 #   - are we downloading further spread of versions on specific days
 #   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
 #
-# - which kernels have been DL when? (simplified for semver)
 # - when did specific kernels enter the repos?
 #
 # - which arches are/were most prevalent over time?