diff --git a/popcorn.py b/popcorn.py
index d095f4d..cb09ba5 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -6,12 +6,15 @@ app = marimo.App(width="medium")
 with app.setup:
     # Initialization code that runs beimpofore all other cells
     import re
+    from pathlib import Path
 
     import lets_plot as lp
     import marimo as mo
     import polars as pl
 
-    LIMIT_ROWS = 200
+    LIMIT_ROWS = 500_000
+    DATA_RAW_DIR = "data/raw"
+    DATA_CLEAN_DIR = "data/cleaned"
 
 
 @app.cell(hide_code=True)
@@ -25,6 +28,16 @@ def _():
     return
 
 
+# run data prep
+@app.cell
+def _():
+    import clean
+
+    clean.json_to_daily_pkg(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
+    )
+
+
 @app.cell
 def _():
     def parse_size(size_str):
@@ -34,7 +47,7 @@ def _():
         return None
 
     sizes_df_raw = (
-        pl.read_csv("data/file_sizes.csv")
+        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
         .with_columns(
             pl.col("name")
             .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
@@ -118,21 +131,21 @@ def _():
 
 @app.cell
 def _():
-    df_lazy = (
-        pl.scan_ndjson("data/daily/*", include_file_paths="file")
-        .head(LIMIT_ROWS)  # FIXME: take out after debug
-        .with_columns(
-            pl.col("file")
-            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date")
-        )
-    )
     df_pkg_lazy = (
-        df_lazy.select("date", pl.col("Packages").struct.unnest())
+        pl.scan_csv(
+            f"{DATA_CLEAN_DIR}/daily/*.csv",
+            include_file_paths="file",
+            schema={
+                "date": pl.Date,
+                "package": pl.String,
+                "downloads": pl.UInt16,
+            },
+        )
+        .drop("file")
         .fill_null(0)
-        .unpivot(index="date", variable_name="package", value_name="downloads")
+        .head(LIMIT_ROWS)  # FIXME: take out after debug
     )
+    df_pkg_lazy.collect()
     return
 
 
@@ -281,6 +294,29 @@ def _(df_pkg_lazy: pl.LazyFrame):
     return
 
 
+# - which kernels have been DL when? (simplified for semver)
+@app.cell
+def _():
+    # The cleaned CSVs only hold per-package counts, so kernel data is read
+    # from the raw daily JSON. The `df_lazy` frame that used to provide it is
+    # removed by this same change, so the scan lives in this cell.
+    # NOTE(review): assumes each raw file is named <YYYY-MM-DD>.json -- confirm.
+    kernel_df = (
+        pl.scan_ndjson(f"{DATA_RAW_DIR}/daily/*", include_file_paths="file")
+        .select(
+            pl.col("file")
+            .str.extract(r"(\d{4}-\d{2}-\d{2})\.json$")
+            .str.to_date()
+            .alias("date"),
+            pl.col("XuKernel").struct.unnest(),
+        )
+        .fill_null(0)
+        # .unpivot(index="date", variable_name="kernel", value_name="downloads")
+        .collect()
+    )
+    kernel_df
+
+
 @app.cell
 def _():
     mo.md(
@@ -325,19 +361,12 @@ def _(sizes_df):
     return
 
 
-@app.cell
-def _(df_lazy):
-    kernel_df = df_lazy.select("date", pl.col("Kernels").struct.unnest())
-    kernel_df
-
-
 # further ideas:
 #
 # - daily download habits:
 #   - are we downloading further spread of versions on specific days
 #   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
 #
-# - which kernels have been DL when? (simplified for semver)
 # - when did specific kernels enter the repos?
 #
 # - which arches are/were most prevalent over time?