Split the per-package frame out of df_lazy and reuse df_lazy for unique installs.

What this patch does, as far as the hunks below show:
  * The first cell no longer ends df_lazy with the "Packages" unnest; that
    select / fill_null(0) / unpivot chain now builds a separate df_pkg_lazy.
  * The weekly, weekday and month cells take df_pkg_lazy instead of df_lazy.
  * The unique-installs cell stops re-scanning data/daily/* and instead
    selects "date" and "UniqueInstalls" from the shared df_lazy.

Review notes (not applied to the patch content itself):
  * NOTE(review): the new signature "def _(df_lazy:pl.LazyFrame):" has no space
    after the colon, unlike every other cell signature in this patch.
  * NOTE(review): the removed scan applied .head(LIMIT_ROWS); whether the
    shared df_lazy applies the same limit is not visible in these hunks.
  * NOTE(review): the first cell presumably has to export df_pkg_lazy for the
    downstream cells to receive it; its return statement is outside the hunk.
  * Line structure and indentation were restored from a collapsed single-line
    copy; hunk line counts were checked, but leading whitespace should be
    confirmed against the real file before applying.

diff --git a/popcorn.py b/popcorn.py
index 826082c..3634939 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -118,7 +118,9 @@ def _():
             .str.to_date()
             .alias("date")
         )
-        .select("date", pl.col("Packages").struct.unnest())
+    )
+    df_pkg_lazy = (
+        df_lazy.select("date", pl.col("Packages").struct.unnest())
         .fill_null(0)
         .unpivot(index="date", variable_name="package", value_name="downloads")
     )
@@ -126,10 +128,10 @@ def _():
 
 
 @app.cell
-def _(df_lazy: pl.LazyFrame):
+def _(df_pkg_lazy: pl.LazyFrame):
     def _():
         weekly_downloads = (
-            df_lazy.sort("date")
+            df_pkg_lazy.sort("date")
             .group_by_dynamic("date", every="1w")
             .agg(pl.col("downloads").sum())
             .sort("date")
@@ -149,10 +151,10 @@ def _(df_lazy: pl.LazyFrame):
 
 
 @app.cell
-def _(df_lazy: pl.LazyFrame):
+def _(df_pkg_lazy: pl.LazyFrame):
     def _():
         weekday_downloads = (
-            df_lazy.sort("date")
+            df_pkg_lazy.sort("date")
             .with_columns(
                 pl.col("date")
                 .dt.weekday()
@@ -187,10 +189,10 @@ def _(df_lazy: pl.LazyFrame):
 
 
 @app.cell
-def _(df_lazy: pl.LazyFrame):
+def _(df_pkg_lazy: pl.LazyFrame):
     def _():
         month_agg_downloads = (
-            df_lazy.sort("date")
+            df_pkg_lazy.sort("date")
             .with_columns(pl.col("date").dt.month().alias("month"))
             .collect()
         )
@@ -208,19 +210,8 @@ def _(df_lazy: pl.LazyFrame):
 
 
 @app.cell
-def _():
-    df_unique_downloads = (
-        pl.scan_ndjson("data/daily/*", include_file_paths="file")
-        .head(LIMIT_ROWS)  # FIXME: take out after debug
-        .with_columns(
-            pl.col("file")
-            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date")
-        )
-        .select(["date", "UniqueInstalls"])
-        .collect()
-    )
+def _(df_lazy:pl.LazyFrame):
+    df_unique_downloads = df_lazy.select(["date", "UniqueInstalls"]).collect()
     (
         lp.ggplot(df_unique_downloads, lp.aes("date", "UniqueInstalls"))
         + lp.geom_line()