diff --git a/popcorn.py b/popcorn.py index e1dd707..7a65155 100644 --- a/popcorn.py +++ b/popcorn.py @@ -109,7 +109,7 @@ def _(): @app.cell def _(): - df = ( + df_lazy = ( pl.scan_ndjson("data/daily/*", include_file_paths="file") .head(LIMIT_ROWS) # FIXME: take out after debug .with_columns( @@ -121,22 +121,21 @@ def _(): .select("date", pl.col("Packages").struct.unnest()) .fill_null(0) .unpivot(index="date", variable_name="package", value_name="downloads") - .collect() ) - df + df_lazy return @app.cell -def _(df: pl.DataFrame): +def _(df_lazy: pl.LazyFrame): def _(): weekly_downloads = ( - df.sort("date") + df_lazy.sort("date") .group_by_dynamic("date", every="1w") .agg(pl.col("downloads").sum()) .sort("date") + .collect() ) - return ( lp.ggplot(weekly_downloads, lp.aes("date", "downloads")) + lp.geom_line() @@ -145,30 +144,33 @@ def _(df: pl.DataFrame): title="Weekly downloads", ) ) - _() return @app.cell -def _(df: pl.DataFrame): +def _(df_lazy: pl.LazyFrame): def _(): - weekday_downloads = df.sort("date").with_columns( - pl.col("date") - .dt.weekday() - .sort() - .replace_strict( - { - 1: "Mon", - 2: "Tue", - 3: "Wed", - 4: "Thu", - 5: "Fri", - 6: "Sat", - 7: "Sun", - } + weekday_downloads = ( + df_lazy.sort("date") + .with_columns( + pl.col("date") + .dt.weekday() + .sort() + .replace_strict( + { + 1: "Mon", + 2: "Tue", + 3: "Wed", + 4: "Thu", + 5: "Fri", + 6: "Sat", + 7: "Sun", + } + ) + .alias("weekday") ) - .alias("weekday") + .collect() ) return ( @@ -179,7 +181,6 @@ def _(df: pl.DataFrame): caption="Downloads aggregated per day of the week they took place.", ) ) - _() return