Turn main dataframe into lazyframe

This commit is contained in:
Marty Oehme 2025-09-28 21:08:48 +02:00
parent 07c45ca205
commit 86b3659f0f
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -109,7 +109,7 @@ def _():
@app.cell @app.cell
def _(): def _():
df = ( df_lazy = (
pl.scan_ndjson("data/daily/*", include_file_paths="file") pl.scan_ndjson("data/daily/*", include_file_paths="file")
.head(LIMIT_ROWS) # FIXME: take out after debug .head(LIMIT_ROWS) # FIXME: take out after debug
.with_columns( .with_columns(
@ -121,22 +121,21 @@ def _():
.select("date", pl.col("Packages").struct.unnest()) .select("date", pl.col("Packages").struct.unnest())
.fill_null(0) .fill_null(0)
.unpivot(index="date", variable_name="package", value_name="downloads") .unpivot(index="date", variable_name="package", value_name="downloads")
.collect()
) )
df df_lazy
return return
@app.cell @app.cell
def _(df: pl.DataFrame): def _(df_lazy: pl.LazyFrame):
def _(): def _():
weekly_downloads = ( weekly_downloads = (
df.sort("date") df_lazy.sort("date")
.group_by_dynamic("date", every="1w") .group_by_dynamic("date", every="1w")
.agg(pl.col("downloads").sum()) .agg(pl.col("downloads").sum())
.sort("date") .sort("date")
.collect()
) )
return ( return (
lp.ggplot(weekly_downloads, lp.aes("date", "downloads")) lp.ggplot(weekly_downloads, lp.aes("date", "downloads"))
+ lp.geom_line() + lp.geom_line()
@ -145,30 +144,33 @@ def _(df: pl.DataFrame):
title="Weekly downloads", title="Weekly downloads",
) )
) )
_() _()
return return
@app.cell @app.cell
def _(df: pl.DataFrame): def _(df_lazy: pl.LazyFrame):
def _(): def _():
weekday_downloads = df.sort("date").with_columns( weekday_downloads = (
pl.col("date") df_lazy.sort("date")
.dt.weekday() .with_columns(
.sort() pl.col("date")
.replace_strict( .dt.weekday()
{ .sort()
1: "Mon", .replace_strict(
2: "Tue", {
3: "Wed", 1: "Mon",
4: "Thu", 2: "Tue",
5: "Fri", 3: "Wed",
6: "Sat", 4: "Thu",
7: "Sun", 5: "Fri",
} 6: "Sat",
7: "Sun",
}
)
.alias("weekday")
) )
.alias("weekday") .collect()
) )
return ( return (
@ -179,7 +181,6 @@ def _(df: pl.DataFrame):
caption="Downloads aggregated per day of the week they took place.", caption="Downloads aggregated per day of the week they took place.",
) )
) )
_() _()
return return