Provide raw lazyframe

This commit is contained in:
Marty Oehme 2025-09-28 21:56:16 +02:00
parent 499a819c90
commit 4984289f69
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -118,7 +118,9 @@ def _():
.str.to_date()
.alias("date")
)
.select("date", pl.col("Packages").struct.unnest())
)
df_pkg_lazy = (
df_lazy.select("date", pl.col("Packages").struct.unnest())
.fill_null(0)
.unpivot(index="date", variable_name="package", value_name="downloads")
)
@ -126,10 +128,10 @@ def _():
@app.cell
def _(df_lazy: pl.LazyFrame):
def _(df_pkg_lazy: pl.LazyFrame):
def _():
weekly_downloads = (
df_lazy.sort("date")
df_pkg_lazy.sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("downloads").sum())
.sort("date")
@ -149,10 +151,10 @@ def _(df_lazy: pl.LazyFrame):
@app.cell
def _(df_lazy: pl.LazyFrame):
def _(df_pkg_lazy: pl.LazyFrame):
def _():
weekday_downloads = (
df_lazy.sort("date")
df_pkg_lazy.sort("date")
.with_columns(
pl.col("date")
.dt.weekday()
@ -187,10 +189,10 @@ def _(df_lazy: pl.LazyFrame):
@app.cell
def _(df_lazy: pl.LazyFrame):
def _(df_pkg_lazy: pl.LazyFrame):
def _():
month_agg_downloads = (
df_lazy.sort("date")
df_pkg_lazy.sort("date")
.with_columns(pl.col("date").dt.month().alias("month"))
.collect()
)
@ -208,19 +210,8 @@ def _(df_lazy: pl.LazyFrame):
@app.cell
def _():
df_unique_downloads = (
pl.scan_ndjson("data/daily/*", include_file_paths="file")
.head(LIMIT_ROWS) # FIXME: take out after debug
.with_columns(
pl.col("file")
.str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
.str.to_date()
.alias("date")
)
.select(["date", "UniqueInstalls"])
.collect()
)
def _(df_lazy:pl.LazyFrame):
df_unique_downloads = df_lazy.select(["date", "UniqueInstalls"]).collect()
(
lp.ggplot(df_unique_downloads, lp.aes("date", "UniqueInstalls"))
+ lp.geom_line()