Change to streaming engine for lazy operations

This commit is contained in:
Marty Oehme 2025-09-30 11:13:30 +02:00
parent e393768d30
commit 07bd122aaf
Signed by: Marty
GPG key ID: 4E535BC19C61886E
3 changed files with 286 additions and 11 deletions

View file

@ -155,7 +155,7 @@ def _():
.head(LIMIT_ROWS) # FIXME: take out after debug
)
# give small df preview
df_pkg_lazy.head(100).collect()
df_pkg_lazy.head(100).collect(engine="streaming")
return
@ -187,7 +187,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
.sort("date")
)
return (
lp.ggplot(weekly_packages.collect(), lp.aes("date", "downloads"))
lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads"))
+ lp.geom_line()
+ lp.geom_smooth(method="loess")
+ lp.labs(
@ -233,7 +233,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
.alias("weekday")
)
return (
lp.ggplot(weekday_downloads.collect(), lp.aes("weekday", "downloads"))
lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
+ lp.geom_bar()
+ lp.labs(
title="Weekday downloads",
@ -252,7 +252,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
pl.col("date").dt.month().alias("month")
)
return (
lp.ggplot(month_agg_downloads.collect(), lp.aes("month", "downloads"))
lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
+ lp.geom_bar()
+ lp.labs(
title="Monthwise downloads",
@ -296,7 +296,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
lp.ggplot(
df_pkg_dl.sort("downloads", descending=True)
.head(DISPLAY_LIMIT)
.collect(),
.collect(engine="streaming"),
lp.aes("package", "downloads"),
)
+ lp.geom_bar(stat="identity")
@ -308,7 +308,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
df_pkg_dl.sort("downloads", descending=False)
# this seems arbitrary but gives a better result?
.head(DISPLAY_LIMIT)
.collect(),
.collect(engine="streaming"),
lp.aes("package", "downloads"),
)
+ lp.geom_bar(stat="identity")
@ -328,7 +328,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
def _(df_pkg_dl: pl.LazyFrame):
def _():
return (
lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads"))
+ lp.geom_freqpoly(stat="bin")
+ lp.labs(
title="Package installation count distribution",
@ -343,7 +343,7 @@ def _(df_pkg_dl: pl.LazyFrame):
def _(df_pkg_dl: pl.LazyFrame):
def _():
def get_num(df: pl.LazyFrame) -> int:
return df.count().collect().item(0, 0)
return df.count().collect(engine="streaming").item(0, 0)
one_install = df_pkg_dl.sort("downloads", descending=False).filter(
pl.col("downloads") == 1
@ -399,7 +399,7 @@ def _():
)
kernel_df_v99 = (
kernel_df_lazy.filter(pl.col("major_ver") == 99).collect().select("date")
kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date")
)
kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
@ -409,7 +409,7 @@ def _():
.group_by("major_ver")
.agg(pl.col("downloads").sum())
.sort("major_ver")
.collect(),
.collect(engine="streaming"),
lp.aes("major_ver", "downloads"),
)
+ lp.geom_bar(stat="identity")
@ -448,7 +448,7 @@ def _(kernel_df_lazy: pl.LazyFrame):
.sort("date")
.group_by_dynamic("date", every="1w", group_by="major_ver")
.agg(pl.col("downloads").sum())
.collect()
.collect(engine="streaming")
)
(