Major performance improvements for weekly and monthly plots

Doing most of the aggregation in polars with the streaming engine
prevents memory overload (as compared to letting ggplot do it).
This commit is contained in:
Marty Oehme 2025-10-01 18:04:55 +02:00
parent 728ec37bda
commit 443c4c98cd
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -4,15 +4,11 @@ __generated_with = "0.16.2"
app = marimo.App(width="medium") app = marimo.App(width="medium")
with app.setup: with app.setup:
# Initialization code that runs before all other cells
import re
from pathlib import Path
import lets_plot as lp import lets_plot as lp
import marimo as mo import marimo as mo
import polars as pl import polars as pl
LIMIT_ROWS = 500_000 LIMIT_ROWS = False
DATA_DIR = "input/popcorn/output" DATA_DIR = "input/popcorn/output"
@ -120,7 +116,7 @@ def _():
@app.cell @app.cell
def _(): def _():
df_pkg_lazy = pl.scan_csv( df_versions_lazy = pl.scan_csv(
f"{DATA_DIR}/packages.csv", f"{DATA_DIR}/packages.csv",
schema={ schema={
"date": pl.Date, "date": pl.Date,
@ -129,6 +125,12 @@ def _():
"count": pl.UInt16, "count": pl.UInt16,
}, },
) )
df_pkg_lazy = (
df_versions_lazy.drop("version")
.group_by(["date", "package"])
.agg(pl.sum("count"))
.sort("date")
)
if LIMIT_ROWS: # NOTE: this is only for debugging purposes if LIMIT_ROWS: # NOTE: this is only for debugging purposes
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS) df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
# give small df preview # give small df preview
@ -157,12 +159,8 @@ def _():
@app.cell @app.cell
def _(df_pkg_lazy: pl.LazyFrame): def _(df_pkg_lazy: pl.LazyFrame):
def _(): def _():
weekly_packages = ( weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg(
df_pkg_lazy pl.col("count").sum()
# .sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("count").sum())
.sort("date")
) )
return ( return (
lp.ggplot( lp.ggplot(
@ -196,29 +194,32 @@ def _():
@app.cell @app.cell
def _(df_pkg_lazy: pl.LazyFrame): def _(df_pkg_lazy: pl.LazyFrame):
def _(): def _():
weekday_downloads = df_pkg_lazy.sort("date").with_columns( weekday_downloads = (
pl.col("date") df_pkg_lazy.with_columns(
.dt.weekday() pl.col("date")
.sort() .dt.weekday()
.replace_strict( .replace_strict(
{ {
1: "Mon", 1: "Mon",
2: "Tue", 2: "Tue",
3: "Wed", 3: "Wed",
4: "Thu", 4: "Thu",
5: "Fri", 5: "Fri",
6: "Sat", 6: "Sat",
7: "Sun", 7: "Sun",
} }
)
.alias("weekday")
) )
.alias("weekday") .group_by("weekday")
.agg(pl.col("count").sum())
) )
return ( return (
lp.ggplot( lp.ggplot(
weekday_downloads.collect(engine="streaming"), weekday_downloads.collect(engine="streaming"),
lp.aes("weekday", "count"), lp.aes("weekday", "count"),
) )
+ lp.geom_bar() + lp.geom_bar(stat="identity")
+ lp.labs( + lp.labs(
title="Ownership per weekday", title="Ownership per weekday",
caption="Package ownership per day of the week over all time", caption="Package ownership per day of the week over all time",
@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame):
@app.cell @app.cell
def _(df_pkg_lazy: pl.LazyFrame): def _(df_pkg_lazy: pl.LazyFrame):
def _(): def _():
month_agg_downloads = df_pkg_lazy.sort("date").with_columns( month_agg_downloads = (
pl.col("date").dt.month().alias("month") df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
.group_by("month")
.agg(pl.col("count").sum())
) )
return ( return (
lp.ggplot( lp.ggplot(
month_agg_downloads.collect(engine="streaming"), month_agg_downloads.collect(engine="streaming"),
lp.aes("month", "count"), lp.aes("month", "count"),
) )
+ lp.geom_bar() + lp.geom_bar(stat="identity")
+ lp.labs( + lp.labs(
title="Monthwise ownership", title="Monthwise ownership",
caption="Package ownership per month of the year over all time", caption="Package ownership per month of the year over all time",
@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame): def _(df_pkg_dl: pl.LazyFrame):
# TODO: this is horrible performance-wise
def _(): def _():
def get_num(df: pl.LazyFrame) -> int: def get_num(df: pl.LazyFrame) -> int:
return df.count().collect(engine="streaming").item(0, 0) return df.count().collect(engine="streaming").item(0, 0)