Major performance improvements for weekly and monthly plots

Doing most of the aggregation in polars with the streaming engine, instead of
handing the raw rows to ggplot and letting it aggregate, prevents memory overload.
This commit is contained in:
Marty Oehme 2025-10-01 18:04:55 +02:00
parent 728ec37bda
commit 443c4c98cd
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -4,15 +4,11 @@ __generated_with = "0.16.2"
app = marimo.App(width="medium")
with app.setup:
# Initialization code that runs before all other cells
import re
from pathlib import Path
import lets_plot as lp
import marimo as mo
import polars as pl
LIMIT_ROWS = 500_000
LIMIT_ROWS = False
DATA_DIR = "input/popcorn/output"
@ -120,7 +116,7 @@ def _():
@app.cell
def _():
df_pkg_lazy = pl.scan_csv(
df_versions_lazy = pl.scan_csv(
f"{DATA_DIR}/packages.csv",
schema={
"date": pl.Date,
@ -129,6 +125,12 @@ def _():
"count": pl.UInt16,
},
)
df_pkg_lazy = (
df_versions_lazy.drop("version")
.group_by(["date", "package"])
.agg(pl.sum("count"))
.sort("date")
)
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
# give small df preview
@ -157,12 +159,8 @@ def _():
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
def _():
weekly_packages = (
df_pkg_lazy
# .sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("count").sum())
.sort("date")
weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg(
pl.col("count").sum()
)
return (
lp.ggplot(
@ -196,10 +194,10 @@ def _():
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
def _():
weekday_downloads = df_pkg_lazy.sort("date").with_columns(
weekday_downloads = (
df_pkg_lazy.with_columns(
pl.col("date")
.dt.weekday()
.sort()
.replace_strict(
{
1: "Mon",
@ -213,12 +211,15 @@ def _(df_pkg_lazy: pl.LazyFrame):
)
.alias("weekday")
)
.group_by("weekday")
.agg(pl.col("count").sum())
)
return (
lp.ggplot(
weekday_downloads.collect(engine="streaming"),
lp.aes("weekday", "count"),
)
+ lp.geom_bar()
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Ownership per weekday",
caption="Package ownership per day of the week over all time",
@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame):
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
def _():
month_agg_downloads = df_pkg_lazy.sort("date").with_columns(
pl.col("date").dt.month().alias("month")
month_agg_downloads = (
df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
.group_by("month")
.agg(pl.col("count").sum())
)
return (
lp.ggplot(
month_agg_downloads.collect(engine="streaming"),
lp.aes("month", "count"),
)
+ lp.geom_bar()
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Monthwise ownership",
caption="Package ownership per month of the year over all time",
@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame):
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
# TODO: this is horrible performance-wise
def _():
def get_num(df: pl.LazyFrame) -> int:
return df.count().collect(engine="streaming").item(0, 0)