From 443c4c98cdd2151d1583016a05e528c9bf3a5dba Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 1 Oct 2025 18:04:55 +0200 Subject: [PATCH] Major performance improvements for weekly and monthly plots Doing most of the aggregation in polars with the streaming engine prevents memory overload (as compared to letting ggplot do it). --- notebooks/popcorn.py | 66 +++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py index 214b4ea..e3ee4ca 100644 --- a/notebooks/popcorn.py +++ b/notebooks/popcorn.py @@ -4,15 +4,11 @@ __generated_with = "0.16.2" app = marimo.App(width="medium") with app.setup: - # Initialization code that runs beimpofore all other cells - import re - from pathlib import Path - import lets_plot as lp import marimo as mo import polars as pl - LIMIT_ROWS = 500_000 + LIMIT_ROWS = False DATA_DIR = "input/popcorn/output" @@ -120,7 +116,7 @@ def _(): @app.cell def _(): - df_pkg_lazy = pl.scan_csv( + df_versions_lazy = pl.scan_csv( f"{DATA_DIR}/packages.csv", schema={ "date": pl.Date, @@ -129,6 +125,12 @@ def _(): "count": pl.UInt16, }, ) + df_pkg_lazy = ( + df_versions_lazy.drop("version") + .group_by(["date", "package"]) + .agg(pl.sum("count")) + .sort("date") + ) if LIMIT_ROWS: # NOTE: this is only for debugging purposes df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS) # give small df preview @@ -157,12 +159,8 @@ def _(): @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _(): - weekly_packages = ( - df_pkg_lazy - # .sort("date") - .group_by_dynamic("date", every="1w") - .agg(pl.col("count").sum()) - .sort("date") + weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg( + pl.col("count").sum() ) return ( lp.ggplot( @@ -196,29 +194,32 @@ def _(): @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _(): - weekday_downloads = df_pkg_lazy.sort("date").with_columns( - pl.col("date") - .dt.weekday() - .sort() - .replace_strict( - { - 1: "Mon", - 2: "Tue", - 3: "Wed", - 4: "Thu", - 5: "Fri", - 6: "Sat", - 7: "Sun", - } + weekday_downloads = ( + df_pkg_lazy.with_columns( + pl.col("date") + .dt.weekday() + .replace_strict( + { + 1: "Mon", + 2: "Tue", + 3: "Wed", + 4: "Thu", + 5: "Fri", + 6: "Sat", + 7: "Sun", + } + ) + .alias("weekday") ) - .alias("weekday") + .group_by("weekday") + .agg(pl.col("count").sum()) ) return ( lp.ggplot( weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "count"), ) - + lp.geom_bar() + + lp.geom_bar(stat="identity") + lp.labs( title="Ownership per weekday", caption="Package ownership per day of the week over all time", @@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame): @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _(): - month_agg_downloads = df_pkg_lazy.sort("date").with_columns( - pl.col("date").dt.month().alias("month") + month_agg_downloads = ( + df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month")) + .group_by("month") + .agg(pl.col("count").sum()) ) return ( lp.ggplot( month_agg_downloads.collect(engine="streaming"), lp.aes("month", "count"), ) - + lp.geom_bar() + + lp.geom_bar(stat="identity") + lp.labs( title="Monthwise ownership", caption="Package ownership per month of the year over all time", @@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame): @app.cell(hide_code=True) def _(df_pkg_dl: pl.LazyFrame): + # TODO: this is horrible performance-wise def _(): def get_num(df: pl.LazyFrame) -> int: return df.count().collect(engine="streaming").item(0, 0)