Major performance improvements for weekly and monthly plots

Doing most of the aggregation in polars with the streaming engine prevents memory overload (as compared to letting ggplot do it).
2025-10-01 18:04:55 +02:00 · 2025-10-01 18:04:55 +02:00 · 443c4c98cd
commit 443c4c98cd
parent 728ec37bda
1 changed files with 35 additions and 31 deletions
--- a/notebooks/popcorn.py
+++ b/notebooks/popcorn.py
@ -4,15 +4,11 @@ __generated_with = "0.16.2"
 app = marimo.App(width="medium")
 with app.setup:
    # Initialization code that runs before all other cells
    import re
    from pathlib import Path
    import lets_plot as lp
    import marimo as mo
    import polars as pl
-    LIMIT_ROWS = 500_000
+    LIMIT_ROWS = False
    DATA_DIR = "input/popcorn/output"
@ -120,7 +116,7 @@ def _():
@app.cell
 def _():
-    df_pkg_lazy = pl.scan_csv(
+    df_versions_lazy = pl.scan_csv(
        f"{DATA_DIR}/packages.csv",
        schema={
            "date": pl.Date,
@ -129,6 +125,12 @@ def _():
            "count": pl.UInt16,
        },
    )
    df_pkg_lazy = (
        df_versions_lazy.drop("version")
        .group_by(["date", "package"])
        .agg(pl.sum("count"))
        .sort("date")
    )
    if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
        df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
    # give small df preview
@ -157,12 +159,8 @@ def _():
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        weekly_packages = (
+        weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg(
-            df_pkg_lazy
+            pl.col("count").sum()
            # .sort("date")
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("count").sum())
            .sort("date")
        )
        return (
            lp.ggplot(
@ -196,29 +194,32 @@ def _():
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        weekday_downloads = df_pkg_lazy.sort("date").with_columns(
+        weekday_downloads = (
-            pl.col("date")
+            df_pkg_lazy.with_columns(
-            .dt.weekday()
+                pl.col("date")
-            .sort()
+                .dt.weekday()
-            .replace_strict(
+                .replace_strict(
-                {
+                    {
-                    1: "Mon",
+                        1: "Mon",
-                    2: "Tue",
+                        2: "Tue",
-                    3: "Wed",
+                        3: "Wed",
-                    4: "Thu",
+                        4: "Thu",
-                    5: "Fri",
+                        5: "Fri",
-                    6: "Sat",
+                        6: "Sat",
-                    7: "Sun",
+                        7: "Sun",
-                }
+                    }
                )
                .alias("weekday")
            )
-            .alias("weekday")
+            .group_by("weekday")
            .agg(pl.col("count").sum())
        )
        return (
            lp.ggplot(
                weekday_downloads.collect(engine="streaming"),
                lp.aes("weekday", "count"),
            )
-            + lp.geom_bar()
+            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Ownership per weekday",
                caption="Package ownership per day of the week over all time",
@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame):
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        month_agg_downloads = df_pkg_lazy.sort("date").with_columns(
+        month_agg_downloads = (
-            pl.col("date").dt.month().alias("month")
+            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
            .group_by("month")
            .agg(pl.col("count").sum())
        )
        return (
            lp.ggplot(
                month_agg_downloads.collect(engine="streaming"),
                lp.aes("month", "count"),
            )
-            + lp.geom_bar()
+            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Monthwise ownership",
                caption="Package ownership per month of the year over all time",
@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame):
@app.cell(hide_code=True)
 def _(df_pkg_dl: pl.LazyFrame):
    # TODO: this is horrible performance-wise
    def _():
        def get_num(df: pl.LazyFrame) -> int:
            return df.count().collect(engine="streaming").item(0, 0)