Major performance improvements for weekly, weekday and monthly plots

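Doing most of the aggregation in Polars with the streaming engine prevents memory overload (compared to passing the unaggregated rows to lets-plot and letting `geom_bar` aggregate them); the bar plots now receive pre-summed data and use `stat="identity"`.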
2025-10-01 18:04:55 +02:00 · 2025-10-01 18:04:55 +02:00 · 443c4c98cd
commit 443c4c98cd
parent 728ec37bda
1 changed files with 35 additions and 31 deletions
--- a/notebooks/popcorn.py
+++ b/notebooks/popcorn.py
@ -4,15 +4,11 @@ __generated_with = "0.16.2"
 app = marimo.App(width="medium")

 with app.setup:
-    # Initialization code that runs beimpofore all other cells
-    import re
-    from pathlib import Path
-
    import lets_plot as lp
    import marimo as mo
    import polars as pl

-    LIMIT_ROWS = 500_000
+    LIMIT_ROWS = False
    DATA_DIR = "input/popcorn/output"


@ -120,7 +116,7 @@ def _():

@app.cell
 def _():
-    df_pkg_lazy = pl.scan_csv(
+    df_versions_lazy = pl.scan_csv(
        f"{DATA_DIR}/packages.csv",
        schema={
            "date": pl.Date,
@ -129,6 +125,12 @@ def _():
            "count": pl.UInt16,
        },
    )
+    df_pkg_lazy = (
+        df_versions_lazy.drop("version")
+        .group_by(["date", "package"])
+        .agg(pl.sum("count"))
+        .sort("date")
+    )
    if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
        df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
    # give small df preview
@ -157,12 +159,8 @@ def _():
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        weekly_packages = (
-            df_pkg_lazy
-            # .sort("date")
-            .group_by_dynamic("date", every="1w")
-            .agg(pl.col("count").sum())
-            .sort("date")
+        weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg(
+            pl.col("count").sum()
        )
        return (
            lp.ggplot(
@ -196,10 +194,10 @@ def _():
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        weekday_downloads = df_pkg_lazy.sort("date").with_columns(
+        weekday_downloads = (
+            df_pkg_lazy.with_columns(
                pl.col("date")
                .dt.weekday()
-            .sort()
                .replace_strict(
                    {
                        1: "Mon",
@ -213,12 +211,15 @@ def _(df_pkg_lazy: pl.LazyFrame):
                )
                .alias("weekday")
            )
+            .group_by("weekday")
+            .agg(pl.col("count").sum())
+        )
        return (
            lp.ggplot(
                weekday_downloads.collect(engine="streaming"),
                lp.aes("weekday", "count"),
            )
-            + lp.geom_bar()
+            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Ownership per weekday",
                caption="Package ownership per day of the week over all time",
@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame):
@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
    def _():
-        month_agg_downloads = df_pkg_lazy.sort("date").with_columns(
-            pl.col("date").dt.month().alias("month")
+        month_agg_downloads = (
+            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
+            .group_by("month")
+            .agg(pl.col("count").sum())
        )
        return (
            lp.ggplot(
                month_agg_downloads.collect(engine="streaming"),
                lp.aes("month", "count"),
            )
-            + lp.geom_bar()
+            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Monthwise ownership",
                caption="Package ownership per month of the year over all time",
@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame):

@app.cell(hide_code=True)
 def _(df_pkg_dl: pl.LazyFrame):
+    # TODO: this is horrible performance-wise
    def _():
        def get_num(df: pl.LazyFrame) -> int:
            return df.count().collect(engine="streaming").item(0, 0)