From 06ee312c80dee9778d1cb211504d057f8a3fafe1 Mon Sep 17 00:00:00 2001
From: Marty Oehme <contact@martyoeh.me>
Date: Wed, 8 Oct 2025 20:30:39 +0200
Subject: [PATCH] Fix monthwise package accumulation

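Restrict the month-wise aggregation to the window from 2018-10-01
(inclusive) to 2025-10-01 (exclusive), i.e. exactly seven full years, so
that every calendar month is counted the same number of times.
Previously the data was not cut off at a whole number of years, so some
months were counted more often than others.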
---
 notebooks/popcorn.py | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py
index 28466c3..df90527 100644
--- a/notebooks/popcorn.py
+++ b/notebooks/popcorn.py
@@ -283,10 +283,13 @@ def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame):
 
 @app.cell
 def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
-    # FIXME: should be cut off after exact 12 months, or counts some months double
     def _():
         month_agg_downloads = (
             df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
+            .filter(
+                (pl.col("date") >= pl.datetime(2018, 10, 1))
+                & (pl.col("date") < pl.datetime(2025, 10, 1))
+            )
             .group_by("month")
             .agg(pl.col("count").sum())
         )
@@ -381,6 +384,7 @@ def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
     )
     return
 
+
 @app.cell(hide_code=True)
 def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
     def _():
@@ -395,6 +399,7 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
     _()
     return
 
+
 @app.cell
 def tab_percentiles(df_pkg_dl: pl.LazyFrame):
     def get_num(df: pl.LazyFrame) -> int:
@@ -409,13 +414,17 @@ def tab_percentiles(df_pkg_dl: pl.LazyFrame):
     twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
         (pl.col("count") >= 20) & (pl.col("count") < 30)
     )
-    thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30))
-    pl.DataFrame([
-        get_num(one_ten_installs),
-        get_num(ten_twenty_installs),
-        get_num(twenty_thirty),
-        get_num(thirty_plus),
-    ])
+    thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 30)
+    )
+    pl.DataFrame(
+        [
+            get_num(one_ten_installs),
+            get_num(ten_twenty_installs),
+            get_num(twenty_thirty),
+            get_num(thirty_plus),
+        ]
+    )
     return
 
 
@@ -576,13 +585,23 @@ def tab_missing_days(sizes_df: pl.DataFrame):
 
 @app.cell
 def plt_modified_times(sizes_df):
+    # Disregard this cell.
+    # It was originally used to find days where the given date (i.e. filename)
+    # diverged from the modification date (i.e. unix mdate). But, there is not
+    # too much new info to be gained that the missing days above don't already
+    # provide imo. Aside from many files being externally modified on one day,
+    # pointing to being moved/changed all at once as part of a server migration
+    # or update.
+    #
+    # With the current data, this information is unfortunately lost, as the
+    # files have now also been modified (attributes) within the dataset itself.
+    # An updated dataset could make use of this information as part of its records,
+    # potentially.
     def _():
         different_modification_date = sizes_df.filter(
             pl.col("date") != pl.col("modified").dt.date()
         )
-        # This does not work well what are we showing?
-        # 'true' capture date on X but then what on Y - the
-        # same date for each? the difference in dt?
+        return different_modification_date
         return (
             lp.ggplot(
                 different_modification_date,