From 06ee312c80dee9778d1cb211504d057f8a3fafe1 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 8 Oct 2025 20:30:39 +0200 Subject: [PATCH] Fix monthwise package accumulation Cut off double-counted months from year. --- notebooks/popcorn.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py index 28466c3..df90527 100644 --- a/notebooks/popcorn.py +++ b/notebooks/popcorn.py @@ -283,10 +283,13 @@ def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame): @app.cell def plt_month_packages(df_pkg_lazy: pl.LazyFrame): - # FIXME: should be cut off after exact 12 months, or counts some months double def _(): month_agg_downloads = ( df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month")) + .filter( + (pl.col("date") >= pl.datetime(2018, 10, 1)) + & (pl.col("date") < pl.datetime(2025, 10, 1)) + ) .group_by("month") .agg(pl.col("count").sum()) ) @@ -381,6 +384,7 @@ def tab_rarest_packages(df_pkg_dl: pl.LazyFrame): ) return + @app.cell(hide_code=True) def plt_package_distribution(df_pkg_dl: pl.LazyFrame): def _(): @@ -395,6 +399,7 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame): _() return + @app.cell def tab_percentiles(df_pkg_dl: pl.LazyFrame): def get_num(df: pl.LazyFrame) -> int: @@ -409,13 +414,17 @@ def tab_percentiles(df_pkg_dl: pl.LazyFrame): twenty_thirty = df_pkg_dl.sort("count", descending=False).filter( (pl.col("count") >= 20) & (pl.col("count") < 30) ) - thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30)) - pl.DataFrame([ - get_num(one_ten_installs), - get_num(ten_twenty_installs), - get_num(twenty_thirty), - get_num(thirty_plus), - ]) + thirty_plus = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 30) + ) + pl.DataFrame( + [ + get_num(one_ten_installs), + get_num(ten_twenty_installs), + get_num(twenty_thirty), + get_num(thirty_plus), + ] + ) return @@ -576,13 +585,23 @@ def tab_missing_days(sizes_df: pl.DataFrame): @app.cell def plt_modified_times(sizes_df): + # Disregard this cell. + # It was originally used to find days where the given date (i.e. filename) + # diverged from the modification date (i.e. unix mdate). But, there is not + # too much new info to be gained that the missing days above don't already + # provide imo. Aside from many files being externally modified on one day, + # pointing to being moved/changed all at once as part of a server migration + # or update. + # + # With the current data, this information is unfortunately lost, as the + # files have now also been modified (attributes) within the dataset itself. + # An updated dataset could make use of this information as part of its records, + # potentially. def _(): different_modification_date = sizes_df.filter( pl.col("date") != pl.col("modified").dt.date() ) - # This does not work well what are we showing? - # 'true' capture date on X but then what on Y - the - # same date for each? the difference in dt? + return different_modification_date return ( lp.ggplot( different_modification_date,