From 728ec37bdab0d70bd4b3de1f2093aa23d7923ce4 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Wed, 1 Oct 2025 15:05:36 +0200 Subject: [PATCH] Modify popcorn notebook to run with new data structure --- notebooks/popcorn.py | 178 +++++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 91 deletions(-) diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py index 2361179..214b4ea 100644 --- a/notebooks/popcorn.py +++ b/notebooks/popcorn.py @@ -12,10 +12,8 @@ with app.setup: import marimo as mo import polars as pl - LIMIT_ROWS = 50_000 - DATA_RAW_DIR = "data/raw" - DATA_CLEAN_DIR = "data/cleaned" - DATA_PARQUET_DIR = "data/parquet" + LIMIT_ROWS = 500_000 + DATA_DIR = "input/popcorn/output" @app.cell(hide_code=True) @@ -29,45 +27,25 @@ def _(): return -# run data prep @app.cell def _(): - import clean - - clean.json_to_daily_pkg( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False - ) - clean.json_to_unique_csv( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False - ) - clean.json_to_daily_kernel_csv( - Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False - ) - - -@app.cell -def _(): - def parse_size(size_str): - try: - return float(re.search(r"(\d+.?\d+) kB", size_str).group(1)) # pyright: ignore[reportOptionalMemberAccess] - except AttributeError: - return None - - sizes_df_raw = ( - pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv") - .with_columns( - pl.col("name") - .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}") - .str.to_date() - .alias("date"), - pl.col("size") - .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32) - .alias("size_num"), + sizes_df = ( + pl.read_csv( + f"{DATA_DIR}/files.csv", + schema={ + "date": pl.Date, + "filename": pl.String, + "mtime": pl.Float32, + "filesize": pl.UInt32, + }, ) - .select(["date", "size_num", "size", "modified"]) + .with_columns( + (pl.col("filesize") / 1024).alias("filesize_kb"), + pl.from_epoch("mtime").alias("modified"), + ) 
+ .select(["date", "filesize", "filesize_kb", "modified"]) ) - sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null()) - return sizes_df, sizes_df_raw + return sizes_df @app.cell(hide_code=True) @@ -104,13 +82,14 @@ def _(): @app.cell def _(sizes_df): ( - lp.ggplot(sizes_df, lp.aes(x="date", y="size")) + lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb")) + lp.geom_point() + lp.geom_smooth(method="lm") + lp.labs( title="Size growth", subtitle="Size of daily popcorn statistics files over time", caption="Raw json file size, without any formatting, removal of markers, characters or newlines.", + y="filesize in kB", ) ) return @@ -141,20 +120,16 @@ def _(): @app.cell def _(): - df_pkg_lazy = ( - pl.scan_csv( - f"{DATA_CLEAN_DIR}/daily/*.csv", - include_file_paths="file", - schema={ - "date": pl.Date, - "package": pl.String, - "downloads": pl.UInt16, - }, - ) - .drop("file") - .fill_null(0) + df_pkg_lazy = pl.scan_csv( + f"{DATA_DIR}/packages.csv", + schema={ + "date": pl.Date, + "package": pl.String, + "version": pl.String, + "count": pl.UInt16, + }, ) - if LIMIT_ROWS: # NOTE: this is only for debugging purposes + if LIMIT_ROWS: # NOTE: this is only for debugging purposes df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS) # give small df preview df_pkg_lazy.head(100).collect(engine="streaming") @@ -183,18 +158,22 @@ def _(): def _(df_pkg_lazy: pl.LazyFrame): def _(): weekly_packages = ( - df_pkg_lazy.sort("date") + df_pkg_lazy + # .sort("date") .group_by_dynamic("date", every="1w") - .agg(pl.col("downloads").sum()) + .agg(pl.col("count").sum()) .sort("date") ) return ( - lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads")) + lp.ggplot( + weekly_packages.collect(engine="streaming"), lp.aes("date", "count") + ) + lp.geom_line() + lp.geom_smooth(method="loess") + lp.labs( title="Weekly package ownership", - caption="Count of all installed packages aggregated for each week", + subtitle="Count of all installed packages aggregated for each 
week", + y="number of packages", ) ) @@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame): .alias("weekday") ) return ( - lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads")) + lp.ggplot( + weekday_downloads.collect(engine="streaming"), + lp.aes("weekday", "count"), + ) + lp.geom_bar() + lp.labs( - title="Weekday downloads", - caption="Downloads aggregated per day of the week they took place.", + title="Ownership per weekday", + caption="Package ownership per day of the week over all time", ) ) @@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame): pl.col("date").dt.month().alias("month") ) return ( - lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads")) + lp.ggplot( + month_agg_downloads.collect(engine="streaming"), + lp.aes("month", "count"), + ) + lp.geom_bar() + lp.labs( - title="Monthwise downloads", - caption="Downloads aggregated per month of the year.", + title="Monthwise ownership", + caption="Package ownership per month of the year over all time", ) ) @@ -271,7 +256,7 @@ def _(): ( lp.ggplot( pl.read_csv( - f"{DATA_CLEAN_DIR}/unique_installs.csv", + f"{DATA_DIR}/unique_installs.csv", schema={"date": pl.Date, "unique": pl.UInt16}, ), lp.aes("date", "unique"), @@ -288,7 +273,7 @@ def _(): @app.cell def _(df_pkg_lazy: pl.LazyFrame): - df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()) + df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum()) def _(): DISPLAY_LIMIT = 20 @@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame): return lp.gggrid( [ lp.ggplot( - df_pkg_dl.sort("downloads", descending=True) + df_pkg_dl.sort("count", descending=True) .head(DISPLAY_LIMIT) .collect(engine="streaming"), - lp.aes("package", "downloads"), + lp.aes("package", "count"), ) + lp.geom_bar(stat="identity") + lp.labs( @@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame): caption="Most installed packages over all time", ), lp.ggplot( - df_pkg_dl.sort("downloads", 
descending=False) + df_pkg_dl.sort("count", descending=False) # this seems arbitrary but gives a better result? .head(DISPLAY_LIMIT) .collect(engine="streaming"), - lp.aes("package", "downloads"), + lp.aes("package", "count"), ) + lp.geom_bar(stat="identity") + lp.labs( @@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame): def _(df_pkg_dl: pl.LazyFrame): def _(): return ( - lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads")) + lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count")) + lp.geom_freqpoly(stat="bin") + lp.labs( title="Package installation count distribution", @@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame): def get_num(df: pl.LazyFrame) -> int: return df.count().collect(engine="streaming").item(0, 0) - one_install = df_pkg_dl.sort("downloads", descending=False).filter( - pl.col("downloads") == 1 + one_ten_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 1) & (pl.col("count") < 10) ) - two_installs = df_pkg_dl.sort("downloads", descending=False).filter( - (pl.col("downloads") >= 2) & (pl.col("downloads") < 10) + ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 10) & (pl.col("count") < 20) ) - three_installs = df_pkg_dl.sort("downloads", descending=False).filter( - (pl.col("downloads") >= 10) & (pl.col("downloads") < 20) + twenty_thirty = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 20) & (pl.col("count") < 30) + ) + thirty_plus = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 30) ) # TODO: Fix for new filters above return mo.md(rf""" - There are {get_num(one_install)} packages which have exactly a single - installation in the data, {get_num(two_installs)} packages with exactly - two installations, and {get_num(three_installs)} packages with exactly - three. 
+ There are {get_num(one_ten_installs):,} packages which have between one + and nine installations in the data, {get_num(ten_twenty_installs):,} + packages between ten and 19 installations, and + {get_num(twenty_thirty):,} packages between 20 and 29 installations. + {get_num(thirty_plus):,} packages have 30 or more installations. """) @@ -381,7 +370,7 @@ def _(): def _(): kernel_df_lazy = ( pl.scan_csv( - f"{DATA_CLEAN_DIR}/kernels/*.csv", + f"{DATA_DIR}/kernels.csv", schema={ "date": pl.Date, "kernel": pl.String, @@ -400,7 +389,9 @@ def _(): ) kernel_df_v99 = ( - kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date") + kernel_df_lazy.filter(pl.col("major_ver") == 99) + .collect(engine="streaming") + .select("date") ) kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99) @@ -509,9 +500,14 @@ def _(): @app.cell -def _(sizes_df_raw): - sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null()) - sizes_df_null.select(["date", "size"]).style.tab_header( +def _(sizes_df: pl.DataFrame): + date_range = pl.date_range( + sizes_df.select("date").min().item(), sizes_df.select("date").max().item() + ) + + pl.DataFrame().select(date_range).filter( + ~date_range.is_in(sizes_df["date"].implode()) + ).style.tab_header( title="Missing Days", subtitle="Days with 0B size due to missing on the popcorn server.", ) @@ -521,18 +517,16 @@ def _(sizes_df_raw): @app.cell def _(sizes_df): def _(): - different_modification_date = sizes_df.with_columns( - pl.col("modified") - .str.to_datetime(format="%F %T %:z", strict=False) - .alias("modified_dt"), - ).filter(pl.col("date") != pl.col("modified_dt").dt.date()) + different_modification_date = sizes_df.filter( + pl.col("date") != pl.col("modified").dt.date() + ) # This does not work well what are we showing? # 'true' capture date on X but then what on Y - the # same date for each? the difference in dt? 
return ( lp.ggplot( different_modification_date, - lp.aes("date", "modified_dt"), + lp.aes("date", "modified"), ) + lp.geom_freqpoly() ) @@ -543,6 +537,8 @@ def _(sizes_df): # further ideas: # +# _relative_ package amounts: absolute packages counts / absolute unique installs +# # - daily download habits: # - are we downloading further spread of versions on specific days # - are there 'update' days, where things converge? specific weekday/on holidays/etc?