From ce38024569a5875a0d86727540f0e750ea4d18fb Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Mon, 29 Sep 2025 20:52:19 +0200 Subject: [PATCH] Add kernel analysis plots --- popcorn.py | 92 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/popcorn.py b/popcorn.py index a55100b..63af80a 100644 --- a/popcorn.py +++ b/popcorn.py @@ -305,21 +305,99 @@ def _(df_pkg_lazy: pl.LazyFrame): return +@app.cell(hide_code=True) +def _(): + mo.md(r""" ## Kernel Analysis """) + return + + # - which kernels have been DL when? (simplified for semver) @app.cell -def _(df_lazy): - kernel_df_lazy = df_lazy.select("date", "XuKernel") - kernel_df = ( - kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest()) +def _(): + kernel_df_lazy = ( + pl.scan_csv( + f"{DATA_CLEAN_DIR}/kernels/*.csv", + schema={ + "date": pl.Date, + "kernel": pl.String, + "downloads": pl.UInt16, + }, + ) .fill_null(0) - # .unpivot(index="date", variable_name="kernel", value_name="downloads") - .collect() + .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}")) + .with_columns( + pl.col("kernel") + .str.replace(r"(\d+).*", "${1}") + .str.to_integer(dtype=pl.UInt8) + .alias("major_ver"), + pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"), + ) + .head(LIMIT_ROWS) # FIXME: take out after debug ) - df_lazy.collect() + kernel_df_v99 = ( + kernel_df_lazy.filter(pl.col("major_ver") == 99).collect().select("date") + ) + + kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99) + ( + lp.ggplot( + kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String)) + .group_by("major_ver") + .agg(pl.col("downloads").sum()) + .sort("major_ver") + .collect(), + lp.aes("major_ver", "downloads"), + ) + + lp.geom_bar(stat="identity") + + lp.labs( + title="Kernel versions used", + caption="For each daily download, add up the currently running kernel version", + ) + ) + return + + +@app.cell(hide_code=True) +def _(kernel_df_v99: pl.DataFrame): + mo.md( + rf""" + + When looking at the kernel versions used, we see a very strong jump between major kernel version + 4 and major kernel version 5. + + For this analysis we had to exclude {kernel_df_v99.select(pl.len()).item()} rows which were + apparently from the future, as they were running variations of major kernel version 99. In all + likelihood there is a custom kernel version out there which reports its own major version as 99. + The strange version starts appearing on {kernel_df_v99.select("date").row(0)[0]} and shows up + all the way until {kernel_df_v99.select("date").row(-1)[0]}. + + """ + ) + return @app.cell +def _(kernel_df_lazy: pl.LazyFrame): + ( + lp.ggplot( + kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String)) + .select(["date", "major_ver", "downloads"]) + .sort("date") + .group_by_dynamic("date", every="1w", group_by="major_ver") + .agg(pl.col("downloads").sum()) + .collect(), + lp.aes("date", "downloads", color="major_ver"), + ) + + lp.geom_line() + + lp.labs( + title="Kernels over time", + caption="For each daily download, count used kernel versions", + ) + ) + + +@app.cell(hide_code=True) def _(): mo.md( r"""