Add kernel analysis plots

This commit is contained in:
Marty Oehme 2025-09-29 20:52:19 +02:00
parent 4d5aa73de7
commit ce38024569
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -305,21 +305,99 @@ def _(df_pkg_lazy: pl.LazyFrame):
return
@app.cell(hide_code=True)
def _():
    # Markdown heading that opens the kernel analysis section of the notebook.
    mo.md(
        r""" ## Kernel Analysis """
    )
    return
# - which kernels have been downloaded when? (simplified for semver)
# Kernel download cell: builds the per-kernel download frame, sets aside the
# rows reporting major version 99, and draws a bar chart of total downloads
# per major kernel version.
# NOTE(review): this span looks like diff residue -- it contains two
# consecutive `def _` headers and an unclosed `kernel_df = (` expression, so
# lines of an older and a newer version of the cell appear to be interleaved.
# Confirm against the actual notebook file before changing any code here.
@app.cell
def _(df_lazy):
kernel_df_lazy = df_lazy.select("date", "XuKernel")
kernel_df = (
kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest())
def _():
kernel_df_lazy = (
# Scan every CSV under DATA_CLEAN_DIR/kernels with an explicit
# date / kernel / downloads schema.
pl.scan_csv(
f"{DATA_CLEAN_DIR}/kernels/*.csv",
schema={
"date": pl.Date,
"kernel": pl.String,
"downloads": pl.UInt16,
},
)
.fill_null(0)
# .unpivot(index="date", variable_name="kernel", value_name="downloads")
# NOTE(review): `.collect()` appears mid-chain here, yet the result is
# later filtered and `.collect()`-ed again as if it were still lazy --
# verify which of these lines belong to the current version.
.collect()
# Cut the kernel string off after its first `major.minor.patch` match.
.with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
# Derive `major_ver` (first run of digits, parsed as UInt8) and
# `minor_ver` (the `major.minor` prefix, kept as a string).
.with_columns(
pl.col("kernel")
.str.replace(r"(\d+).*", "${1}")
.str.to_integer(dtype=pl.UInt8)
.alias("major_ver"),
pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
)
.head(LIMIT_ROWS) # FIXME: take out after debug
)
df_lazy.collect()
# Keep only the dates of the rows that report major version 99 ...
kernel_df_v99 = (
kernel_df_lazy.filter(pl.col("major_ver") == 99).collect().select("date")
)
# ... and exclude those rows from the frame used for the plots.
kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
# Bar chart: downloads summed per major version. `major_ver` is cast to
# String before grouping (presumably so the x axis is treated as discrete --
# TODO confirm); the sort therefore runs on the string values.
(
lp.ggplot(
kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
.group_by("major_ver")
.agg(pl.col("downloads").sum())
.sort("major_ver")
.collect(),
lp.aes("major_ver", "downloads"),
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Kernel versions used",
caption="For each daily download, add up the currently running kernel version",
)
)
return
@app.cell(hide_code=True)
def _(kernel_df_v99: pl.DataFrame):
    # Commentary on the major-version bar chart, including how many rows
    # reporting major version 99 were excluded and over which date range
    # they occur.
    #
    # The range is taken from the minimum and maximum of the `date` column
    # rather than from the first and last row: the text claims "starts
    # appearing on" / "until", which is only true for row 0 / row -1 if the
    # frame happens to be sorted by date. min()/max() also yield None instead
    # of failing when no row matched.
    _v99_dates = kernel_df_v99.get_column("date")
    mo.md(
        rf"""
    When looking at the kernel versions used, we see a very strong jump between major kernel version
    4 and major kernel version 5.
    For this analysis we had to exclude {kernel_df_v99.height} rows which were
    apparently from the future, as they were running variations of major kernel version 99. In all
    likelihood there is a custom kernel version out there which reports its own major version as 99.
    The strange version starts appearing on {_v99_dates.min()} and shows up
    all the way until {_v99_dates.max()}.
    """
    )
    return
@app.cell
def _(kernel_df_lazy: pl.LazyFrame):
    # Line chart of downloads over time, one line per major kernel version.
    # Intermediates use underscore-prefixed names so they stay local to this
    # cell; the final expression is the plot the cell displays.

    # Sum downloads into one-week windows, separately for each major version.
    _weekly_by_major = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
        .sort("date")
        .group_by_dynamic("date", every="1w", group_by="major_ver")
        .agg(pl.col("downloads").sum())
        .collect()
    )
    _mapping = lp.aes("date", "downloads", color="major_ver")
    _labels = lp.labs(
        title="Kernels over time",
        caption="For each daily download, count used kernel versions",
    )
    lp.ggplot(_weekly_by_major, _mapping) + lp.geom_line() + _labels
@app.cell(hide_code=True)
def _():
mo.md(
r"""