diff --git a/notebooks/popcorn.py b/notebooks/popcorn.py index bf09e0e..145949a 100644 --- a/notebooks/popcorn.py +++ b/notebooks/popcorn.py @@ -83,7 +83,7 @@ def plt_filesize(sizes_df): + lp.geom_smooth(method="lm") + lp.labs( title="Size growth", - subtitle="Size of daily popcorn statistics files over time", + subtitle="Cumulative filesize of daily popcorn statistics over time", caption="Raw json file size, without any formatting, removal of markers, characters or newlines.", y="filesize in kB", ) @@ -115,7 +115,7 @@ def _(): @app.cell -def _(): +def tab_pkg(): df_versions_lazy = pl.scan_csv( f"{DATA_DIR}/packages.csv", schema={ @@ -157,7 +157,7 @@ def _(): @app.cell -def _(df_pkg_lazy: pl.LazyFrame): +def plt_weekly_packages(df_pkg_lazy: pl.LazyFrame): pkg_per_day = df_pkg_lazy.group_by("date").agg(pl.col("count").sum()).sort("date") def _(): @@ -222,7 +222,7 @@ def _(): @app.cell -def _(df_pkg_lazy: pl.LazyFrame): +def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame): def _(): weekday_downloads = ( df_pkg_lazy.with_columns( @@ -261,7 +261,8 @@ def _(df_pkg_lazy: pl.LazyFrame): @app.cell -def _(df_pkg_lazy: pl.LazyFrame): +def plt_month_packages(df_pkg_lazy: pl.LazyFrame): + # FIXME: should be cut off after exact 12 months, or counts some months double def _(): month_agg_downloads = ( df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month")) @@ -285,7 +286,7 @@ def _(df_pkg_lazy: pl.LazyFrame): @app.cell -def _(): +def plt_unique_installs(): df_unique_installs = pl.read_csv( f"{DATA_DIR}/unique_installs.csv", schema={"date": pl.Date, "unique": pl.UInt16}, @@ -306,7 +307,7 @@ def _(): @app.cell -def _(df_pkg_lazy: pl.LazyFrame): +def plt_top_packages(df_pkg_lazy: pl.LazyFrame): df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum()) def _(): @@ -346,7 +347,7 @@ def _(df_pkg_lazy: pl.LazyFrame): @app.cell(hide_code=True) -def _(df_pkg_dl: pl.LazyFrame): +def plt_package_distribution(df_pkg_dl: pl.LazyFrame): def _(): return ( 
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count")) @@ -360,39 +361,6 @@ def _(df_pkg_dl: pl.LazyFrame): return -@app.cell(hide_code=True) -def _(df_pkg_dl: pl.LazyFrame): - # TODO: this is horrible performance-wise - def _(): - def get_num(df: pl.LazyFrame) -> int: - return df.count().collect(engine="streaming").item(0, 0) - - one_ten_installs = df_pkg_dl.sort("count", descending=False).filter( - (pl.col("count") >= 1) & (pl.col("count") < 10) - ) - ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter( - (pl.col("count") >= 10) & (pl.col("count") < 20) - ) - twenty_thirty = df_pkg_dl.sort("count", descending=False).filter( - (pl.col("count") >= 20) & (pl.col("count") < 30) - ) - thirty_plus = df_pkg_dl.sort("count", descending=False).filter( - (pl.col("count") >= 30) - ) - # TODO: Fix for new filters above - return mo.md(rf""" - - There are {get_num(one_ten_installs):,} packages which have between one - and ten installations in the data, {get_num(ten_twenty_installs):,} - packages between eleven and 20 installations, and - {get_num(twenty_thirty):,} packages between 21 and 30 installations. - {get_num(thirty_plus):,} packages have over 30 installations. - - """) - - _() - return - @app.cell(hide_code=True) def _(): @@ -402,7 +370,7 @@ def _(): # - which kernels have been DL when? 
(simplified for semver) @app.cell -def _(): +def plt_kernel_versions(): kernel_df_lazy = ( pl.scan_csv( f"{DATA_DIR}/kernels.csv", @@ -468,7 +436,7 @@ def _(kernel_df_v99: pl.DataFrame): @app.cell -def _(kernel_df_lazy: pl.LazyFrame): +def plt_kernel_timeline(kernel_df_lazy: pl.LazyFrame): weekly_kernel_df = ( kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String)) .select(["date", "major_ver", "downloads"]) @@ -535,7 +503,7 @@ def _(): @app.cell -def _(sizes_df: pl.DataFrame): +def tab_missing_days(sizes_df: pl.DataFrame): date_range = pl.date_range( sizes_df.select("date").min().item(), sizes_df.select("date").max().item() ) @@ -550,7 +518,7 @@ def _(sizes_df: pl.DataFrame): @app.cell -def _(sizes_df): +def plt_modified_times(sizes_df): def _(): different_modification_date = sizes_df.filter( pl.col("date") != pl.col("modified").dt.date() diff --git a/popcorn.qmd b/popcorn.qmd index 6e423a1..de468f7 100644 --- a/popcorn.qmd +++ b/popcorn.qmd @@ -1,22 +1,220 @@ --- -title: "Popcorn analysis" +title: "Voidlinux popcorn" +subtitle: "Analysis of voidlinux package and kernel statistics" --- -## Quarto - -Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see . +This notebook analyses the daily package repository statistics files, +colloquially known as 'popcorn' files, that are generated by the Void Linux +package manager `xbps` and uploaded by users who have opted in to share. 
```{python} +# | echo: false +import os +from typing import Any, Awaitable, Mapping + +import lets_plot as lp +import polars as pl from lets_plot import LetsPlot +from marimo import Cell + + +def run_cell(cell: Cell) -> tuple[Any, Mapping[str, Any]]: + ret = cell.run() + if isinstance(ret, Awaitable): + raise NotImplementedError + else: + output, defs = ret + return (output, defs) + + +fig_width, fig_height = ( + int(os.getenv("QUARTO_FIG_WIDTH") or 7), + int(os.getenv("QUARTO_FIG_HEIGHT") or 5), +) + + +def pplot(cell: Cell) -> Any: + outp, _ = run_cell(cell) + return ( + outp + + lp.flavor_darcula() + + lp.ggsize(width=fig_width * 1000, height=fig_height * 1000) + ) + + LetsPlot.setup_html() ``` -Testing plot +## Daily statistics file size + +The simplest operation we can do is look at the overall file size for each of the daily +statistics files over time. The files consist of a long list of packages which have been checked +from the repositories that day, along with the number of package instances. It also consists of +the same list separated by specifically installed versions of packages, so if somebody has +v0.9.1 and somebody else v0.9.3 instead this would count both packages separately. + +Another count is the number of different Kernels that have been used on that day, with their +exact kernel name including major version, minor version and any suffix. + +These are the major things that will lead to size increases in the file, but not just for an +increased amount of absolute users, packages or uploads --- we will get to those shortly. + +No, an increase in file size here mainly suggests an increase in the 'breadth' of files on offer +in the repository, whether that be a wider variety of program versions or more different +packages that people are interested in, and those that the community chooses to use. 
+ +So while the overall amount of packages gives a general estimate of the interest in the +distribution, this can show a more 'distributor'-aligned view on how many different aisles of +the buffet people are eating from. ```{python} -#| column: page +# | echo: true from notebooks.popcorn import plt_filesize -outp, defs = plt_filesize.run() +pplot(plt_filesize) +``` + +As we can see, the difference over time is massive. Especially early on, between 2019 and the +start of 2021, the amount of different packages and package versions used grew rapidly, with the +pace picking up once again starting 2023. + +There are a few outlier days with a size of 0 kB, which we will remove from the data. In all +likelihood, those days were not reported correctly or there was some kind of issue on the +backend so the stats for those days are lost. + +There are also a few days where the modification date of the file does not correspond to the +represented statistical date but those are kept. This rather points to certain times when the +files have been moved on the backend, or recreated externally but does not mean the data are +bad. + +```{python} +from notebooks.popcorn import tab_pkg +outp, defs = tab_pkg.run() +outp +``` + +## Package statistics + +Now that we have an idea of how the overall interest in the distribution has changed over time, +let's look at the actual package statistics. + +The popcorn files contain two main pieces of information: the number of installs per package +(e.g. how many people have rsync installed) and the number of unique installs (i.e. unique +machines providing statistics). We will look at both of these in turn. + +```{python} +from notebooks.popcorn import plt_weekly_packages +pplot(plt_weekly_packages) +``` + +```{python} +from notebooks.popcorn import plt_pkg_relative +pplot(plt_pkg_relative) +``` + +The amount of packages installed on all machines increases strongly over time. 
+ +```{python} +from notebooks.popcorn import plt_weekday_packages +pplot(plt_weekday_packages) +``` + +```{python} +from notebooks.popcorn import plt_month_packages +pplot(plt_month_packages) +``` + +```{python} +from notebooks.popcorn import plt_top_packages +pplot(plt_top_packages) +``` + +```{python} +from notebooks.popcorn import plt_package_distribution +pplot(plt_package_distribution) +``` + +```{python} +from notebooks.popcorn import plt_top_packages +_, defs = plt_top_packages.run() +df_pkg_dl = defs["df_pkg_dl"] +def get_num(df: pl.LazyFrame) -> int: + return df.count().collect(engine="streaming").item(0, 0) + +one_ten_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 1) & (pl.col("count") < 10) +) +ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 10) & (pl.col("count") < 20) +) +twenty_thirty = df_pkg_dl.sort("count", descending=False).filter( + (pl.col("count") >= 20) & (pl.col("count") < 30) +) +thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30)) +``` + +There are `{python} f"{get_num(one_ten_installs):,}"` packages which have between one +and nine installations in the data, `{python} f"{get_num(ten_twenty_installs):,}"` +packages between ten and 19 installations, and +`{python} f"{get_num(twenty_thirty):,}"` packages between 20 and 29 installations. +`{python} f"{get_num(thirty_plus):,}"` packages have 30 or more installations. + +## Kernel Analysis + +```{python} +from notebooks.popcorn import plt_kernel_versions +pplot(plt_kernel_versions) +``` + +When looking at the kernel versions used, we see a very strong jump between major kernel version +4 and major kernel version 5. + +For this analysis we had to exclude {kernel_df_v99.select(pl.len()).item()} rows which were +apparently from the future, as they were running variations of major kernel version 99.
In all +likelihood there is a custom kernel version out there which reports its own major version as 99. +The strange version starts appearing on {kernel_df_v99.select("date").row(0)[0]} and shows up +all the way until {kernel_df_v99.select("date").row(-1)[0]}. + +```{python} +from notebooks.popcorn import plt_kernel_timeline +pplot(plt_kernel_timeline) +``` + +```{python} +from datetime import date +from notebooks.popcorn import plt_kernel_timeline +_, defs = plt_kernel_timeline.run() +weekly_kernel_df = defs["weekly_kernel_df"] + +last_kernel4: date = weekly_kernel_df.filter(pl.col("major_ver") == "4")[-1][ + "date" +].item() +first_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[0][ + "date" +].item() +last_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[-1][ + "date" +].item() +``` + +A timeline analysis of the kernels used to report daily downloads shows that people generally +adopt new major kernel versions at roughly the same time. This change is especially stark between +major kernel versions 5 and 6, which seem to have traded places in usage almost overnight. + +The first time that major version 5 of the kernel shows up is on `{python} f"{first_kernel5}"`. From here, it +took a long time for the last of the version 4 kernels to disappear, coinciding with the big +switch between major version 5 and 6. The last time a major version 4 is seen is on +`{python} f"{last_kernel4}"`, while the last major version 5 kernels still pop up. +It would seem, then, that the people still running kernel version 4 used the opportunity of +everybody switching to the stable version of 6 to also upgrade their machines. + +## Odds and Ends + +There are some missing days in the statistics.
+ +```{python} +from notebooks.popcorn import tab_missing_days + +outp, defs = tab_missing_days.run() outp ``` @@ -46,3 +244,10 @@ outp - things we can't see (limitations) - packages on offer in the repositories - this could shed light on the bumps of users and relative package ownership + +Modified date != descriptive (named) date + +```{python} +from notebooks.popcorn import plt_modified_times +pplot(plt_modified_times) +```