Start transferring to quarto file

2025-10-07 21:56:54 +02:00 · 2025-10-07 21:56:54 +02:00 · cfc8ecc4fd
commit cfc8ecc4fd
parent c51970507d
2 changed files with 225 additions and 52 deletions
--- a/notebooks/popcorn.py
+++ b/notebooks/popcorn.py
@ -83,7 +83,7 @@ def plt_filesize(sizes_df):
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
-            subtitle="Size of daily popcorn statistics files over time",
+            subtitle="Cumulative filesize of daily popcorn statistics over time",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in kB",
        )
@ -115,7 +115,7 @@ def _():


@app.cell
-def _():
+def tab_pkg():
    df_versions_lazy = pl.scan_csv(
        f"{DATA_DIR}/packages.csv",
        schema={
@ -157,7 +157,7 @@ def _():


@app.cell
-def _(df_pkg_lazy: pl.LazyFrame):
+def plt_weekly_packages(df_pkg_lazy: pl.LazyFrame):
    pkg_per_day = df_pkg_lazy.group_by("date").agg(pl.col("count").sum()).sort("date")

    def _():
@ -222,7 +222,7 @@ def _():


@app.cell
-def _(df_pkg_lazy: pl.LazyFrame):
+def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame):
    def _():
        weekday_downloads = (
            df_pkg_lazy.with_columns(
@ -261,7 +261,8 @@ def _(df_pkg_lazy: pl.LazyFrame):


@app.cell
-def _(df_pkg_lazy: pl.LazyFrame):
+def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
+    # FIXME: should be cut off after exact 12 months, or counts some months double
    def _():
        month_agg_downloads = (
            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
@ -285,7 +286,7 @@ def _(df_pkg_lazy: pl.LazyFrame):


@app.cell
-def _():
+def plt_unique_installs():
    df_unique_installs = pl.read_csv(
        f"{DATA_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
@ -306,7 +307,7 @@ def _():


@app.cell
-def _(df_pkg_lazy: pl.LazyFrame):
+def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())

    def _():
@ -346,7 +347,7 @@ def _(df_pkg_lazy: pl.LazyFrame):


@app.cell(hide_code=True)
-def _(df_pkg_dl: pl.LazyFrame):
+def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
    def _():
        return (
            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
@ -360,39 +361,6 @@ def _(df_pkg_dl: pl.LazyFrame):
    return


-@app.cell(hide_code=True)
-def _(df_pkg_dl: pl.LazyFrame):
-    # TODO: this is horrible performance-wise
-    def _():
-        def get_num(df: pl.LazyFrame) -> int:
-            return df.count().collect(engine="streaming").item(0, 0)
-
-        one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
-            (pl.col("count") >= 1) & (pl.col("count") < 10)
-        )
-        ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
-            (pl.col("count") >= 10) & (pl.col("count") < 20)
-        )
-        twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
-            (pl.col("count") >= 20) & (pl.col("count") < 30)
-        )
-        thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
-            (pl.col("count") >= 30)
-        )
-        # TODO: Fix for new filters above
-        return mo.md(rf"""
-
-        There are {get_num(one_ten_installs):,} packages which have between one
-        and ten installations in the data, {get_num(ten_twenty_installs):,}
-        packages between eleven and 20 installations, and
-        {get_num(twenty_thirty):,} packages between 21 and 30 installations.
-        {get_num(thirty_plus):,} packages have over 30 installations.
-
-        """)
-
-    _()
-    return
-

@app.cell(hide_code=True)
 def _():
@ -402,7 +370,7 @@ def _():

 # - which kernels have been DL when? (simplified for semver)
@app.cell
-def _():
+def plt_kernel_versions():
    kernel_df_lazy = (
        pl.scan_csv(
            f"{DATA_DIR}/kernels.csv",
@ -468,7 +436,7 @@ def _(kernel_df_v99: pl.DataFrame):


@app.cell
-def _(kernel_df_lazy: pl.LazyFrame):
+def plt_kernel_timeline(kernel_df_lazy: pl.LazyFrame):
    weekly_kernel_df = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
@ -535,7 +503,7 @@ def _():


@app.cell
-def _(sizes_df: pl.DataFrame):
+def tab_missing_days(sizes_df: pl.DataFrame):
    date_range = pl.date_range(
        sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
    )
@ -550,7 +518,7 @@ def _(sizes_df: pl.DataFrame):


@app.cell
-def _(sizes_df):
+def plt_modified_times(sizes_df):
    def _():
        different_modification_date = sizes_df.filter(
            pl.col("date") != pl.col("modified").dt.date()
--- a/popcorn.qmd
+++ b/popcorn.qmd
@ -1,22 +1,220 @@
 ---
-title: "Popcorn analysis"
+title: "Voidlinux popcorn"
+subtitle: "Analysis of voidlinux package and kernel statistics"
 ---

-## Quarto
-
-Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see <https://quarto.org>.
+This notebook analyses the daily package repository statistics files,
+colloquially known as 'popcorn' files, that are generated by the Void Linux
+package manager `xbps` and uploaded by users who have opted in to share.

 ```{python}
+# | echo: false
+import os
+from typing import Any, Awaitable, Mapping
+
+import lets_plot as lp
+import polars as pl
 from lets_plot import LetsPlot
+from marimo import Cell
+
+
+def run_cell(cell: Cell) -> tuple[Any, Mapping[str, Any]]:
+    ret = cell.run()
+    if isinstance(ret, Awaitable):
+        raise NotImplementedError
+    else:
+        output, defs = ret
+    return (output, defs)
+
+
+fig_width, fig_height = (
+    int(os.getenv("QUARTO_FIG_WIDTH") or 7),
+    int(os.getenv("QUARTO_FIG_HEIGHT") or 5),
+)
+
+
+def pplot(cell: Cell) -> Any:
+    outp, _ = run_cell(cell)
+    return (
+        outp
+        + lp.flavor_darcula()
+        + lp.ggsize(width=fig_width * 1000, height=fig_height * 1000)
+    )
+
+
 LetsPlot.setup_html()
 ```

-Testing plot
+## Daily statistics file size
+
+The simplest operation we can do is look at the overall file size for each of the daily
+statistics files over time. The files consist of a long list of packages which have been checked
+from the repositories that day, along with the number of package instances. It also contains
+the same list broken down by the specific installed version of each package, so if somebody has
+v0.9.1 and somebody else has v0.9.3, the two versions are counted separately.
+
+Another count is the number of different Kernels that have been used on that day, with their
+exact kernel name including major version, minor version and any suffix.
+
+These are the main things that drive the size of the file. Note that the file does not grow
+simply because of a higher absolute number of users, packages or uploads --- we will get to those shortly.
+
+No, an increase in file size here mainly suggests an increase in the 'breadth' of files on offer
+in the repository, whether that be a wider variety of program versions or more different
+packages that people are interested in, and those that the community chooses to use.
+
+So while the overall amount of packages gives a general estimate of the interest in the
+distribution, this can show a more 'distributor'-aligned view on how many different aisles of
+the buffet people are eating from.

 ```{python}
-#| column: page
+# | echo: true
 from notebooks.popcorn import plt_filesize
-outp, defs = plt_filesize.run()
+pplot(plt_filesize)
+```
+
+As we can see, the difference over time is massive. Especially early on, between 2019 and the
+start of 2021, the amount of different packages and package versions used grew rapidly, with the
+pace picking up once again starting 2023.
+
+There are a few outlier days with a size of 0 kB, which we will remove from the data. In all
+likelihood, those days were not reported correctly or there was some kind of issue on the
+backend so the stats for those days are lost.
+
+There are also a few days where the modification date of the file does not correspond to the
+represented statistical date but those are kept. This rather points to certain times when the
+files have been moved on the backend, or recreated externally but does not mean the data are
+bad.
+
+```{python}
+from notebooks.popcorn import tab_pkg
+outp, defs = tab_pkg.run()
+outp
+```
+
+## Package statistics
+
+Now that we have an idea of how the overall interest in the distribution has changed over time,
+let's look at the actual package statistics.
+
+The popcorn files contain two main pieces of information: the number of installs per package
+(e.g. how many people have rsync installed) and the number of unique installs (i.e. unique
+machines providing statistics). We will look at both of these in turn.
+
+```{python}
+from notebooks.popcorn import plt_weekly_packages
+pplot(plt_weekly_packages)
+```
+
+```{python}
+from notebooks.popcorn import plt_pkg_relative
+pplot(plt_pkg_relative)
+```
+
+The amount of packages installed on all machines increases strongly over time.
+
+```{python}
+from notebooks.popcorn import plt_weekday_packages
+pplot(plt_weekday_packages)
+```
+
+```{python}
+from notebooks.popcorn import plt_month_packages
+pplot(plt_month_packages)
+```
+
+```{python}
+from notebooks.popcorn import plt_top_packages
+pplot(plt_top_packages)
+```
+
+```{python}
+from notebooks.popcorn import plt_package_distribution
+pplot(plt_package_distribution)
+```
+
+```{python}
+from notebooks.popcorn import plt_top_packages
+_, defs = plt_top_packages.run()
+df_pkg_dl = defs["df_pkg_dl"]
+def get_num(df: pl.LazyFrame) -> int:
+    return df.count().collect(engine="streaming").item(0, 0)
+
+one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
+    (pl.col("count") >= 1) & (pl.col("count") < 10)
+)
+ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
+    (pl.col("count") >= 10) & (pl.col("count") < 20)
+)
+twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
+    (pl.col("count") >= 20) & (pl.col("count") < 30)
+)
+thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30))
+```
+
+There are `{python} f"{get_num(one_ten_installs):,}"` packages which have between one
+and nine installations in the data, `{python} f"{get_num(ten_twenty_installs):,}"`
+packages between ten and 19 installations, and
+`{python} f"{get_num(twenty_thirty):,}"` packages between 20 and 29 installations.
+`{python} f"{get_num(thirty_plus):,}"` packages have 30 or more installations.
+
+## Kernel Analysis
+
+```{python}
+from notebooks.popcorn import plt_kernel_versions
+pplot(plt_kernel_versions)
+```
+
+When looking at the kernel versions used, we see a very strong jump between major kernel version
+4 and major kernel version 5.
+
+For this analysis we had to exclude {kernel_df_v99.select(pl.len()).item()} rows which were
+apparently from the future, as they were running variations of major kernel version 99. In all
+likelihood there is a custom kernel version out there which reports its own major version as 99.
+The strange version starts appearing on {kernel_df_v99.select("date").row(0)[0]} and shows up
+all the way until {kernel_df_v99.select("date").row(-1)[0]}.
+
+```{python}
+from notebooks.popcorn import plt_kernel_timeline
+pplot(plt_kernel_timeline)
+```
+
+```{python}
+from datetime import date
+from notebooks.popcorn import plt_kernel_timeline
+_, defs = plt_kernel_timeline.run()
+weekly_kernel_df = defs["weekly_kernel_df"]
+
+last_kernel4: date = weekly_kernel_df.filter(pl.col("major_ver") == "4")[-1][
+    "date"
+].item()
+first_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[0][
+    "date"
+].item()
+last_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[-1][
+    "date"
+].item()
+```
+
+A timeline analysis of the kernels used to report daily downloads shows that people generally
+adopt new major kernel versions at roughly the same time. This change is especially stark between
+major kernel versions 5 and 6, which seem to have traded places in usage almost overnight.
+
+The first time that major version 5 of the kernel shows up is on `{python} f"{first_kernel5}"`. From here, it
+took a long time for the last of the version 4 kernels to disappear, coinciding with the big
+switch between major version 5 and 6. The last time a major version 4 is seen is on
+`{python} f"{last_kernel4}"`, while the last major version 5 kernels still pop up.
+It would seem, then, that the people still running kernel version 4 used the opportunity of
+everybody switching to the stable version of 6 to also upgrade their machines.
+
+## Odds and Ends
+
+There are some missing days in the statistics.
+
+```{python}
+from notebooks.popcorn import tab_missing_days
+
+outp, defs = tab_missing_days.run()
 outp
 ```

@ -46,3 +244,10 @@ outp
 - things we can't see (limitations)
  - packages on offer in the repositories
    - this could shed light on the bumps of users and relative package ownership
+
+Modified date != descriptive (named) date
+
+```{python}
+from notebooks.popcorn import plt_modified_times
+pplot(plt_modified_times)
+```