# analysis-voidlinux-popcorn/notebooks/popcorn.py

import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    import lets_plot as lp
    import marimo as mo
    import polars as pl

    LIMIT_ROWS = False  # set to an integer row count to trim the data for debugging
    DATA_DIR = "input/popcorn/output"

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    # Void Linux 'Popcorn' package repository stat analysis

    This notebook analyses the daily package repository statistics files,
    colloquially known as 'popcorn' files, that are generated by the Void Linux
    package manager `xbps` and uploaded by users who have opted in to share.
    """
    )
    return

@app.cell
def _():
    sizes_df = (
        pl.read_csv(
            f"{DATA_DIR}/files.csv",
            schema={
                "date": pl.Date,
                "filename": pl.String,
                "mtime": pl.Float32,
                "filesize": pl.UInt32,
            },
        )
        .with_columns(
            (pl.col("filesize") / 1024).alias("filesize_kb"),
            pl.from_epoch("mtime").alias("modified"),
        )
        .select(["date", "filesize", "filesize_kb", "modified"])
    )
    return (sizes_df,)

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Daily statistics file size

    The simplest thing we can do is look at the overall file size of each daily
    statistics file over time. The files consist of a long list of packages that have been
    checked against the repositories that day, along with the number of package instances.
    They also contain the same list broken down by the specific installed versions of each
    package, so if somebody has v0.9.1 and somebody else has v0.9.3 instead, both are
    counted separately. A further count covers the number of different kernels in use on
    that day, with their exact kernel name including major version, minor version and any
    suffix.

    These are the major things that will lead to size increases in the file, but not just
    through an increased absolute number of users, packages or uploads --- we will get to
    those shortly. No, an increase in file size here mainly suggests an increase in the
    'breadth' of files on offer in the repository, whether that be a wider variety of
    program versions or a larger set of distinct packages that people are interested in,
    and that the community chooses to use.

    So while the overall number of packages gives a general estimate of the interest in the
    distribution, this can show a more 'distributor'-aligned view of how many different
    aisles of the buffet people are eating from.
    """
    )
    return

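# A minimal sketch of the 'breadth' idea above: counting the distinct
# package/version pairs per day from packages.csv (the file is loaded properly
# further below) approximates how wide the buffet is, independent of how many
# machines report. The cell name plt_breadth is hypothetical.
@app.cell
def plt_breadth():
    breadth_per_day = (
        pl.scan_csv(
            f"{DATA_DIR}/packages.csv",
            schema={
                "date": pl.Date,
                "package": pl.String,
                "version": pl.String,
                "count": pl.UInt16,
            },
        )
        .group_by("date")
        .agg(pl.struct("package", "version").n_unique().alias("breadth"))
        .sort("date")
    )
    (
        lp.ggplot(
            breadth_per_day.collect(engine="streaming"), lp.aes("date", "breadth")
        )
        + lp.geom_line()
        + lp.labs(
            title="Repository breadth",
            subtitle="Distinct package/version pairs reported per day",
        )
    )
    return
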
@app.cell
def plt_filesize(sizes_df):
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Report size",
            subtitle="Filesize of popcorn statistics reports each day",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in KB",
        )
    )
    return

@app.cell
def plt_filesize_cumulative(sizes_df: pl.DataFrame):
    (
        lp.ggplot(
            sizes_df.with_columns(
                (pl.col("filesize").cum_sum() / 1024 / 1024).alias("filesize_cum")
            ),
            lp.aes(x="date", y="filesize_cum"),
        )
        + lp.geom_line()
        # + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Report size growth",
            subtitle="Cumulative filesize of all popcorn statistics reports up to that day",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in MB",
        )
    )
    return

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    As we can see, the difference over time is massive. Especially early on, between 2019
    and the start of 2021, the number of different packages and package versions in use
    grew rapidly, with the pace picking up once again from 2023.

    There are a few outlier days with a size of 0 kB, which we will remove from the data
    (see the sketch below). In all likelihood, those days were not reported correctly or
    there was some kind of issue on the backend, so the stats for those days are lost.

    There are also a few days where the modification date of the file does not correspond
    to the represented statistical date, but those are kept. This rather points to certain
    times when the files were moved on the backend, or recreated externally, and does not
    mean the data are bad.
    """
    )
    return

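# The text above says the 0 kB outlier days are removed; a minimal sketch of
# that cleaning step. The name sizes_df_clean is hypothetical and is not
# consumed by the cells below, which keep using sizes_df.
@app.cell
def df_sizes_clean(sizes_df: pl.DataFrame):
    sizes_df_clean = sizes_df.filter(pl.col("filesize") > 0)
    sizes_df_clean
    return (sizes_df_clean,)
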
@app.cell
def tab_pkg():
    df_versions_lazy = pl.scan_csv(
        f"{DATA_DIR}/packages.csv",
        schema={
            "date": pl.Date,
            "package": pl.String,
            "version": pl.String,
            "count": pl.UInt16,
        },
    )
    df_pkg_lazy = (
        df_versions_lazy.drop("version")
        .group_by(["date", "package"])
        .agg(pl.sum("count"))
        .sort("date")
    )
    if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
        df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)

    # give a small df preview
    df_pkg_lazy.head(100).collect(engine="streaming")
    return (df_pkg_lazy,)

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Package statistics

    Now that we have an idea of how the overall interest in the distribution has changed
    over time, let's look at the actual package statistics.

    The popcorn files contain two main pieces of information: the number of installs per
    package (e.g. how many people have rsync installed) and the number of unique installs
    (i.e. unique machines providing statistics). We will look at both of these in turn.
    """
    )
    return

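# A concrete instance of the first measure, as a minimal sketch: the daily
# install count of a single package, using rsync (the example named above).
# The cell name plt_single_package is hypothetical.
@app.cell
def plt_single_package(df_pkg_lazy: pl.LazyFrame):
    (
        lp.ggplot(
            df_pkg_lazy.filter(pl.col("package") == "rsync").collect(
                engine="streaming"
            ),
            lp.aes("date", "count"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Installs of a single package",
            subtitle="Daily reported installs of rsync",
        )
    )
    return
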
@app.cell
def plt_weekly_packages(df_pkg_lazy: pl.LazyFrame):
    pkg_per_day = df_pkg_lazy.group_by("date").agg(pl.col("count").sum()).sort("date")

    def _():
        weekly_packages = pkg_per_day.group_by_dynamic("date", every="1w").agg(
            pl.col("count").sum()
        )
        return (
            lp.ggplot(
                weekly_packages.collect(engine="streaming"), lp.aes("date", "count")
            )
            + lp.geom_line()
            + lp.geom_smooth(method="loess")
            + lp.labs(
                title="Weekly package ownership",
                subtitle="Count of all installed packages aggregated for each week",
                y="number of packages",
            )
        )

    _()
    return (pkg_per_day,)

@app.cell
def plt_pkg_relative(pkg_per_day: pl.LazyFrame, df_unique_installs: pl.DataFrame):
    def _():
        relative_packages = (
            pkg_per_day.with_columns(df_unique_installs["unique"])
            .with_columns((pl.col("count") / pl.col("unique")).alias("relative"))
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("relative").mean())
        )
        return (
            lp.ggplot(
                relative_packages.collect(engine="streaming"),
                lp.aes("date", "relative"),
            )
            + lp.geom_line()
            + lp.geom_smooth(method="loess")
            + lp.labs(
                title="Package ownership per user",
                subtitle="Average relative weekly package ownership",
                caption="Calculated as total packages per day divided by unique installations",
                y="number of packages",
            )
        )

    _()
    return

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    The number of packages installed across all machines increases strongly over time.
    """
    )
    return

@app.cell
def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame):
    def _():
        weekday_downloads = (
            df_pkg_lazy.with_columns(
                pl.col("date")
                .dt.weekday()
                .replace_strict(
                    {
                        1: "Mon",
                        2: "Tue",
                        3: "Wed",
                        4: "Thu",
                        5: "Fri",
                        6: "Sat",
                        7: "Sun",
                    }
                )
                .alias("weekday")
            )
            .group_by("weekday")
            .agg(pl.col("count").sum())
        )
        return (
            lp.ggplot(
                weekday_downloads.collect(engine="streaming"),
                lp.aes("weekday", "count"),
            )
            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Ownership per weekday",
                caption="Package ownership per day of the week over all time",
            )
        )

    _()
    return

@app.cell
def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
    def _():
        month_agg_downloads = (
            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
            # restrict to whole Oct-Oct years so every month is represented
            # equally often in the aggregate
            .filter(
                (pl.col("date") >= pl.date(2018, 10, 1))
                & (pl.col("date") < pl.date(2025, 10, 1))
            )
            .group_by("month")
            .agg(pl.col("count").sum())
        )
        return (
            lp.ggplot(
                month_agg_downloads.collect(engine="streaming"),
                lp.aes("month", "count"),
            )
            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Monthwise ownership",
                caption="Package ownership per month of the year over all time",
            )
        )

    _()
    return

@app.cell
def plt_unique_installs():
    df_unique_installs = pl.read_csv(
        f"{DATA_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
    )
    (
        lp.ggplot(
            df_unique_installs.sort("date")
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("unique").mean()),
            lp.aes("date", "unique"),
        )
        + lp.geom_line()
        + lp.geom_smooth(method="loess")
        + lp.labs(
            title="Unique installations",
            subtitle="Weekly statistics upload averages",
            caption="Daily number of unique providers for package update statistics opting in to data collection.",
        )
    )
    return (df_unique_installs,)

@app.cell
def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())

    def _():
        DISPLAY_LIMIT = 20
        return lp.gggrid(
            [
                lp.ggplot(
                    df_pkg_dl.sort("count", descending=True)
                    # the PopCorn client is presumably on every reporting
                    # machine, so it would otherwise dominate the chart
                    .filter(pl.col("package") != "PopCorn")
                    .head(DISPLAY_LIMIT)
                    .collect(engine="streaming"),
                    lp.aes("package", "count"),
                )
                + lp.geom_bar(stat="identity")
                + lp.labs(
                    title="Top packages",
                    caption="Most installed packages over all time",
                ),
                lp.ggplot(
                    df_pkg_dl.sort("count", descending=False)
                    # this seems arbitrary but gives a better result?
                    .head(DISPLAY_LIMIT)
                    .collect(engine="streaming"),
                    lp.aes("package", "count"),
                )
                + lp.geom_bar(stat="identity")
                + lp.labs(
                    title="Rare packages",
                    caption="Least often installed packages",
                ),
            ],
            ncol=1,
        )

    _()
    return (df_pkg_dl,)

@app.cell
def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
    # packages that were only ever reported installed a single time
    df_pkg_dl.filter(pl.col("count") == 1).collect(engine="streaming")
    return

@app.cell(hide_code=True)
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
    def _():
        return (
            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
            + lp.geom_freqpoly(stat="bin")
            + lp.labs(
                title="Package installation count distribution",
            )
        )

    _()
    return

@app.cell
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
    def get_num(df: pl.LazyFrame) -> int:
        return df.count().collect(engine="streaming").item(0, 0)

    one_ten_installs = df_pkg_dl.filter(
        (pl.col("count") >= 1) & (pl.col("count") < 10)
    )
    ten_twenty_installs = df_pkg_dl.filter(
        (pl.col("count") >= 10) & (pl.col("count") < 20)
    )
    twenty_thirty = df_pkg_dl.filter(
        (pl.col("count") >= 20) & (pl.col("count") < 30)
    )
    thirty_plus = df_pkg_dl.filter(pl.col("count") >= 30)

    pl.DataFrame(
        {
            "installs": ["1-9", "10-19", "20-29", "30+"],
            "packages": [
                get_num(one_ten_installs),
                get_num(ten_twenty_installs),
                get_num(twenty_thirty),
                get_num(thirty_plus),
            ],
        }
    )
    return

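# An alternative sketch to the hand-rolled buckets above, using polars'
# built-in quantiles over the same per-package totals; the cell name
# tab_quantiles is hypothetical.
@app.cell
def tab_quantiles(df_pkg_dl: pl.LazyFrame):
    df_pkg_dl.select(
        [
            pl.col("count").quantile(q).alias(f"p{int(q * 100)}")
            for q in (0.25, 0.5, 0.75, 0.9, 0.99)
        ]
    ).collect(engine="streaming")
    return
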
@app.cell(hide_code=True)
def _():
    mo.md(r"""## Kernel Analysis""")
    return

# - which kernels have been DL when? (simplified for semver)
@app.cell
def plt_kernel_versions():
    kernel_df_lazy = (
        pl.scan_csv(
            f"{DATA_DIR}/kernels.csv",
            schema={
                "date": pl.Date,
                "kernel": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .fill_null(0)
        # truncate full kernel names to their "major.minor.patch" prefix
        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
        .with_columns(
            pl.col("kernel")
            .str.replace(r"(\d+).*", "${1}")
            .str.to_integer(dtype=pl.UInt8)
            .alias("major_ver"),
            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
        )
        # drop the odd 99.x kernels; the next cell looks at when they appear
        .filter(pl.col("major_ver") != 99)
    )
    (
        lp.ggplot(
            kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
            .group_by("major_ver")
            .agg(pl.col("downloads").sum())
            .sort("major_ver")
            .collect(engine="streaming"),
            lp.aes("major_ver", "downloads"),
        )
        + lp.geom_bar(stat="identity")
        + lp.labs(
            title="Kernel versions used",
            caption="For each daily download, add up the currently running kernel version",
        )
    )
    return (kernel_df_lazy,)

@app.cell
def df_kernel_v99():
    # dates on which the odd 99.x kernel versions show up in the reports
    kernel_df_v99 = (
        pl.scan_csv(
            f"{DATA_DIR}/kernels.csv",
            schema={
                "date": pl.Date,
                "kernel": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .fill_null(0)
        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
        .with_columns(
            pl.col("kernel")
            .str.replace(r"(\d+).*", "${1}")
            .str.to_integer(dtype=pl.UInt8)
            .alias("major_ver"),
            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
        )
        .filter(pl.col("major_ver") == 99)
        .collect(engine="streaming")
        .select("date")
    )
    kernel_df_v99
    return

@app.cell
def plt_kernel_timeline(kernel_df_lazy: pl.LazyFrame):
    weekly_kernel_df = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
        .sort("date")
        .group_by_dynamic("date", every="1w", group_by="major_ver")
        .agg(pl.col("downloads").sum())
        .collect(engine="streaming")
    )
    (
        lp.ggplot(
            weekly_kernel_df,
            lp.aes("date", "downloads", color="major_ver"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Kernels over time",
            caption="For each daily download, count used kernel versions",
        )
    )
    return (weekly_kernel_df,)

@app.cell(hide_code=True)
def _(weekly_kernel_df: pl.DataFrame):
    from datetime import date

    last_kernel4: date = weekly_kernel_df.filter(pl.col("major_ver") == "4")[-1][
        "date"
    ].item()
    first_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[0][
        "date"
    ].item()
    last_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[-1][
        "date"
    ].item()
    mo.md(
        rf"""
    A timeline analysis of the kernels used to report daily downloads shows that people
    generally adopt new major kernel versions at roughly the same time. This change is
    especially stark between major kernel versions 5 and 6, which seem to have traded
    places in usage almost overnight.

    The first time that major version 5 of the kernel shows up is on {first_kernel5}. From
    here, it took a long time for the last of the version 4 kernels to disappear,
    coinciding with the big switch between major versions 5 and 6. The last time a major
    version 4 is seen is on {last_kernel4}, while major version 5 kernels still popped up
    as late as {last_kernel5}.

    It would seem, then, that the people still running kernel version 4 used the
    opportunity of everybody switching to the stable version 6 to also upgrade their
    machines.
    """
    )
    return

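# A minimal companion sketch to the hand-over story above: the first and last
# week each major kernel version appears in the weekly data. The cell name
# tab_kernel_lifespans is hypothetical.
@app.cell
def tab_kernel_lifespans(weekly_kernel_df: pl.DataFrame):
    weekly_kernel_df.group_by("major_ver").agg(
        pl.col("date").min().alias("first_seen"),
        pl.col("date").max().alias("last_seen"),
    ).sort("major_ver")
    return
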
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Odds and Ends

    There are some missing days in the statistics.
    """
    )
    return

@app.cell
def tab_missing_days(sizes_df: pl.DataFrame):
    date_range = pl.date_range(
        sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
    )
    pl.DataFrame().select(date_range).filter(
        ~date_range.is_in(sizes_df["date"].implode())
    ).style.tab_header(
        title="Missing Days",
        subtitle="Days with 0 B size, due to being missing on the popcorn server.",
    )
    return

@app.cell
def plt_modified_times(sizes_df):
    # Disregard this cell.
    # It was originally used to find days where the given date (i.e. filename)
    # diverged from the modification date (i.e. unix mdate). But there is not
    # too much new info to be gained that the missing days above don't already
    # provide, imo. Aside from many files being externally modified on one day,
    # pointing to them being moved/changed all at once as part of a server
    # migration or update.
    #
    # With the current data, this information is unfortunately lost, as the
    # files have now also been modified (attributes) within the dataset itself.
    # An updated dataset could make use of this information as part of its
    # records, potentially.
    def _():
        different_modification_date = sizes_df.filter(
            pl.col("date") != pl.col("modified").dt.date()
        )
        return (
            lp.ggplot(
                different_modification_date,
                lp.aes("date", "modified"),
            )
            + lp.geom_freqpoly()
        )

    _()
    return

# further ideas:
#
# - _relative_ package amounts: absolute package counts / absolute unique installs
#
# - daily download habits:
#   - are we downloading a wider spread of versions on specific days?
#   - are there 'update' days, where things converge? specific weekdays/holidays/etc.?
#
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
#   - have the arches been mostly even relative to each other?
#
# - what does a 'unique install' mean?
#
# - which packages had the most unique versions, which the least? (see the sketch below)
#   - which pkg had the most downloads of a single version?
#   - for which pkg were the version downloads the most spread out?

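# A minimal sketch for the "most unique versions" idea above, reusing the
# packages.csv schema from tab_pkg; the cell name tab_version_spread is
# hypothetical and the result is a first stab, not a polished analysis.
@app.cell
def tab_version_spread():
    (
        pl.scan_csv(
            f"{DATA_DIR}/packages.csv",
            schema={
                "date": pl.Date,
                "package": pl.String,
                "version": pl.String,
                "count": pl.UInt16,
            },
        )
        .group_by("package")
        .agg(pl.col("version").n_unique().alias("unique_versions"))
        .sort("unique_versions", descending=True)
        .head(20)
        .collect(engine="streaming")
    )
    return
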
if __name__ == "__main__":
    app.run()