import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import json
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl


@app.cell(hide_code=True)
def _():
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis""")
    return


@app.cell
def _():
    def parse_size(size_str):
        # Pull the numeric kilobyte value out of a size string like "5.6 kB";
        # return None when the value cannot be parsed.
        try:
            return float(re.search(r"(\d+.?\d+) kB", size_str).group(1))  # pyright: ignore[reportOptionalMemberAccess]
        except AttributeError:
            return None

    sizes_df_raw = (
        pl.read_csv("data/file_sizes.csv")
        .with_columns(
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date"),
            pl.col("size")
            .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        ## Daily statistics file size

        The simplest operation we can do is look at the overall file size of
        each daily statistics file over time. Each file consists of a long
        list of packages that have been downloaded from the repositories that
        day, along with the number of downloads. It also contains the same
        list broken down by the specific versions that were downloaded, so if
        somebody downloads v0.9.1 and somebody else downloads v0.9.3, the two
        downloads are counted separately. (A sketch of the assumed file
        structure follows below.)

        Another count is the number of different kernels that have been used
        to download from the repositories (or perhaps that have been
        downloaded from them?).

        These are the major things that drive size increases in the file, and
        notably it is not simply a higher number of downloads --- we will get
        to those shortly.

        Instead, an increase in file size here mainly suggests an increase in
        the 'breadth' of files on offer in the repository, whether that be a
        wider variety of program versions or a larger set of packages that
        people are interested in.

        So while the overall number of downloads gives a general estimate of
        the interest in the distribution, this shows a more
        'distributor'-aligned view of how many different aisles of the buffet
        people are eating from.
        """
    )
    return
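

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        As a rough orientation, a single daily statistics file is assumed to
        look roughly like the sketch below. Only the `Packages` mapping is
        confirmed by the code further down (it gets unnested into a
        package/downloads table); the `Versions` and `Kernels` key names and
        all example values are illustrative assumptions, not taken from the
        real data.

        ```python
        {
            # package name -> total downloads that day (confirmed: "Packages")
            "Packages": {"bash": 120, "firefox": 45},
            # package -> version -> downloads (key name assumed)
            "Versions": {"firefox": {"128.0_1": 30, "128.0_2": 15}},
            # kernel -> count (key name assumed)
            "Kernels": {"6.6.32_1": 40},
        }
        ```
        """
    )
    return

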
@app.cell
def _(sizes_df):
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw JSON file size, with no reformatting and no removal of markers, characters, or newlines.",
        )
    )
    return


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        As we can see, the difference over time is massive. Especially early
        on, between 2019 and the start of 2021, the variety of things being
        downloaded grew rapidly, and the pace picked up again from 2023
        onwards.
        """
    )
    return


@app.cell
def _():
    df = (
        pl.scan_ndjson("data/daily/*", include_file_paths="file")
        .head(200)  # FIXME: take out after debug
        .with_columns(
            pl.col("file")
            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date")
        )
        .select("date", pl.col("Packages").struct.unnest())
        .fill_null(0)
        .unpivot(index="date", variable_name="package", value_name="downloads")
        .collect()
    )
    df
    return (df,)


@app.cell
def _(df: pl.DataFrame):
    (
        lp.ggplot(
            df.group_by("date").agg(pl.col("downloads").sum()).sort("date"),
            lp.aes("date", "downloads"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Daily downloads",
        )
    )
    return


@app.cell
def _():
    mo.md(
        r"""
        ## Odds and Ends

        There are some missing days in the statistics.
        """
    )
    return


@app.cell
def _(sizes_df_raw):
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    sizes_df_null.select(["date", "size"]).style.tab_header(
        title="Missing Days",
        subtitle="Days with a size of 0 B because the file is missing on the popcorn server.",
    )
    return


@app.cell
def _(sizes_df):
    def _():
        different_modification_date = sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt"),
        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
        # This does not work well. What are we showing?
        # The 'true' capture date on X, but then what on Y - the
        # same date for each? The difference in dt? (A sketch of the
        # dt-difference idea follows below.)
        return (
            lp.ggplot(
                different_modification_date,
                lp.aes("date", "modified_dt"),
            )
            + lp.geom_freqpoly()
        )

    _()
    return
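

@app.cell
def _(sizes_df):
    # A minimal, hedged sketch of the "difference in dt" idea from the
    # comment above: plot how many days the file's modification timestamp
    # lags behind the date encoded in its name. The column names reuse the
    # ones defined earlier in this notebook; the lag computation itself is
    # an assumption about what might be worth showing, not something the
    # original analysis settled on.
    _lagged = (
        sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt")
        )
        .with_columns(
            (pl.col("modified_dt").dt.date() - pl.col("date"))
            .dt.total_days()
            .alias("lag_days")
        )
        .filter(pl.col("lag_days") != 0)
    )
    (
        lp.ggplot(_lagged, lp.aes("date", "lag_days"))
        + lp.geom_point()
        + lp.labs(
            title="Modification lag",
            subtitle="Days between a file's date and its modification timestamp",
        )
    )
    return

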
# further ideas:
# - which kernels have been downloaded when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does a unique install mean?
#
# - which packages had the most unique versions, and which the fewest?
# - which package had the most downloads of a single version?
# - for which package were the version downloads the most spread out?

if __name__ == "__main__":
    app.run()