analysis-voidlinux-popcorn/popcorn.py

import marimo

# marimo version string recorded in this notebook file.
__generated_with = "0.16.2"
# Notebook application object; the cells below attach to it through the
# `@app.cell` decorators.
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import json
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl


@app.cell(hide_code=True)
def _():
    # Notebook title, written as a top-level markdown heading.
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis""")
    return


@app.cell
def _():
    def parse_size(size_str):
        """Extract the numeric kB value from a human-readable size string.

        Args:
            size_str: Size text such as ``"12.5 kB"``.

        Returns:
            The number directly preceding ``" kB"`` as a float, or ``None``
            when the text contains no kB figure.
        """
        # The fractional part is optional and the decimal point is escaped, so
        # single-digit sizes ("5 kB") parse correctly and arbitrary characters
        # are not accepted in place of the dot.
        found = re.search(r"(\d+(?:\.\d+)?) kB", size_str)
        return float(found.group(1)) if found else None

    # One row per daily statistics file: the date is taken from the file name,
    # the numeric size from the textual size column.
    sizes_df_raw = (
        pl.read_csv("data/file_sizes.csv")
        .with_columns(
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2})\.json", "${1}")
            .str.to_date()
            .alias("date"),
            pl.col("size")
            .map_elements(parse_size, return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    # Rows without a parseable size stay available in the raw frame only.
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw


@app.cell
def _(sizes_df):
    # Scatter of file size per day with a linear trend line.
    # The y aesthetic uses the parsed numeric column (`size_num`) rather than
    # the raw `size` text, because the linear fit needs a continuous value.
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="size (kB)",
        )
    )
    return


@app.cell
def _():
    # Long-format table built from the daily ndjson files: the date comes from
    # each file's path, and the fields of the `Packages` struct are unpivoted
    # into (package, downloads) rows.
    df = (
        pl.scan_ndjson("data/daily/*", include_file_paths="file")
        .head(200)  # FIXME: take out after debug
        .with_columns(
            pl.col("file")
            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2})\.json", "${1}")
            .str.to_date()
            .alias("date")
        )
        .select("date", pl.col("Packages").struct.unnest())
        .fill_null(0)
        .unpivot(index="date", variable_name="package", value_name="downloads")
        .collect()
    )
    df
    return


@app.cell
def _(sizes_df):
    # Kept as a separate cell so the plot is actually evaluated; it depends on
    # `sizes_df`, which this cell receives as a parameter.
    # Plots the numeric `size_num` column so the smoother has a continuous
    # y value to work with.
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lowess")
        + lp.labs(
            title="",
        )
    )
    return


@app.cell
def _():
    # Section heading and short intro for the notes on gaps in the data.
    mo.md(
        r"""
    ## Odds and Ends
    There are some missing days in the statistics.
    """
    )
    return


@app.cell
def _(sizes_df_raw):
    # Header text for the rendered table (underscore prefix keeps it
    # private to this cell).
    _header = {
        "title": "Missing Days",
        "subtitle": "Days with 0B size due to missing on the popcorn server.",
    }
    # Rows for which no numeric size is available.
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    _missing_days = sizes_df_null.select("date", "size")
    _missing_days.style.tab_header(**_header)
    return


@app.cell
def _(sizes_df):
    def _():
        # Parse the textual modification stamp into a datetime column;
        # strict=False is passed so parsing does not abort on bad values.
        modified_expr = pl.col("modified").str.to_datetime(
            format="%F %T %:z", strict=False
        )
        with_stamp = sizes_df.with_columns(modified_expr.alias("modified_dt"))

        # Keep only rows where the file-name date differs from the calendar
        # date of the modification stamp.
        dates_differ = pl.col("date") != pl.col("modified_dt").dt.date()
        mismatched = with_stamp.filter(dates_differ)

        # TODO: this chart does not work well yet - what should it show?
        # The 'true' capture date goes on X, but what goes on Y: the same
        # date for each row, or the difference to the modification time?
        chart = lp.ggplot(mismatched, lp.aes("date", "modified_dt"))
        return chart + lp.geom_freqpoly()

    _()
    return


# further ideas:
# - which kernels have been downloaded when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does unique install mean?
#
# - which packages had the most unique versions, and which the fewest?
# - which package had the most downloads of a single version?
# - for which package were the version downloads the most spread out?

# Run the notebook as a script when this file is executed directly.
if __name__ == "__main__":
    app.run()