Remove redundant cleaning files

2025-10-01 14:42:07 +02:00 · 2025-10-01 14:42:07 +02:00 · 66b0464809
commit 66b0464809
parent 0618814c49
3 changed files with 0 additions and 169 deletions
--- a/notebooks/popcorn.py
+++ b/notebooks/popcorn.py
@ -0,0 +1,562 @@
+import marimo
+
+__generated_with = "0.16.2"
+app = marimo.App(width="medium")
+
+with app.setup:
+    # Initialization code that runs before all other cells
+    import re
+    from pathlib import Path
+
+    import lets_plot as lp
+    import marimo as mo
+    import polars as pl
+
+    LIMIT_ROWS = 50_000
+    DATA_RAW_DIR = "data/raw"
+    DATA_CLEAN_DIR = "data/cleaned"
+    DATA_PARQUET_DIR = "data/parquet"
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis
+
+    This notebook analyses the daily package repository statistics files,
+    colloquially known as 'popcorn' files, that are generated by the Void Linux
+    package manager `xbps` and uploaded by users who have opted in to share.
+    """)
+    return
+
+
+# Data preparation: run the project-local `clean` helpers on the raw daily
+# JSON directory so that the cleaned directories used by this notebook exist.
+@app.cell
+def _():
+    import clean
+
+    # Every helper gets the raw daily directory as its first argument and a
+    # target under DATA_CLEAN_DIR as its second.
+    # NOTE(review): `force=False` presumably skips work that was already done
+    # -- confirm against the `clean` module.
+
+    # Target matches the `{DATA_CLEAN_DIR}/daily/*.csv` glob scanned further down.
+    clean.json_to_daily_pkg(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
+    )
+    # NOTE(review): presumably produces `unique_installs.csv` directly inside
+    # DATA_CLEAN_DIR, which a later cell reads -- confirm against `clean`.
+    clean.json_to_unique_csv(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
+    )
+    # Target matches the `{DATA_CLEAN_DIR}/kernels/*.csv` glob scanned further down.
+    clean.json_to_daily_kernel_csv(
+        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
+    )
+
+
+@app.cell
+def _():
+    def parse_size(size_str):
+        # Extract the number in front of " kB" from the size text; text that
+        # does not match yields None.
+        # NOTE(review): the `.` in the pattern is unescaped (matches any
+        # character) and the pattern needs at least two digits -- confirm
+        # that this is what the size strings require.
+        found = re.search(r"(\d+.?\d+) kB", size_str)
+        if found is None:
+            return None
+        return float(found.group(1))
+
+    # The statistics date is taken from the file name (data/YYYY-MM-DD.json).
+    _date_from_name = (
+        pl.col("name")
+        .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
+        .str.to_date()
+        .alias("date")
+    )
+    # Numeric size next to the original size text, which is kept as-is.
+    _numeric_size = (
+        pl.col("size")
+        .map_elements(parse_size, return_dtype=pl.Float32)
+        .alias("size_num")
+    )
+
+    _listing = pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
+    sizes_df_raw = _listing.with_columns(_date_from_name, _numeric_size).select(
+        ["date", "size_num", "size", "modified"]
+    )
+    # Rows without a parsable size are excluded from the cleaned frame only;
+    # the raw frame keeps them.
+    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
+    return sizes_df, sizes_df_raw
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(
+        r"""
+    ## Daily statistics file size
+
+    The simplest operation we can do is look at the overall file size for each of the daily
+    statistics files over time. The files consist of a long list of packages which have been checked
+    from the repositories that day, along with the number of package instances. It also consists of
+    the same list separated by specifically installed versions of packages, so if somebody has
+    v0.9.1 and somebody else v0.9.3 instead this would count both packages separately.
+
+    Another count is the number of different Kernels that have been used on that day, with their
+    exact kernel name including major version, minor version and any suffix.
+
+    These are the major things that will lead to size increases in the file, but not just for an
+    increased amount of absolute users, packages or uploads --- we will get to those shortly.
+
+    No, an increase in file size here mainly suggests an increase in the 'breadth' of files on offer
+    in the repository, whether that be a wider variety of program versions or more different
+    packages that people are interested in, and those that the community chooses to use.
+
+    So while the overall amount of packages gives a general estimate of the interest in the
+    distribution, this can show a more 'distributor'-aligned view on how many different aisles of
+    the buffet people are eating from.
+
+    """
+    )
+    return
+
+
+@app.cell
+def _(sizes_df):
+    # Scatter of the daily file size over time with a linear trend line.
+    # The parsed numeric column `size_num` goes on the y axis: `size` holds
+    # the raw size text, which cannot be placed on a continuous axis or be
+    # fitted by the linear smoother.
+    (
+        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
+        + lp.geom_point()
+        + lp.geom_smooth(method="lm")
+        + lp.labs(
+            title="Size growth",
+            subtitle="Size of daily popcorn statistics files over time",
+            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
+        )
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(
+        r"""
+
+    As we can see, the difference over time is massive. Especially early on, between 2019 and the
+    start of 2021, the amount of different packages and package versions used grew rapidly, with the
+    pace picking up once again starting 2023.
+
+    There are a few outlier days with a size of 0 kB, which we will remove from the data. In all
+    likelihood, those days were not reported correctly or there was some kind of issue on the
+    backend so the stats for those days are lost.
+
+    There are also a few days where the modification date of the file does not correspond to the
+    represented statistical date but those are kept. This rather points to certain times when the
+    files have been moved on the backend, or recreated externally but does not mean the data are
+    bad.
+
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    # Lazily scan all cleaned per-day package CSVs into one frame with the
+    # columns date / package / downloads. The file-path column is only added
+    # by the scan and dropped again right away.
+    df_pkg_lazy = (
+        pl.scan_csv(
+            f"{DATA_CLEAN_DIR}/daily/*.csv",
+            include_file_paths="file",
+            schema={
+                "date": pl.Date,
+                "package": pl.String,
+                "downloads": pl.UInt16,
+            },
+        )
+        .drop("file")
+        .fill_null(0)
+    )
+    if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
+        df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
+    # give small df preview
+    df_pkg_lazy.head(100).collect(engine="streaming")
+    # Hand the lazy frame to the cells that declare it as a parameter.
+    return (df_pkg_lazy,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(
+        r"""
+    ## Package statistics
+
+    Now that we have an idea of how the overall interest in the distribution has changed over time,
+    let's look at the actual package statistics.
+
+    The popcorn files contain two main pieces of information: the number of installs per package
+    (e.g. how many people have rsync installed) and the number of unique installs (i.e. unique
+    machines providing statistics). We will look at both of these in turn.
+
+    """
+    )
+    return
+
+
+@app.cell
+def _(df_pkg_lazy: pl.LazyFrame):
+    def _():
+        # Sum the downloads of all packages into one value per week; the
+        # dynamic grouping needs its input ordered by date.
+        weekly_totals = (
+            df_pkg_lazy.sort("date")
+            .group_by_dynamic("date", every="1w")
+            .agg(pl.col("downloads").sum())
+            .sort("date")
+            .collect(engine="streaming")
+        )
+
+        canvas = lp.ggplot(weekly_totals, lp.aes("date", "downloads"))
+        labels = lp.labs(
+            title="Weekly package ownership",
+            caption="Count of all installed packages aggregated for each week",
+        )
+        # Raw weekly line plus a loess trend on top.
+        return canvas + lp.geom_line() + lp.geom_smooth(method="loess") + labels
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(
+        r"""
+
+    The amount of packages installed on all machines increases strongly over time.
+
+    """
+    )
+    return
+
+
+@app.cell
+def _(df_pkg_lazy: pl.LazyFrame):
+    def _():
+        weekday_names = {
+            1: "Mon",
+            2: "Tue",
+            3: "Wed",
+            4: "Thu",
+            5: "Fri",
+            6: "Sat",
+            7: "Sun",
+        }
+        # Sum the downloads per ISO weekday. The weekday is derived row by row
+        # (sorting the expression on its own would detach it from the
+        # `downloads` values of the same row); ordering happens on the
+        # aggregated frame, by weekday number, before the numbers are turned
+        # into labels so the bars appear Mon..Sun.
+        weekday_downloads = (
+            df_pkg_lazy.with_columns(pl.col("date").dt.weekday().alias("weekday_num"))
+            .group_by("weekday_num")
+            .agg(pl.col("downloads").sum())
+            .sort("weekday_num")
+            .with_columns(
+                pl.col("weekday_num").replace_strict(weekday_names).alias("weekday")
+            )
+        )
+        return (
+            lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
+            # The data is already aggregated, so bar height is the value itself.
+            + lp.geom_bar(stat="identity")
+            + lp.labs(
+                title="Weekday downloads",
+                caption="Downloads aggregated per day of the week they took place.",
+            )
+        )
+
+    _()
+    return
+
+
+@app.cell
+def _(df_pkg_lazy: pl.LazyFrame):
+    def _():
+        # Sum the downloads per month of the year (1-12, pooled over all
+        # years) before plotting, so the bars show what the caption promises
+        # and only twelve rows are handed to the plot instead of every record.
+        month_agg_downloads = (
+            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
+            .group_by("month")
+            .agg(pl.col("downloads").sum())
+            .sort("month")
+        )
+        return (
+            lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
+            # The data is already aggregated, so bar height is the value itself.
+            + lp.geom_bar(stat="identity")
+            + lp.labs(
+                title="Monthwise downloads",
+                caption="Downloads aggregated per month of the year.",
+            )
+        )
+
+    _()
+    return
+
+
+@app.cell
+def _():
+    # One row per day with the number of unique uploaders (see caption).
+    # The underscore prefix keeps the frame local to this cell.
+    _unique_installs = pl.read_csv(
+        f"{DATA_CLEAN_DIR}/unique_installs.csv",
+        schema={"date": pl.Date, "unique": pl.UInt16},
+    )
+    (
+        lp.ggplot(_unique_installs, lp.aes("date", "unique"))
+        + lp.geom_line()
+        + lp.geom_smooth()
+        + lp.labs(
+            title="Unique daily uploads",
+            caption="Daily number of unique providers for package update statistics opting in to popcorn.",
+        )
+    )
+    return
+
+
+@app.cell
+def _(df_pkg_lazy: pl.LazyFrame):
+    # Total downloads per package over the whole data set; still lazy.
+    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
+
+    def _():
+        # Number of packages shown in each of the two bar charts.
+        DISPLAY_LIMIT = 20
+
+        return lp.gggrid(
+            [
+                # Packages with the highest totals.
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=True)
+                    .head(DISPLAY_LIMIT)
+                    .collect(engine="streaming"),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Top packages",
+                    caption="Most installed packages over all time",
+                ),
+                # Packages with the lowest totals.
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=False)
+                    # this seems arbitrary but gives a better result?
+                    .head(DISPLAY_LIMIT)
+                    .collect(engine="streaming"),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Rare packages",
+                    caption="Least often installed packages",
+                ),
+            ],
+            ncol=1,
+        )
+
+    _()
+    # Hand the per-package totals to the cells that declare them as a parameter.
+    return (df_pkg_dl,)
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        # Frequency polygon over the per-package download totals.
+        totals = df_pkg_dl.collect(engine="streaming")
+        polygon = lp.ggplot(totals, lp.aes("downloads")) + lp.geom_freqpoly(stat="bin")
+        return polygon + lp.labs(title="Package installation count distribution")
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        def get_num(df: pl.LazyFrame) -> int:
+            # Non-null count of the first column, i.e. how many packages
+            # remain in the filtered frame.
+            return df.count().collect(engine="streaming").item(0, 0)
+
+        # Row order has no influence on a count, so the frames are filtered
+        # without sorting first.
+        downloads = pl.col("downloads")
+        one_install = df_pkg_dl.filter(downloads == 1)
+        two_to_nine_installs = df_pkg_dl.filter((downloads >= 2) & (downloads < 10))
+        ten_to_nineteen_installs = df_pkg_dl.filter((downloads >= 10) & (downloads < 20))
+        # The wording below states the same ranges as the filters above.
+        return mo.md(rf"""
+
+        There are {get_num(one_install)} packages which have exactly a single
+        installation in the data, {get_num(two_to_nine_installs)} packages with
+        between two and nine installations, and
+        {get_num(ten_to_nineteen_installs)} packages with between ten and
+        nineteen.
+
+        """)
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r""" ## Kernel Analysis """)
+    return
+
+
+# Kernel usage: which kernel versions were reported when (version strings are
+# reduced to their numeric x.y.z part, plus major and major.minor columns).
+@app.cell
+def _():
+    kernel_df_lazy = (
+        pl.scan_csv(
+            f"{DATA_CLEAN_DIR}/kernels/*.csv",
+            schema={
+                "date": pl.Date,
+                "kernel": pl.String,
+                "downloads": pl.UInt16,
+            },
+        )
+        .fill_null(0)
+        # Cut everything after the numeric x.y.z part of the kernel name.
+        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
+        .with_columns(
+            pl.col("kernel")
+            .str.replace(r"(\d+).*", "${1}")
+            .str.to_integer(dtype=pl.UInt8)
+            .alias("major_ver"),
+            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
+        )
+    )
+
+    # Dates of all rows reporting major version 99, in chronological order so
+    # that the first and the last row are the first and last appearance.
+    kernel_df_v99 = (
+        kernel_df_lazy.filter(pl.col("major_ver") == 99)
+        .select("date")
+        .sort("date")
+        .collect(engine="streaming")
+    )
+    # Everything after this point works without the version 99 rows.
+    kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
+
+    (
+        lp.ggplot(
+            # Cast to text so the major version is a discrete axis.
+            kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
+            .group_by("major_ver")
+            .agg(pl.col("downloads").sum())
+            .sort("major_ver")
+            .collect(engine="streaming"),
+            lp.aes("major_ver", "downloads"),
+        )
+        + lp.geom_bar(stat="identity")
+        + lp.labs(
+            title="Kernel versions used",
+            caption="For each daily download, add up the currently running kernel version",
+        )
+    )
+    # Hand both frames to the cells that declare them as parameters.
+    return kernel_df_lazy, kernel_df_v99
+
+
+@app.cell(hide_code=True)
+def _(kernel_df_v99: pl.DataFrame):
+    mo.md(
+        rf"""
+
+    When looking at the kernel versions used, we see a very strong jump between major kernel version
+    4 and major kernel version 5.
+
+    For this analysis we had to exclude {kernel_df_v99.select(pl.len()).item()} rows which were
+    apparently from the future, as they were running variations of major kernel version 99. In all
+    likelihood there is a custom kernel version out there which reports its own major version as 99.
+    The strange version starts appearing on {kernel_df_v99.select("date").row(0)[0]} and shows up
+    all the way until {kernel_df_v99.select("date").row(-1)[0]}.
+
+    """
+    )
+    return
+
+
+@app.cell
+def _(kernel_df_lazy: pl.LazyFrame):
+    # Weekly download sums per major kernel version. The major version is cast
+    # to text so it can serve as a discrete colour in the plot.
+    weekly_kernel_df = (
+        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
+        .select(["date", "major_ver", "downloads"])
+        # The dynamic grouping needs its input ordered by date.
+        .sort("date")
+        .group_by_dynamic("date", every="1w", group_by="major_ver")
+        .agg(pl.col("downloads").sum())
+        .collect(engine="streaming")
+    )
+
+    (
+        lp.ggplot(
+            weekly_kernel_df,
+            lp.aes("date", "downloads", color="major_ver"),
+        )
+        + lp.geom_line()
+        + lp.labs(
+            title="Kernels over time",
+            caption="For each daily download, count used kernel versions",
+        )
+    )
+    # Hand the weekly frame to the cell that declares it as a parameter.
+    return (weekly_kernel_df,)
+
+
+@app.cell(hide_code=True)
+def _(weekly_kernel_df: pl.DataFrame):
+    from datetime import date
+
+    # First / last weekly bucket in which a major version appears, taken from
+    # the first / last row of the frame filtered to that version.
+    # NOTE(review): this relies on the rows of each version being in date
+    # order and on both versions being present -- confirm for partial data.
+    last_kernel4: date = weekly_kernel_df.filter(pl.col("major_ver") == "4")[-1][
+        "date"
+    ].item()
+    first_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[0][
+        "date"
+    ].item()
+    # Computed but not interpolated into the text below.
+    last_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[-1][
+        "date"
+    ].item()
+    mo.md(
+        rf"""
+
+    A timeline analysis of the kernels used to report daily downloads shows that people generally
+    adopt new major kernel versions at roughly the same time. This change is especially stark between
+    major kernel versions 5 and 6, which seem to have traded places in usage almost overnight.
+
+    The first time that major version 5 of the kernel shows up is on {first_kernel5}. From here, it
+    took a long time for the last of the version 4 kernels to disappear, coinciding with the big
+    switch between major version 5 and 6. The last time a major version 4 is seen is on
+    {last_kernel4}, while the last major version 5 kernels still pop up.
+    It would seem, then, that the people still running kernel version 4 used the opportunity of
+    everybody switching to the stable version of 6 to also upgrade their machines.
+
+    """
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(
+        r"""
+    ## Odds and Ends
+    There are some missing days in the statistics.
+    """
+    )
+    return
+
+
+@app.cell
+def _(sizes_df_raw):
+    # Rows for which no numeric size could be parsed (`size_num` is null);
+    # these are the rows excluded from the cleaned `sizes_df`.
+    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
+    # Show the date and the raw size text as a styled table with a header.
+    sizes_df_null.select(["date", "size"]).style.tab_header(
+        title="Missing Days",
+        subtitle="Days with 0B size due to missing on the popcorn server.",
+    )
+    return
+
+
+@app.cell
+def _(sizes_df):
+    def _():
+        # Keep only the rows whose file modification day differs from the
+        # statistics date taken from the file name.
+        different_modification_date = sizes_df.with_columns(
+            pl.col("modified")
+            # `strict=False`: values that do not fit the format do not raise.
+            .str.to_datetime(format="%F %T %:z", strict=False)
+            .alias("modified_dt"),
+        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
+        # This does not work well what are we showing?
+        # 'true' capture date on X but then what on Y - the
+        # same date for each? the difference in dt?
+        # NOTE(review): the plot below is marked as unfinished by the comment
+        # above; the choice of geom and axes is still open.
+        return (
+            lp.ggplot(
+                different_modification_date,
+                lp.aes("date", "modified_dt"),
+            )
+            + lp.geom_freqpoly()
+        )
+
+    _()
+    return
+
+
+# further ideas:
+#
+# - daily download habits:
+#   - are we downloading further spread of versions on specific days
+#   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
+#
+# - when did specific kernels enter the repos?
+#
+# - which arches are/were most prevalent over time?
+# - have the arches been mostly even relative to each other?
+#
+# - what does unique install mean?
+#
+# - which Packages had the most unique versions, least versions
+# - which pkg had the most download of a single version?
+# - for which pkg were the version dls the most spread out?
+
+if __name__ == "__main__":
+    app.run()