# analysis-voidlinux-popcorn/popcorn.py
import marimo

__generated_with = "0.16.2"

app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl

    # Temporary cap on rows pulled from the cleaned CSV scans (see the
    # `.head(LIMIT_ROWS)` FIXME markers in the lazy-frame cells below).
    LIMIT_ROWS = 500_000
    # Directory layout produced by the companion `clean` module.
    DATA_RAW_DIR = "data/raw"
    DATA_CLEAN_DIR = "data/cleaned"
@app.cell(hide_code=True)
def _():
    # Title / introduction cell: renders markdown only, defines nothing.
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis

    This notebook analyses the daily package repository statistics files,
    colloquially known as 'popcorn' files, that are generated by the Void Linux
    package manager `xbps` and uploaded by users who have opted in to share.
    """)
    return
# run data prep
@app.cell
def _():
    import clean

    # The three converters share the same raw input directory; hoist the
    # path objects instead of rebuilding them per call.
    raw_daily = Path(DATA_RAW_DIR) / "daily"
    clean_root = Path(DATA_CLEAN_DIR)

    # force=False presumably lets each step skip files that were already
    # converted on a previous run — see clean.py to confirm.
    clean.json_to_daily_pkg(raw_daily, clean_root / "daily", force=False)
    clean.json_to_unique_csv(raw_daily, clean_root, force=False)
    clean.json_to_daily_kernel_csv(raw_daily, clean_root / "kernels", force=False)
@app.cell
def _():
    def parse_size(size_str: str) -> float | None:
        """Extract the numeric kilobyte value from a string like '123.4 kB'.

        Returns None when no size can be parsed.  Note the pattern requires
        at least two digits, so the '0 kB' placeholder rows for days missing
        on the popcorn server stay unparsed (None) and get filtered out
        below — keep that behavior when touching the regex.
        """
        # The dot is escaped: the previous unescaped '.' matched ANY
        # character, which could hand float() an unparseable group and
        # raise an uncaught ValueError (only AttributeError was handled).
        match = re.search(r"(\d+\.?\d+) kB", size_str)
        return float(match.group(1)) if match else None

    sizes_df_raw = (
        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
        .with_columns(
            # 'data/2021-01-31.json' -> date 2021-01-31
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date"),
            # Human-readable size string -> float kB (null when unparseable).
            pl.col("size")
            .map_elements(parse_size, return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    # Drop the rows whose size could not be parsed (missing days, 0 kB).
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw
@app.cell(hide_code=True)
def _():
    # Narrative cell introducing the file-size-over-time analysis.
    mo.md(
        r"""
    ## Daily statistics file size

    The simplest operation we can do is look at the overall file size for each
    of the daily statistics files over time. The files consist of a long list
    of packages which have been downloaded from the repositories that day,
    along with the number of downloads. It also consists of the same list
    separated by specifically downloaded versions of packages, so if somebody
    downloads v0.9.1 and somebody else downloads v0.9.3 this would count both
    downloads separately.
    Another count is the number of different Kernels that have been used to
    download (or downloaded?) from the repositories.
    These are the major things that will lead to size increases in the file,
    but not just for an increased amount of downloads --- we will get to those shortly.
    No, an increase in file size here mainly suggests an increase in the
    'breadth' of files on offer in the repository, whether that be a wider
    variety of program versions or more different packages that people are
    interested in.
    So while the overall amount of downloads gives a general estimate of the
    interest in the distribution, this can show a more 'distributor'-aligned
    view on how many different aisles of the buffet people are eating from.
    """
    )
    return
@app.cell
def _(sizes_df):
    # Plot the parsed numeric column 'size_num'; the previous y="size" used
    # the raw string column (e.g. "123.4 kB"), which lets-plot treats as a
    # discrete axis and which breaks the linear-model smoother.
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
        )
    )
    return
@app.cell(hide_code=True)
def _():
    # Narrative cell: observations on the size-growth plot above.
    mo.md(
        r"""
    As we can see, the difference over time is massive. Especially early on,
    between 2019 and the start of 2021, the amount of different stuff
    downloaded grew rapidly, with the pace picking up again starting 2023.
    There are a few outliers with a size of 0 kB, which we will remove from the
    data. There are also a few days where the modification date of the file
    does not correspond to the represented statistical date.
    """
    )
    return
@app.cell
def _():
    # Lazily scan all per-day package CSVs into one frame.
    df_pkg_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/daily/*.csv",
            include_file_paths="file",
            schema={
                "date": pl.Date,
                "package": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .drop("file")
        .fill_null(0)
        .head(LIMIT_ROWS)  # FIXME: take out after debug
    )
    # Materialize once so the cell displays a preview of the data.
    df_pkg_lazy.collect()
    # Downstream cells take df_pkg_lazy as a parameter, so the cell must
    # return it; the previous bare `return` left it undefined for them.
    return (df_pkg_lazy,)
@app.cell(hide_code=True)
def _():
    # Narrative cell introducing the download-count analyses.
    mo.md(
        r"""
    ## Download statistics

    Now that we have an idea of how the overall interest in the distribution
    has changed over time, let's look at the actual download statistics.
    The popcorn files contain two main pieces of information: the number of
    unique installs (i.e. unique machines downloading packages) and the number
    of downloads per package. We will look at both of these in turn.
    """
    )
    return
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # Bucket downloads into calendar weeks before plotting the trend.
        per_week = (
            df_pkg_lazy.sort("date")
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("downloads").sum())
            .sort("date")
            .collect()
        )
        plot = lp.ggplot(per_week, lp.aes("date", "downloads"))
        plot += lp.geom_line()
        plot += lp.geom_smooth(method="loess")
        plot += lp.labs(
            title="Weekly downloads",
        )
        return plot

    _()
    return
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # Map the ISO weekday number (1 = Monday) to a short label.  The
        # previous version called `.sort()` on this single expression inside
        # with_columns, which sorts the weekday column independently of the
        # rest of the frame and misaligns the labels with their download
        # counts — the bars then aggregated the wrong rows.
        weekday_downloads = (
            df_pkg_lazy.sort("date")
            .with_columns(
                pl.col("date")
                .dt.weekday()
                .replace_strict(
                    {
                        1: "Mon",
                        2: "Tue",
                        3: "Wed",
                        4: "Thu",
                        5: "Fri",
                        6: "Sat",
                        7: "Sun",
                    }
                )
                .alias("weekday")
            )
            .collect()
        )
        # NOTE(review): lets-plot geom_bar defaults to stat="count"; if the
        # bars should show summed downloads (per the caption) rather than
        # row counts, this may need stat="identity" on pre-aggregated data
        # — verify the rendered chart.
        return (
            lp.ggplot(weekday_downloads, lp.aes("weekday", "downloads"))
            + lp.geom_bar()
            + lp.labs(
                title="Weekday downloads",
                caption="Downloads aggregated per day of the week they took place.",
            )
        )

    _()
    return
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # Tag every row with its calendar month (1-12); the bar geometry
        # then aggregates per month.
        by_month = (
            df_pkg_lazy.sort("date")
            .with_columns(pl.col("date").dt.month().alias("month"))
            .collect()
        )
        labels = lp.labs(
            title="Monthwise downloads",
            caption="Downloads aggregated per month of the year.",
        )
        return lp.ggplot(by_month, lp.aes("month", "downloads")) + lp.geom_bar() + labels

    _()
    return
@app.cell
def _():
    # Daily count of unique machines uploading popcorn statistics.
    # (Leading underscore keeps the frame local to this cell.)
    _unique_installs = pl.read_csv(
        f"{DATA_CLEAN_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
    )
    (
        lp.ggplot(_unique_installs, lp.aes("date", "unique"))
        + lp.geom_line()
        + lp.geom_smooth()
        + lp.labs(
            title="Unique daily uploads",
            caption="Daily number of unique providers for package update statistics opting in to popcorn.",
        )
    )
    return
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    DISPLAY_TOP = 20

    # All-time download totals per package.
    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()

    def _bar_chart(frame, title, caption):
        # Identity-stat bar chart of pre-aggregated download totals.
        return (
            lp.ggplot(frame, lp.aes("package", "downloads"))
            + lp.geom_bar(stat="identity")
            + lp.labs(title=title, caption=caption)
        )

    # Most- and least-downloaded packages, stacked vertically.
    lp.gggrid(
        [
            _bar_chart(
                df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
                "Top packages",
                "Most updated packages over all time",
            ),
            _bar_chart(
                df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
                "Rarest packages",
                "Least updated packages over all time",
            ),
        ],
        ncol=1,
    )
    return
@app.cell(hide_code=True)
def _():
    # Section heading for the kernel-version analysis below.
    mo.md(r""" ## Kernel Analysis """)
    return
# - which kernels have been DL when? (simplified for semver)
@app.cell
def _():
    kernel_df_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/kernels/*.csv",
            schema={
                "date": pl.Date,
                "kernel": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .fill_null(0)
        # Trim kernel strings down to plain 'major.minor.patch'.
        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
        .with_columns(
            # Leading digits -> numeric major version.
            pl.col("kernel")
            .str.replace(r"(\d+).*", "${1}")
            .str.to_integer(dtype=pl.UInt8)
            .alias("major_ver"),
            # 'major.minor' prefix kept as a string.
            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
        )
        .head(LIMIT_ROWS)  # FIXME: take out after debug
    )
    # Rows claiming major version 99 come from some custom kernel; keep
    # their dates for the commentary cell but exclude them from analysis.
    kernel_df_v99 = (
        kernel_df_lazy.filter(pl.col("major_ver") == 99).collect().select("date")
    )
    kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
    (
        lp.ggplot(
            # Sort numerically BEFORE casting to String; the previous
            # cast-then-sort ordered versions lexicographically ("10" < "2").
            kernel_df_lazy.group_by("major_ver")
            .agg(pl.col("downloads").sum())
            .sort("major_ver")
            .with_columns(pl.col("major_ver").cast(pl.String))
            .collect(),
            lp.aes("major_ver", "downloads"),
        )
        + lp.geom_bar(stat="identity")
        + lp.labs(
            title="Kernel versions used",
            caption="For each daily download, add up the currently running kernel version",
        )
    )
    # Both frames are consumed by later cells, so they must be returned;
    # the previous bare `return` left them undefined downstream.
    return kernel_df_lazy, kernel_df_v99
@app.cell(hide_code=True)
def _(kernel_df_v99: pl.DataFrame):
    # Narrative cell; interpolates counts and first/last sighting dates of
    # the excluded 'major version 99' rows into the rendered markdown.
    mo.md(
        rf"""
    When looking at the kernel versions used, we see a very strong jump between major kernel version
    4 and major kernel version 5.
    For this analysis we had to exclude {kernel_df_v99.select(pl.len()).item()} rows which were
    apparently from the future, as they were running variations of major kernel version 99. In all
    likelihood there is a custom kernel version out there which reports its own major version as 99.
    The strange version starts appearing on {kernel_df_v99.select("date").row(0)[0]} and shows up
    all the way until {kernel_df_v99.select("date").row(-1)[0]}.
    """
    )
    return
@app.cell
def _(kernel_df_lazy: pl.LazyFrame):
    # Weekly download totals per major kernel version, one line per version.
    _weekly_by_major = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
        .sort("date")
        .group_by_dynamic("date", every="1w", group_by="major_ver")
        .agg(pl.col("downloads").sum())
        .collect()
    )
    (
        lp.ggplot(_weekly_by_major, lp.aes("date", "downloads", color="major_ver"))
        + lp.geom_line()
        + lp.labs(
            title="Kernels over time",
            caption="For each daily download, count used kernel versions",
        )
    )
    return
@app.cell(hide_code=True)
def _():
    # Section heading for the leftover/miscellaneous analyses.
    mo.md(
        r"""
    ## Odds and Ends

    There are some missing days in the statistics.
    """
    )
    return
@app.cell
def _(sizes_df_raw):
    # Rows whose size string could not be parsed — the days the popcorn
    # server served an empty/absent file.
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    _table = sizes_df_null.select(["date", "size"])
    _table.style.tab_header(
        title="Missing Days",
        subtitle="Days with 0B size due to missing on the popcorn server.",
    )
    return
@app.cell
def _(sizes_df):
    def _():
        # Rows where the file's modification timestamp disagrees with the
        # statistics date encoded in the filename.  The 'modified' strings
        # are parsed with '%F %T %:z' (e.g. '2021-01-31 12:00:00 +00:00');
        # strict=False turns unparseable values into nulls, which then drop
        # out of the inequality filter.
        different_modification_date = sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt"),
        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
        # This does not work well what are we showing?
        # 'true' capture date on X but then what on Y - the
        # same date for each? the difference in dt?
        # NOTE(review): geom_freqpoly with both x and y aesthetics is an
        # unfinished visualization (per the author note above) — revisit.
        return (
            lp.ggplot(
                different_modification_date,
                lp.aes("date", "modified_dt"),
            )
            + lp.geom_freqpoly()
        )

    _()
    return
# further ideas:
#
# - daily download habits:
# - are we downloading further spread of versions on specific days
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
#
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does unique install mean?
#
# - which packages had the most unique versions, and which the fewest?
# - which package had the most downloads of a single version?
# - for which package were the downloads most spread out across versions?
if __name__ == "__main__":
    # Allow running the notebook as a plain script (marimo executes the
    # cell graph in dependency order).
    app.run()