Remove redundant cleaning files
This commit is contained in:
parent
0618814c49
commit
66b0464809
3 changed files with 0 additions and 169 deletions
562
notebooks/popcorn.py
Normal file
562
notebooks/popcorn.py
Normal file
|
|
@ -0,0 +1,562 @@
|
|||
import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl

    # Row cap applied to the daily package frame while debugging; a falsy
    # value disables the cap.
    LIMIT_ROWS = 50_000
    # Relative locations of the raw, cleaned and parquet data sets.
    DATA_RAW_DIR = "data/raw"
    DATA_CLEAN_DIR = "data/cleaned"
    DATA_PARQUET_DIR = "data/parquet"
|
||||
|
||||
|
||||
# Notebook title and a short introduction to the 'popcorn' statistics files.
@app.cell(hide_code=True)
def _():
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis

    This notebook analyses the daily package repository statistics files,
    colloquially known as 'popcorn' files, that are generated by the Void Linux
    package manager `xbps` and uploaded by users who have opted in to share.
    """)
    return
|
||||
|
||||
|
||||
# run data prep
|
||||
@app.cell
def _():
    import clean

    # All three conversion steps read the same raw daily directory and write
    # below the cleaned data directory; force=False is passed to each of them.
    _raw_daily = Path(DATA_RAW_DIR) / "daily"
    _cleaned = Path(DATA_CLEAN_DIR)

    clean.json_to_daily_pkg(_raw_daily, _cleaned / "daily", force=False)
    clean.json_to_unique_csv(_raw_daily, _cleaned, force=False)
    clean.json_to_daily_kernel_csv(_raw_daily, _cleaned / "kernels", force=False)
|
||||
|
||||
|
||||
@app.cell
def _():
    def parse_size(size_str):
        """Return the number preceding " kB" in ``size_str`` as a float.

        Returns None when ``size_str`` contains no "<number> kB" part.  The
        pattern needs at least two digits in total (e.g. "12 kB" or
        "1.5 kB"), so a bare single-digit value such as "0 kB" does not match
        and also yields None; rows without a parsed size are filtered out of
        ``sizes_df`` below.
        """
        # The dot is escaped so that only a real decimal point may separate
        # the digits (an unescaped "." would accept any character there).
        match = re.search(r"(\d+\.?\d+) kB", size_str)
        return float(match.group(1)) if match else None

    sizes_df_raw = (
        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
        .with_columns(
            # Turn "data/YYYY-MM-DD.json" file names into proper dates.
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date"),
            # Numeric size next to the original human-readable size string.
            pl.col("size")
            .map_elements(parse_size, return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    # Only rows whose size could be parsed take part in the size analysis.
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw
|
||||
|
||||
|
||||
# Explanatory text for the daily statistics file size section.
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Daily statistics file size

    The simplest operation we can do is look at the overall file size for each of the daily
    statistics files over time. The files consist of a long list of packages which have been checked
    from the repositories that day, along with the number of package instances. It also consists of
    the same list separated by specifically installed versions of packages, so if somebody has
    v0.9.1 and somebody else v0.9.3 instead this would count both packages separately.

    Another count is the number of different Kernels that have been used on that day, with their
    exact kernel name including major version, minor version and any suffix.

    These are the major things that will lead to size increases in the file, but not just for an
    increased amount of absolute users, packages or uploads --- we will get to those shortly.

    No, an increase in file size here mainly suggests an increase in the 'breadth' of files on offer
    in the repository, whether that be a wider variety of program versions or more different
    packages that people are interested in, and those that the community chooses to use.

    So while the overall amount of packages gives a general estimate of the interest in the
    distribution, this can show a more 'distributor'-aligned view on how many different aisles of
    the buffet people are eating from.

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(sizes_df):
    # Scatter plot of the daily file size with a linear trend line.  The
    # y axis uses the parsed numeric column ("size_num"); the "size" column
    # holds the original size strings and cannot be plotted on a numeric axis.
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="size (kB)",
        )
    )
    return
|
||||
|
||||
|
||||
# Interpretation of the file size plot and notes on outlier days.
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""

    As we can see, the difference over time is massive. Especially early on, between 2019 and the
    start of 2021, the amount of different packages and package versions used grew rapidly, with the
    pace picking up once again starting 2023.

    There are a few outlier days with a size of 0 kB, which we will remove from the data. In all
    likelihood, those days were not reported correctly or there was some kind of issue on the
    backend so the stats for those days are lost.

    There are also a few days where the modification date of the file does not correspond to the
    represented statistical date but those are kept. This rather points to certain times when the
    files have been moved on the backend, or recreated externally but does not mean the data are
    bad.

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Lazily scan every cleaned daily package CSV into one frame with a
    # fixed schema; nulls are replaced by 0.
    df_pkg_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/daily/*.csv",
            include_file_paths="file",
            schema={
                "date": pl.Date,
                "package": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .drop("file")
        .fill_null(0)
    )
    if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
        df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
    # give small df preview
    df_pkg_lazy.head(100).collect(engine="streaming")
    # Later cells take df_pkg_lazy as a parameter, so it is returned here in
    # the same way the sizes cell returns its frames.
    return (df_pkg_lazy,)
|
||||
|
||||
|
||||
# Introduction to the package statistics section.
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Package statistics

    Now that we have an idea of how the overall interest in the distribution has changed over time,
    let's look at the actual package statistics.

    The popcorn files contain two main pieces of information: the number of installs per package
    (e.g. how many people have rsync installed) and the number of unique installs (i.e. unique
    machines providing statistics). We will look at both of these in turn.

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # Sum the downloads inside one-week windows along the date column.
        per_week = (
            df_pkg_lazy.sort("date")
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("downloads").sum())
            .sort("date")
            .collect(engine="streaming")
        )
        labels = lp.labs(
            title="Weekly package ownership",
            caption="Count of all installed packages aggregated for each week",
        )
        # Line chart of the weekly sums with a loess trend on top.
        chart = lp.ggplot(per_week, lp.aes("date", "downloads"))
        return chart + lp.geom_line() + lp.geom_smooth(method="loess") + labels

    _()
    return
|
||||
|
||||
|
||||
# One-line interpretation of the weekly package plot.
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""

    The amount of packages installed on all machines increases strongly over time.

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # ISO weekday numbers as produced by dt.weekday() mapped to labels.
        weekday_names = {
            1: "Mon",
            2: "Tue",
            3: "Wed",
            4: "Thu",
            5: "Fri",
            6: "Sat",
            7: "Sun",
        }
        # Aggregate per weekday first and order the aggregated rows by the
        # weekday number.  Sorting the weekday expression on its own (as an
        # independent column inside with_columns) would reorder that column
        # without the others and detach each weekday from its download count.
        weekday_downloads = (
            df_pkg_lazy.with_columns(pl.col("date").dt.weekday().alias("weekday_num"))
            .group_by("weekday_num")
            .agg(pl.col("downloads").sum())
            .sort("weekday_num")
            .with_columns(
                pl.col("weekday_num")
                .replace_strict(weekday_names, return_dtype=pl.String)
                .alias("weekday")
            )
        )
        # The data is already aggregated, so the bars show the sums as-is.
        return (
            lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Weekday downloads",
                caption="Downloads aggregated per day of the week they took place.",
            )
        )

    _()
    return
|
||||
|
||||
|
||||
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        # Sum the downloads per calendar month (1-12) before plotting, so the
        # bars show what the caption describes, in the same explicit
        # aggregate-then-plot style as the other bar charts in this notebook.
        month_agg_downloads = (
            df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
            .group_by("month")
            .agg(pl.col("downloads").sum())
            .sort("month")
        )
        return (
            lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
            + lp.geom_bar(stat="identity")
            + lp.labs(
                title="Monthwise downloads",
                caption="Downloads aggregated per month of the year.",
            )
        )

    _()
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Unique installs per day, read eagerly with a fixed schema.
    _unique_installs = pl.read_csv(
        f"{DATA_CLEAN_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
    )
    _unique_labels = lp.labs(
        title="Unique daily uploads",
        caption="Daily number of unique providers for package update statistics opting in to popcorn.",
    )
    # Line chart with a smoothed trend on top.
    (
        lp.ggplot(_unique_installs, lp.aes("date", "unique"))
        + lp.geom_line()
        + lp.geom_smooth()
        + _unique_labels
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    # Total downloads per package over the whole data set.
    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())

    def _():
        # Number of packages shown in each of the two charts.
        DISPLAY_LIMIT = 20

        return lp.gggrid(
            [
                # Packages with the highest download totals.
                lp.ggplot(
                    df_pkg_dl.sort("downloads", descending=True)
                    .head(DISPLAY_LIMIT)
                    .collect(engine="streaming"),
                    lp.aes("package", "downloads"),
                )
                + lp.geom_bar(stat="identity")
                + lp.labs(
                    title="Top packages",
                    caption="Most installed packages over all time",
                ),
                # Packages with the lowest download totals.
                lp.ggplot(
                    df_pkg_dl.sort("downloads", descending=False)
                    # this seems arbitrary but gives a better result?
                    .head(DISPLAY_LIMIT)
                    .collect(engine="streaming"),
                    lp.aes("package", "downloads"),
                )
                + lp.geom_bar(stat="identity")
                + lp.labs(
                    title="Rare packages",
                    caption="Least often installed packages",
                ),
            ],
            ncol=1,
        )

    _()
    # Later cells take df_pkg_dl as a parameter, so it is returned here in
    # the same way the sizes cell returns its frames.
    return (df_pkg_dl,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
    def _():
        # Frequency polygon over the per-package download totals.
        totals = df_pkg_dl.collect(engine="streaming")
        chart = lp.ggplot(totals, lp.aes("downloads"))
        heading = lp.labs(
            title="Package installation count distribution",
        )
        return chart + lp.geom_freqpoly(stat="bin") + heading

    _()
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
    def _():
        def get_num(df: pl.LazyFrame) -> int:
            """Return the number of rows (packages) in ``df``."""
            return df.select(pl.len()).collect(engine="streaming").item()

        downloads = pl.col("downloads")
        # Buckets of packages by total download count.  No sorting is needed
        # because only the row count of each bucket is used.
        one_install = df_pkg_dl.filter(downloads == 1)
        two_to_nine_installs = df_pkg_dl.filter((downloads >= 2) & (downloads < 10))
        ten_to_nineteen_installs = df_pkg_dl.filter((downloads >= 10) & (downloads < 20))
        # The text states the same ranges as the filters above.
        return mo.md(rf"""

        There are {get_num(one_install)} packages which have exactly a single
        installation in the data, {get_num(two_to_nine_installs)} packages with
        between 2 and 9 installations, and {get_num(ten_to_nineteen_installs)} packages with
        between 10 and 19.

        """)

    _()
    return
|
||||
|
||||
|
||||
# Section heading for the kernel analysis.
@app.cell(hide_code=True)
def _():
    mo.md(r""" ## Kernel Analysis """)
    return
|
||||
|
||||
|
||||
# - which kernels have been DL when? (simplified for semver)
|
||||
@app.cell
def _():
    # Lazily scan the cleaned daily kernel CSVs and derive version columns.
    kernel_df_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/kernels/*.csv",
            schema={
                "date": pl.Date,
                "kernel": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .fill_null(0)
        # Cut everything after the leading "X.Y.Z" off the kernel name.
        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
        .with_columns(
            # "X" as an integer and "X.Y" as a string.
            pl.col("kernel")
            .str.replace(r"(\d+).*", "${1}")
            .str.to_integer(dtype=pl.UInt8)
            .alias("major_ver"),
            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
        )
    )

    # Keep the dates of the rows reporting major version 99 for the text
    # below, then exclude those rows from the analysis frame.  Only the date
    # column is needed, so it is selected before collecting.
    kernel_df_v99 = (
        kernel_df_lazy.filter(pl.col("major_ver") == 99).select("date").collect(engine="streaming")
    )
    kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)

    # Bar chart of the summed downloads per major kernel version.
    (
        lp.ggplot(
            kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
            .group_by("major_ver")
            .agg(pl.col("downloads").sum())
            .sort("major_ver")
            .collect(engine="streaming"),
            lp.aes("major_ver", "downloads"),
        )
        + lp.geom_bar(stat="identity")
        + lp.labs(
            title="Kernel versions used",
            caption="For each daily download, add up the currently running kernel version",
        )
    )
    # Both frames are parameters of later cells, so they are returned here in
    # the same way the sizes cell returns its frames.
    return kernel_df_lazy, kernel_df_v99
|
||||
|
||||
|
||||
# Text on the excluded major version 99 rows.  The first and last sighting
# are taken as the minimum and maximum date, which does not depend on the row
# order of the frame and does not raise when the frame is empty.
@app.cell(hide_code=True)
def _(kernel_df_v99: pl.DataFrame):
    mo.md(
        rf"""

    When looking at the kernel versions used, we see a very strong jump between major kernel version
    4 and major kernel version 5.

    For this analysis we had to exclude {kernel_df_v99.height} rows which were
    apparently from the future, as they were running variations of major kernel version 99. In all
    likelihood there is a custom kernel version out there which reports its own major version as 99.
    The strange version starts appearing on {kernel_df_v99["date"].min()} and shows up
    all the way until {kernel_df_v99["date"].max()}.

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(kernel_df_lazy: pl.LazyFrame):
    # Weekly download sums per major kernel version; the version is cast to
    # a string so it can be used as a discrete colour in the plot.
    weekly_kernel_df = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
        .sort("date")
        .group_by_dynamic("date", every="1w", group_by="major_ver")
        .agg(pl.col("downloads").sum())
        .collect(engine="streaming")
    )

    # One line per major version over time.
    (
        lp.ggplot(
            weekly_kernel_df,
            lp.aes("date", "downloads", color="major_ver"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Kernels over time",
            caption="For each daily download, count used kernel versions",
        )
    )
    # The next cell takes weekly_kernel_df as a parameter, so it is returned
    # here in the same way the sizes cell returns its frames.
    return (weekly_kernel_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(weekly_kernel_df: pl.DataFrame):
    from datetime import date

    # Rows of each major version, filtered once and reused below.
    _kernel4_rows = weekly_kernel_df.filter(pl.col("major_ver") == "4")
    _kernel5_rows = weekly_kernel_df.filter(pl.col("major_ver") == "5")

    # First / last row of each selection supplies the dates used in the text.
    last_kernel4: date = _kernel4_rows[-1]["date"].item()
    first_kernel5: date = _kernel5_rows[0]["date"].item()
    last_kernel5: date = _kernel5_rows[-1]["date"].item()
    mo.md(
        rf"""

    A timeline analysis of the kernels used to report daily downloads shows that people generally
    adopt new major kernel versions at roughly the same time. This change is especially stark between
    major kernel versions 5 and 6, which seem to have traded places in usage almost overnight.

    The first time that major version 5 of the kernel shows up is on {first_kernel5}. From here, it
    took a long time for the last of the version 4 kernels to disappear, coinciding with the big
    switch between major version 5 and 6. The last time a major version 4 is seen is on
    {last_kernel4}, while the last major version 5 kernels still pop up.
    It would seem, then, that the people still running kernel version 4 used the opportunity of
    everybody switching to the stable version of 6 to also upgrade their machines.

    """
    )
    return
|
||||
|
||||
|
||||
# Section heading and lead-in for the remaining observations.
@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
    ## Odds and Ends
    There are some missing days in the statistics.
    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(sizes_df_raw):
    # Rows whose size could not be parsed into a number.
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    # Styled table of those days with a descriptive header.
    _missing_days = sizes_df_null.select(["date", "size"])
    _missing_days.style.tab_header(
        title="Missing Days",
        subtitle="Days with 0B size due to missing on the popcorn server.",
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(sizes_df):
    def _():
        # Parse the modification timestamp (non-strict parsing), then keep
        # only rows whose modification day differs from the statistics date.
        with_modified_dt = sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt"),
        )
        different_modification_date = with_modified_dt.filter(
            pl.col("date") != pl.col("modified_dt").dt.date()
        )
        # This does not work well what are we showing?
        # 'true' capture date on X but then what on Y - the
        # same date for each? the difference in dt?
        chart = lp.ggplot(
            different_modification_date,
            lp.aes("date", "modified_dt"),
        )
        return chart + lp.geom_freqpoly()

    _()
    return
|
||||
|
||||
|
||||
# further ideas:
|
||||
#
|
||||
# - daily download habits:
|
||||
# - are we downloading further spread of versions on specific days
|
||||
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
|
||||
#
|
||||
# - when did specific kernels enter the repos?
|
||||
#
|
||||
# - which arches are/were most prevalent over time?
|
||||
# - have the arches been mostly even relative to each other?
|
||||
#
|
||||
# - what does unique install mean?
|
||||
#
|
||||
# - which Packages had the most unique versions, least versions
|
||||
# - which pkg had the most download of a single version?
|
||||
# - for which pkg were the version dls the most spread out?
|
||||
|
||||
# Run the notebook app when this file is executed directly.
if __name__ == "__main__":
    app.run()
|
||||
Loading…
Add table
Add a link
Reference in a new issue