import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import json
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl


@app.cell(hide_code=True)
def _():
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis""")
    return


@app.cell
def _():
    def parse_size(size_str):
        # Pull the numeric kilobyte value out of a size string like "5.6 kB";
        # return None when the value cannot be parsed.
        try:
            return float(re.search(r"(\d+.?\d+) kB", size_str).group(1))  # pyright: ignore[reportOptionalMemberAccess]
        except AttributeError:
            return None

    sizes_df_raw = (
        pl.read_csv("data/file_sizes.csv")
        .with_columns(
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date"),
            pl.col("size")
            .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        ## Daily statistics file size

        The simplest operation we can do is look at the overall file size of
        each daily statistics file over time. Each file consists of a long
        list of packages that have been downloaded from the repositories that
        day, along with the number of downloads. It also contains the same
        list broken down by the specific versions that were downloaded, so if
        somebody downloads v0.9.1 and somebody else downloads v0.9.3, the two
        downloads are counted separately. (A sketch of the assumed file
        structure follows below.)

        Another count is the number of different kernels that have been used
        to download from the repositories (or perhaps that have been
        downloaded from them?).

        These are the major things that drive size increases in the file, and
        notably it is not simply a higher number of downloads --- we will get
        to those shortly.

        Instead, an increase in file size here mainly suggests an increase in
        the 'breadth' of files on offer in the repository, whether that be a
        wider variety of program versions or a larger set of packages that
        people are interested in.

        So while the overall number of downloads gives a general estimate of
        the interest in the distribution, this shows a more
        'distributor'-aligned view of how many different aisles of the buffet
        people are eating from.
        """
    )
    return
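

@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        As a rough orientation, a single daily statistics file is assumed to
        look roughly like the sketch below. Only the `Packages` mapping is
        confirmed by the code further down (it gets unnested into a
        package/downloads table); the `Versions` and `Kernels` key names and
        all example values are illustrative assumptions, not taken from the
        real data.

        ```python
        {
            # package name -> total downloads that day (confirmed: "Packages")
            "Packages": {"bash": 120, "firefox": 45},
            # package -> version -> downloads (key name assumed)
            "Versions": {"firefox": {"128.0_1": 30, "128.0_2": 15}},
            # kernel -> count (key name assumed)
            "Kernels": {"6.6.32_1": 40},
        }
        ```
        """
    )
    return

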
@app.cell
def _(sizes_df):
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw JSON file size, with no reformatting and no removal of markers, characters, or newlines.",
        )
    )
    return


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        As we can see, the difference over time is massive. Especially early
        on, between 2019 and the start of 2021, the variety of things being
        downloaded grew rapidly, and the pace picked up again from 2023
        onwards.
        """
    )
    return


@app.cell
def _():
    df = (
        pl.scan_ndjson("data/daily/*", include_file_paths="file")
        .head(200)  # FIXME: take out after debug
        .with_columns(
            pl.col("file")
            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date")
        )
        .select("date", pl.col("Packages").struct.unnest())
        .fill_null(0)
        .unpivot(index="date", variable_name="package", value_name="downloads")
        .collect()
    )
    df
    return (df,)


@app.cell
def _(df: pl.DataFrame):
    (
        lp.ggplot(
            df.group_by("date").agg(pl.col("downloads").sum()).sort("date"),
            lp.aes("date", "downloads"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Daily downloads",
        )
    )
    return


@app.cell
def _():
    mo.md(
        r"""
        ## Odds and Ends

        There are some missing days in the statistics.
        """
    )
    return


@app.cell
def _(sizes_df_raw):
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    sizes_df_null.select(["date", "size"]).style.tab_header(
        title="Missing Days",
        subtitle="Days with a size of 0 B because the file is missing on the popcorn server.",
    )
    return


@app.cell
def _(sizes_df):
    def _():
        different_modification_date = sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt"),
        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
        # This does not work well. What are we showing?
        # The 'true' capture date on X, but then what on Y - the
        # same date for each? The difference in dt? (A sketch of the
        # dt-difference idea follows below.)
        return (
            lp.ggplot(
                different_modification_date,
                lp.aes("date", "modified_dt"),
            )
            + lp.geom_freqpoly()
        )

    _()
    return
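

@app.cell
def _(sizes_df):
    # A minimal, hedged sketch of the "difference in dt" idea from the
    # comment above: plot how many days the file's modification timestamp
    # lags behind the date encoded in its name. The column names reuse the
    # ones defined earlier in this notebook; the lag computation itself is
    # an assumption about what might be worth showing, not something the
    # original analysis settled on.
    _lagged = (
        sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt")
        )
        .with_columns(
            (pl.col("modified_dt").dt.date() - pl.col("date"))
            .dt.total_days()
            .alias("lag_days")
        )
        .filter(pl.col("lag_days") != 0)
    )
    (
        lp.ggplot(_lagged, lp.aes("date", "lag_days"))
        + lp.geom_point()
        + lp.labs(
            title="Modification lag",
            subtitle="Days between a file's date and its modification timestamp",
        )
    )
    return

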
# further ideas:
# - which kernels have been downloaded when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does a unique install mean?
#
# - which packages had the most unique versions, and which the fewest?
# - which package had the most downloads of a single version?
# - for which package were the version downloads the most spread out?

if __name__ == "__main__":
    app.run()