Finish package stats section

This commit is contained in:
Marty Oehme 2025-10-08 15:09:08 +02:00
parent 707632fb7d
commit 9687eb662b
Signed by: Marty
GPG key ID: 4E535BC19C61886E
2 changed files with 255 additions and 32 deletions

View file

@ -82,10 +82,31 @@ def plt_filesize(sizes_df):
+ lp.geom_point() + lp.geom_point()
+ lp.geom_smooth(method="lm") + lp.geom_smooth(method="lm")
+ lp.labs( + lp.labs(
title="Size growth", title="Report size",
subtitle="Cumulative filesize of daily popcorn statistics over time", subtitle="Filesize of popcorn statistics reports each day",
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.", caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
y="filesize in kB", y="filesize in KB",
)
)
return
@app.cell
def plt_filesize_cumulative(sizes_df: pl.DataFrame):
    # Cell output: line plot of the running total of all report file sizes.
    #
    # sizes_df must provide a "date" and a "filesize" column.
    # NOTE(review): the division by 1024 twice together with the "MB" axis
    # label presumes "filesize" is stored in bytes -- confirm against the
    # cell that builds sizes_df.

    # A cumulative sum is order-dependent, so put the rows into chronological
    # order first instead of relying on the order the caller happened to pass.
    _cumulative_df = sizes_df.sort("date").with_columns(
        (pl.col("filesize").cum_sum() / 1024 / 1024).alias("filesize_cum")
    )
    (
        lp.ggplot(
            _cumulative_df,
            lp.aes(x="date", y="filesize_cum"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Report size growth",
            subtitle="Cumulative filesize of all popcorn statistics reports up to that day",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in MB",
        )
    )
    return
@ -293,14 +314,17 @@ def plt_unique_installs():
) )
( (
lp.ggplot( lp.ggplot(
df_unique_installs, df_unique_installs.sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("unique").mean()),
lp.aes("date", "unique"), lp.aes("date", "unique"),
) )
+ lp.geom_line() + lp.geom_line()
+ lp.geom_smooth() + lp.geom_smooth(method="loess")
+ lp.labs( + lp.labs(
title="Unique daily uploads", title="Unique installations",
caption="Daily number of unique providers for package update statistics opting in to popcorn.", subtitle="Weekly statistics upload averages",
caption="Daily number of unique providers for package update statistics opting in to data collection.",
) )
) )
return return
@ -317,6 +341,7 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
[ [
lp.ggplot( lp.ggplot(
df_pkg_dl.sort("count", descending=True) df_pkg_dl.sort("count", descending=True)
.filter(pl.col("package") != "PopCorn")
.head(DISPLAY_LIMIT) .head(DISPLAY_LIMIT)
.collect(engine="streaming"), .collect(engine="streaming"),
lp.aes("package", "count"), lp.aes("package", "count"),
@ -346,6 +371,16 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
return return
@app.cell
def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
    # Cell output: table of every package that has exactly one recorded
    # installation, taken from the lazily evaluated package frame.
    _ascending = df_pkg_dl.sort("count", descending=False)
    # NOTE: restricting to a count of exactly 1 is an arbitrary cut-off; it is
    # kept because it appeared to give a better result.
    _installed_once = _ascending.filter(pl.col("count") == 1)
    _installed_once.collect(engine="streaming")
    return
@app.cell(hide_code=True) @app.cell(hide_code=True)
def plt_package_distribution(df_pkg_dl: pl.LazyFrame): def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
def _(): def _():
@ -360,6 +395,28 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
_() _()
return return
@app.cell
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
    # Cell output: a small table with the number of packages whose install
    # count falls into each of the buckets [1, 10), [10, 20), [20, 30) and
    # [30, inf).
    #
    # The buckets are built by filtering only: the number of matching rows
    # does not depend on row order, so the frame is not sorted beforehand.

    def get_num(df: pl.LazyFrame) -> int:
        # count() collapses the frame to a single row of per-column counts;
        # the first cell of that row is used as the bucket size.
        # NOTE(review): count() presumably skips nulls, so this relies on the
        # first column being fully populated -- confirm.
        return df.count().collect(engine="streaming").item(0, 0)

    one_ten_installs = df_pkg_dl.filter(
        (pl.col("count") >= 1) & (pl.col("count") < 10)
    )
    ten_twenty_installs = df_pkg_dl.filter(
        (pl.col("count") >= 10) & (pl.col("count") < 20)
    )
    twenty_thirty = df_pkg_dl.filter(
        (pl.col("count") >= 20) & (pl.col("count") < 30)
    )
    thirty_plus = df_pkg_dl.filter(pl.col("count") >= 30)

    pl.DataFrame(
        [
            get_num(one_ten_installs),
            get_num(ten_twenty_installs),
            get_num(twenty_thirty),
            get_num(thirty_plus),
        ]
    )
    return
@app.cell(hide_code=True) @app.cell(hide_code=True)

View file

@ -1,6 +1,7 @@
--- ---
title: "Voidlinux popcorn" title: "Voidlinux popcorn"
subtitle: "Analysis of voidlinux package and kernel statistics" subtitle: "Analysis of voidlinux package and kernel statistics"
toc: true
--- ---
This notebook analyses the daily package repository statistics files, This notebook analyses the daily package repository statistics files,
@ -70,9 +71,9 @@ and their respective versions.
A look at the overall file size for each of the daily statistics files over A look at the overall file size for each of the daily statistics files over
time reveals not necessarily changes in the absolute use of packages (e.g. time reveals not necessarily changes in the absolute use of packages (e.g.
`neovim` being installed more or less often). Whether it has been downloaded `rsync` being installed more or less often). Whether it has been downloaded
once or 100 times, the file size does not change drastically. Instead it once or 100 times, the file size does not change drastically. Instead it
increases much more drastically when both `neovim` and `emacs` are installed, increases much more drastically when both `rsync` and `rclone` are installed,
or a variety of different versions for one of the packages are installed. or a variety of different versions for one of the packages are installed.
Similarly for different versions of the kernel. Similarly for different versions of the kernel.
@ -92,19 +93,21 @@ from notebooks.popcorn import plt_filesize
pplot(plt_filesize) pplot(plt_filesize)
``` ```
As we can see, the difference over time is massive. Especially early on, between 2019 and the As we can see, the difference over time is massive. Especially early on,
start of 2021, the amount of different packages and package versions used grew rapidly, with the between 2019 and the start of 2021, the amount of different packages and
pace also picking up once again starting 2023. package versions used grew rapidly, with the pace also picking up once again
starting 2023.
From a reported filesize of around 50kB in the first days before the end of From a reported filesize of around 50KB in the first days before the end of
2019 we have easily tripled the filesize to over 150kB needed for the report 2019 we have easily tripled the filesize to over 150KB needed for the report
per day. Nowadays we have reached just about 400kB daily report size, over 8 per day. Nowadays we have reached just about 400KB daily report size, over 8
times the size beginning 2018. times the size beginning 2018.
There are a few outlier days with a size of 0 kB on the server, which we had to There are a few outlier days with a size of 0 KB on the server, which we had to
remove from the data. In all likelihood, those days were not reported correctly remove from the data. In all likelihood, those days were not reported correctly
or there was some kind of issue on the backend so the stats for those days are or there was some kind of issue on the backend so the stats for those days are
lost. lost.
<!-- TODO: is this still true? --> <!-- TODO: is this still true? -->
We take a look at the missing days We take a look at the missing days
among other things at the end of this article. among other things at the end of this article.
@ -123,45 +126,172 @@ up again, if at a more mellow pace.
## Package statistics ## Package statistics
Now that we have an idea of how the overall interest in the distribution has changed over time, Now that we have an idea of how the overall reported sizes in the distribution
let's look at the actual package statistics. have changed over time, let's focus on the actual package statistics.
The popcorn files contain two main pieces of information: the number of installs per package The popcorn files contain two main pieces of information: the number of
(e.g. how many people have rsync installed) and the number of unique installs (i.e. unique installs per package (e.g. how many people have `rsync` installed) and the
machines providing statistics). We will look at both of these in turn. number of unique installs (i.e. how many people provide their statistics). We
will look at both of these in turn.
```{python} ```{python}
from notebooks.popcorn import plt_weekly_packages from notebooks.popcorn import plt_weekly_packages
pplot(plt_weekly_packages) pplot(plt_weekly_packages)
``` ```
The number of packages overall strongly rises until early 2021,
when it stagnates a little before rising more slowly again afterwards.
The pattern strongly mirrors the curve we saw before for the daily filesize.
Turning to the daily unique uploads, we can see a similar pattern, though even
more strongly pronounced.
```{python}
from notebooks.popcorn import plt_unique_installs
pplot(plt_unique_installs)
```
Unique installations rise sharply until early 2020. Then they do not just
stagnate but shrink for the next three years. It is only in early 2023 that the
numbers recover and slowly begin rising again.
We also have one day on 05 July 2024 which has significantly fewer unique
uploads (36 only) than all the other days around it. I have no clue if
something happened to data collection or everybody collectively decided to
leave their PC offline just for that day, but the numbers are back to normal
the day after.[^independence-day]
[^independence-day]: I suppose one interpretation would be people taking their
4th of July celebrations very seriously, and thus not being present in the
statistics for the day after. However, I am not sure if this would reflect so
strongly in data collection, and it additionally presupposes that the collected
data predominantly stems from the United States. Lastly, one would expect a
similar effect every year if that were the case.
This curve also goes some way to explaining the dip in overall package
installations previously. When there are fewer people uploading their daily
statistics the absolute number of package installations will be somewhat
reduced as a result, unless for some reason the remaining people all of a
sudden start having many more packages installed.
Let's check that out next, by actually looking at the installed packages _per
user_ for each day.
```{python} ```{python}
from notebooks.popcorn import plt_pkg_relative from notebooks.popcorn import plt_pkg_relative
pplot(plt_pkg_relative) pplot(plt_pkg_relative)
``` ```
The amount of packages installed on all machines increases strongly over time. Combining both stats to look at the installed packages at a more individual
level per user, we see this confirmed. There is no similarly strong dip for the
relative package ownership as there was for the absolute package numbers.
```{python} Indeed, with the exception of a small more rapid increase in individual package
from notebooks.popcorn import plt_weekday_packages ownership in 2019, we see a much more stable increase in per-user packages than
pplot(plt_weekday_packages) the absolute numbers and no similarly big slump over three years.
```
```{python} Instead we see different patterns of rises and dips. Both in the beginning of
from notebooks.popcorn import plt_month_packages 2020, and the beginning of 2024, we can see first a strong rise and then an
pplot(plt_month_packages) equally strong fall in the average number of user-owned packages at once.
``` This could point to one of multiple options:
Perhaps users have been collectively trying out more new packages over the end
of year holidays or with the start of the new year. New year, new workflow
could presumably be something a few users decide to do, and this may be a
reflection of it. Or, equally, using the new year to learn new software. The
subsequent dip would then mean an end to the period of trying out new stuff, or
adopting the new packages and dropping the old ones.
At the same time, with the relatively limited absolute number of installations,
it is also quite likely that the representation is skewed by a single user or a
couple of users having a much larger package ownership than everybody else. This
may signify new users checking out Void Linux and downloading a large variety
of packages in the process.
<!-- TODO: still accurate? -->
For a breakdown of the absolute numbers of packages on systems by weekday and
month of the year instead of over time, see the Appendix below.
Beyond pure installation numbers, let's take a look at the actual top-installed
packages on users' systems.
<!-- TODO: perhaps the pre-made ISOs play a role, especially Feb2024? no hang on feb 2025 -->
```{python} ```{python}
from notebooks.popcorn import plt_top_packages from notebooks.popcorn import plt_top_packages
pplot(plt_top_packages) pplot(plt_top_packages)
``` ```
The top packages are unsurprisingly
the `base-system` and `xtools` packages, followed by `wget`, `htop` and
`rsync`.[^popcorn-removal]
[^popcorn-removal]: I have removed the PopCorn package itself from the data.
Funnily enough, since _everybody_ who is represented in the data has to have
PopCorn installed or the data wouldn't be collected in the first place, if we
extrapolate from the collected data naively this means more people have PopCorn
installed than the base-system. Of course, viewed over the majority of Void
Linux installations this is hogwash. We have the absolute numbers and only
around 150 people ever have PopCorn installed. But it nicely represents some of
the danger of over-interpreting the results before us without also reflecting
on sample bias.
In my opinion the list of top packages reflects the technical audience of Void
Linux and does not hold too many surprises. Almost everyone uses `socklog` and
most people have the `nonfree` repo enabled. `firefox` is the most installed
browser, and everyone at least has `alsa-utils` installed, even if they're not
using `alsa` as their primary sound provider.
I am somewhat surprised by the prevalence of `git`, though this package is in
turn required by many others. Among them some of the other top packages such as
`xtools`, so it does make sense.
It, along with the prevalence of `tmux` (even above `zip`!) does once again
speak to the technical nature of Void Linux users, however, at least for those
opting into data collection.
Almost everyone keeps the `base-system` package installed but, importantly,
not _everyone_. The package is not represented for each installation, with a
sizable chunk of people having removed it.
Lastly, I am also pleasantly surprised by the appearance of `gimp` in the top
20 packages.
The 'rarest' 20 packages show a snapshot of packages which have been installed
by _someone_ but only that single someone. In other words, there are quite a
few packages which nobody in the sample has installed, but those are not
represented here. Instead, the rare packages tend to show those that somebody
built themselves, or only tried briefly. They provide more of a snapshot of the
kind of custom shenanigans users get up to within the `xbps` package system, or
could be viewed as a potential 'wishlist' of packages not yet officially
available.
Let's turn to the 'distribution' of package installations.
```{python} ```{python}
from notebooks.popcorn import plt_package_distribution from notebooks.popcorn import plt_package_distribution
pplot(plt_package_distribution) pplot(plt_package_distribution)
``` ```
Visualized above is the package installation frequency (or density) distribution.
On the Y-axis we see the amount of packages while on the X-axis we see the amount of installations.
What this means is that we see _how often_ packages tend to be installed,
and where the majority of packages is grouped.[^density-approximation]
[^density-approximation]: In the package density count above, since we are
accumulating over the absolute numbers of all installations of all users, the
highest numbers are really _high_, i.e. above 150,000. Since we are
sorting the package counts into a finite number of bins to make visualizing it
possible, the lowest bin overshoots the 0-mark and we get an estimation of
negative installation counts. Of course, this is not possible: no package in the
data has been installed a negative number of times --- to my knowledge!
_Many_ packages are installed 0 to 10 times.
Some packages are installed above 10 times,
fewer yet above 100 times,
and so on,
and this distribution is what we see here.
```{python} ```{python}
from notebooks.popcorn import plt_top_packages from notebooks.popcorn import plt_top_packages
_, defs = plt_top_packages.run() _, defs = plt_top_packages.run()
@ -181,6 +311,7 @@ twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30)) thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30))
``` ```
To be more precise with the numbers:
There are `python f"{get_num(one_ten_installs):,}"` packages which have between one There are `python f"{get_num(one_ten_installs):,}"` packages which have between one
and ten installations in the data, `python f"{get_num(ten_twenty_installs):,}"` and ten installations in the data, `python f"{get_num(ten_twenty_installs):,}"`
packages between eleven and 20 installations, and packages between eleven and 20 installations, and
@ -236,11 +367,17 @@ switch between major version 5 and 6. The last time a major version 4 is seen is
It would seem, then, that the people still running kernel version 4 used the opportunity of It would seem, then, that the people still running kernel version 4 used the opportunity of
everybody switching to the stable version of 6 to also upgrade their machines. everybody switching to the stable version of 6 to also upgrade their machines.
## Odds and Ends ## Appendix: Odds and Ends
The above graphics are the main ones that I think could be useful, entertaining, or somewhere in between.
However, when exploring data, many more visualizations come to light.
Most of them are a little more 'boring' than the ones selected above,
but may still be of interest for technical deep-dives or more specific investigations.
They are collected here, in my pseudo-appendix to the main article.
### The PopCorn files ### The PopCorn files
Let's have a look at the provided PopCorn statistics files themselves. Let's have a closer look at the provided PopCorn statistics files themselves.
The files consist of a long list of packages which have been reported to the The files consist of a long list of packages which have been reported to the
central server that day, along with the number of package instances. The amount central server that day, along with the number of package instances. The amount
@ -286,6 +423,8 @@ kernel name including major version, minor version and any suffix.
} }
``` ```
When grouped by package and aggregated over all days, this results in a
table. For example, the following is the table for the package count list:
```{python} ```{python}
from notebooks.popcorn import tab_pkg from notebooks.popcorn import tab_pkg
@ -293,6 +432,21 @@ outp, defs = tab_pkg.run()
outp outp
``` ```
When we looked at the file sizes of the PopCorn report files above, we did so
for each day individually. But we can also look at the cumulative growth
instead: here we just add up all the files reported so far for each day, and
show the resulting growth line.
```{python}
from notebooks.popcorn import plt_filesize_cumulative
outp, defs = plt_filesize_cumulative.run()
outp
```
A cumulative view gives a less granular look at the individual daily changes but
provides a more macro-level view on how big the statistics have grown to be overall.
We can see that, as each individually reported day adds up to 400KB nowadays, the
cumulative size is up to almost 700MB currently.
There are some missing days in the statistics. There are some missing days in the statistics.
@ -303,6 +457,18 @@ outp, defs = tab_missing_days.run()
outp outp
``` ```
### Packages monthwise and per weekday
```{python}
from notebooks.popcorn import plt_weekday_packages
pplot(plt_weekday_packages)
```
```{python}
from notebooks.popcorn import plt_month_packages
pplot(plt_month_packages)
```
## Outline ## Outline
- intro - intro