Finish package stats section
This commit is contained in:
parent
707632fb7d
commit
9687eb662b
2 changed files with 255 additions and 32 deletions
|
|
@ -82,10 +82,31 @@ def plt_filesize(sizes_df):
|
|||
+ lp.geom_point()
|
||||
+ lp.geom_smooth(method="lm")
|
||||
+ lp.labs(
|
||||
title="Size growth",
|
||||
subtitle="Cumulative filesize of daily popcorn statistics over time",
|
||||
title="Report size",
|
||||
subtitle="Filesize of popcorn statistics reports each day",
|
||||
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
|
||||
y="filesize in kB",
|
||||
y="filesize in KB",
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def plt_filesize_cumulative(sizes_df: pl.DataFrame):
    # Line plot of the running total of the `filesize` column over `date`.
    #
    # The running total is divided by 1024 twice before plotting; the axis
    # label says MB, which presumes `filesize` is stored in bytes -- TODO
    # confirm the unit against the cell that builds `sizes_df`.
    #
    # cum_sum() accumulates in row order, so the frame is sorted by date
    # first; otherwise an unordered input would produce a running total that
    # does not correspond to "everything reported up to that day".
    _cumulative_df = sizes_df.sort("date").with_columns(
        (pl.col("filesize").cum_sum() / 1024 / 1024).alias("filesize_cum")
    )
    (
        lp.ggplot(
            _cumulative_df,
            lp.aes(x="date", y="filesize_cum"),
        )
        + lp.geom_line()
        # + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Report size growth",
            subtitle="Cumulative filesize of all popcorn statistics reports up to that day",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in MB",
        )
    )
    return
|
||||
|
|
@ -293,14 +314,17 @@ def plt_unique_installs():
|
|||
)
|
||||
(
|
||||
lp.ggplot(
|
||||
df_unique_installs,
|
||||
df_unique_installs.sort("date")
|
||||
.group_by_dynamic("date", every="1w")
|
||||
.agg(pl.col("unique").mean()),
|
||||
lp.aes("date", "unique"),
|
||||
)
|
||||
+ lp.geom_line()
|
||||
+ lp.geom_smooth()
|
||||
+ lp.geom_smooth(method="loess")
|
||||
+ lp.labs(
|
||||
title="Unique daily uploads",
|
||||
caption="Daily number of unique providers for package update statistics opting in to popcorn.",
|
||||
title="Unique installations",
|
||||
subtitle="Weekly statistics upload averages",
|
||||
caption="Daily number of unique providers for package update statistics opting in to data collection.",
|
||||
)
|
||||
)
|
||||
return
|
||||
|
|
@ -317,6 +341,7 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
|
|||
[
|
||||
lp.ggplot(
|
||||
df_pkg_dl.sort("count", descending=True)
|
||||
.filter(pl.col("package") != "PopCorn")
|
||||
.head(DISPLAY_LIMIT)
|
||||
.collect(engine="streaming"),
|
||||
lp.aes("package", "count"),
|
||||
|
|
@ -346,6 +371,16 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
|
|||
return
|
||||
|
||||
|
||||
@app.cell
def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
    # Table of the packages whose `count` is exactly 1.
    #
    # The cut-off is a fixed equality test rather than a "bottom N" slice:
    # this seems arbitrary but gives a better result?
    _installed_once = pl.col("count") == 1
    (
        df_pkg_dl.sort("count", descending=False)
        .filter(_installed_once)
        .collect(engine="streaming")
    )
    return
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
||||
def _():
|
||||
|
|
@ -360,6 +395,28 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
|||
_()
|
||||
return
|
||||
|
||||
@app.cell
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
    # Count how many rows of `df_pkg_dl` fall into each `count` bucket and
    # show the four totals as a single-column table, in bucket order.
    #
    # Buckets are half-open, i.e. the upper bound is exclusive:
    #   one_ten_installs     1 <= count < 10
    #   ten_twenty_installs 10 <= count < 20
    #   twenty_thirty       20 <= count < 30
    #   thirty_plus         30 <= count
    def get_num(df: pl.LazyFrame) -> int:
        # Row count of `df`. pl.len() counts rows directly, whereas
        # LazyFrame.count() reports non-null values per column and would
        # under-count if the first column ever contained nulls.
        return df.select(pl.len()).collect(engine="streaming").item()

    # One shared lazy plan instead of repeating the same sort four times.
    _by_count = df_pkg_dl.sort("count", descending=False)

    one_ten_installs = _by_count.filter(
        (pl.col("count") >= 1) & (pl.col("count") < 10)
    )
    ten_twenty_installs = _by_count.filter(
        (pl.col("count") >= 10) & (pl.col("count") < 20)
    )
    twenty_thirty = _by_count.filter(
        (pl.col("count") >= 20) & (pl.col("count") < 30)
    )
    thirty_plus = _by_count.filter(pl.col("count") >= 30)

    pl.DataFrame(
        [
            get_num(one_ten_installs),
            get_num(ten_twenty_installs),
            get_num(twenty_thirty),
            get_num(thirty_plus),
        ]
    )
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
|
|
|
|||
216
popcorn.qmd
216
popcorn.qmd
|
|
@ -1,6 +1,7 @@
|
|||
---
|
||||
title: "Voidlinux popcorn"
|
||||
subtitle: "Analysis of voidlinux package and kernel statistics"
|
||||
toc: true
|
||||
---
|
||||
|
||||
This notebook analyses the daily package repository statistics files,
|
||||
|
|
@ -70,9 +71,9 @@ and their respective versions.
|
|||
|
||||
A look at the overall file size for each of the daily statistics files over
|
||||
time reveals not necessarily changes in the absolute use of packages (e.g.
|
||||
`neovim` being installed more or less often). Whether it has been downloaded
|
||||
`rsync` being installed more or less often). Whether it has been downloaded
|
||||
once or 100 times, the file size does not change drastically. Instead it
|
||||
increases much more drastically when both `neovim` and `emacs` are installed,
|
||||
increases much more drastically when both `rsync` and `rclone` are installed,
|
||||
or a variety of different versions for one of the packages are installed.
|
||||
Similarly for different versions of the kernel.
|
||||
|
||||
|
|
@ -92,19 +93,21 @@ from notebooks.popcorn import plt_filesize
|
|||
pplot(plt_filesize)
|
||||
```
|
||||
|
||||
As we can see, the difference over time is massive. Especially early on, between 2019 and the
|
||||
start of 2021, the amount of different packages and package versions used grew rapidly, with the
|
||||
pace also picking up once again starting 2023.
|
||||
As we can see, the difference over time is massive. Especially early on,
|
||||
between 2019 and the start of 2021, the number of different packages and
|
||||
package versions used grew rapidly, with the pace also picking up once again
|
||||
starting 2023.
|
||||
|
||||
From a reported filesize of around 50kB in the first days before the end of
|
||||
2019 we have easily tripled the filesize to over 150kB needed for the report
|
||||
per day. Nowadays we have reached just about 400kB daily report size, over 8
|
||||
From a reported filesize of around 50KB in the first days before the end of
|
||||
2019 we have easily tripled the filesize to over 150KB needed for the report
|
||||
per day. Nowadays we have reached just about 400KB daily report size, over 8
|
||||
times the size beginning 2018.
|
||||
|
||||
There are a few outlier days with a size of 0 kB on the server, which we had to
|
||||
There are a few outlier days with a size of 0 KB on the server, which we had to
|
||||
remove from the data. In all likelihood, those days were not reported correctly
|
||||
or there was some kind of issue on the backend so the stats for those days are
|
||||
lost.
|
||||
|
||||
<!-- TODO: is this still true? -->
|
||||
We take a look at the missing days
|
||||
among other things at the end of this article.
|
||||
|
|
@ -123,45 +126,172 @@ up again, if at a more mellow pace.
|
|||
|
||||
## Package statistics
|
||||
|
||||
Now that we have an idea of how the overall interest in the distribution has changed over time,
|
||||
let's look at the actual package statistics.
|
||||
Now that we have an idea of how the overall reported sizes in the distribution
|
||||
have changed over time, let's focus on the actual package statistics.
|
||||
|
||||
The popcorn files contain two main pieces of information: the number of installs per package
|
||||
(e.g. how many people have rsync installed) and the number of unique installs (i.e. unique
|
||||
machines providing statistics). We will look at both of these in turn.
|
||||
The popcorn files contain two main pieces of information: the number of
|
||||
installs per package (e.g. how many people have `rsync` installed) and the
|
||||
number of unique installs (i.e. how many people provide their statistics). We
|
||||
will look at both of these in turn.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_weekly_packages
|
||||
pplot(plt_weekly_packages)
|
||||
```
|
||||
|
||||
The number of packages overall strongly rises until early 2021,
|
||||
when it stagnates a little before rising more slowly again afterwards.
|
||||
The pattern strongly mirrors the curve we saw before for the daily filesize.
|
||||
|
||||
Turning to the daily unique uploads, we can see a similar pattern, though even
|
||||
more strongly pronounced.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_unique_installs
|
||||
pplot(plt_unique_installs)
|
||||
```
|
||||
|
||||
Unique installations rise sharply until early 2020. Then they do not just stagnate
|
||||
but shrink for the next three years. It is only in early 2023 that the numbers
|
||||
recover and begin rising again slowly.
|
||||
|
||||
We also have one day on 05 July 2024 which has significantly fewer unique
|
||||
uploads (36 only) than all the other days around it. I have no clue if
|
||||
something happened to data collection or everybody collectively decided to
|
||||
leave their PC offline just for that day, but the numbers are back to normal
|
||||
the day after.[^independence-day]
|
||||
|
||||
[^independence-day]: I suppose one interpretation would be people taking their
|
||||
4th of July celebrations very seriously, and thus not being present in the
|
||||
statistics for the day after. However, I am not sure if this would reflect so
|
||||
strongly in data collection, and it additionally presupposes that the data
|
||||
collected predominantly stems from the United States. Lastly, one would
|
||||
expect this to have a similar effect every year if that was the case.
|
||||
|
||||
This curve also goes some way to explaining the dip in overall package
|
||||
installations previously. When there are fewer people uploading their daily
|
||||
statistics the absolute number of package installations will be somewhat
|
||||
reduced as a result, unless for some reason the remaining people all of a
|
||||
sudden start having many more packages installed.
|
||||
|
||||
Let's check that out next, by actually looking at the installed packages _per
|
||||
user_ for each day.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_pkg_relative
|
||||
pplot(plt_pkg_relative)
|
||||
```
|
||||
|
||||
The amount of packages installed on all machines increases strongly over time.
|
||||
Combining both stats to look at the installed packages at a more individual
|
||||
level per user, we see this confirmed. There is no similarly strong dip for the
|
||||
relative package ownership as there was for the absolute package numbers.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_weekday_packages
|
||||
pplot(plt_weekday_packages)
|
||||
```
|
||||
Indeed, with the exception of a small more rapid increase in individual package
|
||||
ownership in 2019, we see a much more stable increase in per-user packages than
|
||||
the absolute numbers and no similarly big slump over three years.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_month_packages
|
||||
pplot(plt_month_packages)
|
||||
```
|
||||
Instead we see different patterns of rises and dips. Both in the beginning of
|
||||
2020, and the beginning of 2024, we can see first a strong rise and then an
|
||||
equally strong fall in the average number of user-owned packages at once.
|
||||
This could point to one of multiple options:
|
||||
|
||||
Perhaps users have been collectively trying out more new packages over the end
|
||||
of year holidays or with the start of the new year. New year, new workflow
|
||||
could presumably be something a few users decide to do, and this may be a
|
||||
reflection of it. Or, equally, using the new year to learn new software. The
|
||||
subsequent dip would then mean an end to the period of trying out new stuff, or
|
||||
adopting the new packages and dropping the old ones.
|
||||
|
||||
At the same time, with the relatively limited absolute number of installations,
|
||||
it is also quite likely that the representation is skewed by a single user or a
|
||||
couple of users having a much larger package ownership than everybody else. This
|
||||
may signify new users checking out Void Linux and downloading a large variety
|
||||
of packages in the process.
|
||||
|
||||
<!-- TODO: still accurate? -->
|
||||
For a breakdown of the absolute numbers of packages on systems by weekday and
|
||||
month of the year instead of over time, see the Appendix below.
|
||||
|
||||
Beyond pure installation numbers, let's take a look at the actual top-installed
|
||||
packages on users' systems.
|
||||
|
||||
<!-- TODO: perhaps the pre-made ISOs play a role, especially Feb2024? no hang on feb 2025 -->
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_top_packages
|
||||
pplot(plt_top_packages)
|
||||
```
|
||||
|
||||
The top packages are unsurprisingly
|
||||
the `base-system` and `xtools` packages, followed by `wget`, `htop` and
|
||||
`rsync`.[^popcorn-removal]
|
||||
|
||||
[^popcorn-removal]: I have removed the PopCorn package itself from the data.
|
||||
Funnily enough, since _everybody_ who is represented in the data has to have
|
||||
PopCorn installed or the data wouldn't be collected in the first place, if we
|
||||
extrapolate from the collected data naively this means more people have PopCorn
|
||||
installed than the base-system. Of course, viewed over the majority of Void
|
||||
Linux installations this is hogwash. We have the absolute numbers and only
|
||||
around 150 people ever have PopCorn installed. But it nicely represents some of
|
||||
the danger of over-interpreting the results before us without also reflecting
|
||||
on sample bias.
|
||||
|
||||
In my opinion the list of top packages reflects the technical audience of Void
|
||||
Linux and does not hold too many surprises. Almost everyone uses `socklog` and
|
||||
most people have the `nonfree` repo enabled. `firefox` is the most installed
|
||||
browser, and everyone at least has `alsa-utils` installed, even if they're not
|
||||
using `alsa` as their primary sound provider.
|
||||
|
||||
I am somewhat surprised by the prevalence of `git`, though this package is in
|
||||
turn required by many others. Among them some of the other top packages such as
|
||||
`xtools`, so it does make sense.
|
||||
|
||||
It, along with the prevalence of `tmux` (even above `zip`!), does once again
|
||||
speak to the technical nature of Void Linux users, however, at least for those
|
||||
opting into data collection.
|
||||
|
||||
Almost everyone keeps the `base-system` package installed but, importantly,
|
||||
not _everyone_. The package is not represented for each installation, with a
|
||||
sizable chunk of people having removed it.
|
||||
|
||||
Lastly, I am also pleasantly surprised by the appearance of `gimp` in the top
|
||||
20 packages.
|
||||
|
||||
The 'rarest' 20 packages show a snapshot of packages which have been installed
|
||||
by _someone_ but only that single someone. In other words, there are quite a
|
||||
few packages which nobody in the sample has installed but those are not
|
||||
represented here. Instead, the rare packages tend to show those that somebody
|
||||
built themselves, or only tried briefly. They provide more of a snapshot of the
|
||||
kind of custom shenanigans users get up to within the `xbps` package system, or
|
||||
could be viewed as a potential 'wishlist' of packages not yet officially
|
||||
available.
|
||||
|
||||
Let's turn to the 'distribution' of package installations.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_package_distribution
|
||||
pplot(plt_package_distribution)
|
||||
```
|
||||
|
||||
Visualized above is the package installation frequency (or density) distribution.
|
||||
On the Y-axis we see the number of packages while on the X-axis we see the number of installations.
|
||||
What this means is that we see _how often_ packages tend to be installed,
|
||||
and where the majority of packages is grouped.[^density-approximation]
|
||||
|
||||
[^density-approximation]: In the package density count above, since we are
|
||||
accumulating over the absolute numbers of all installations of all users, the
|
||||
overall high numbers are really _high_, i.e. above 150,000. Since we are
|
||||
sorting the package counts into a finite number of bins to make visualizing it
|
||||
possible, the lowest bin overshoots the 0-mark and we get an estimation of
|
||||
minus-installation counts. Of course, this is not possible, no package in the
|
||||
data has been installed a negative number of times --- to my knowledge!
|
||||
|
||||
_Many_ packages are installed 0 to 10 times.
|
||||
Some packages are installed above 10 times,
|
||||
fewer yet above 100 times,
|
||||
and so on,
|
||||
and this distribution is what we see here.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_top_packages
|
||||
_, defs = plt_top_packages.run()
|
||||
|
|
@ -181,6 +311,7 @@ twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
|
|||
thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30))
|
||||
```
|
||||
|
||||
To be more precise with the numbers:
|
||||
There are `python f"{get_num(one_ten_installs):,}"` packages which have between one
|
||||
and nine installations in the data, `python f"{get_num(ten_twenty_installs):,}"`
|
||||
packages between ten and 19 installations, and
|
||||
|
|
@ -236,11 +367,17 @@ switch between major version 5 and 6. The last time a major version 4 is seen is
|
|||
It would seem, then, that the people still running kernel version 4 used the opportunity of
|
||||
everybody switching to the stable version of 6 to also upgrade their machines.
|
||||
|
||||
## Odds and Ends
|
||||
## Appendix: Odds and Ends
|
||||
|
||||
The above graphics are the main ones that I think could be useful, entertaining, or somewhere in between.
|
||||
However, when exploring data, many more visualizations come to light.
|
||||
Most of them are a little more 'boring' than the ones selected above,
|
||||
but may still be of interest for technical deep-dives or more specific investigations.
|
||||
They are collected here, in my pseudo-appendix to the main article.
|
||||
|
||||
### The PopCorn files
|
||||
|
||||
Let's have a look at the provided PopCorn statistics files themselves.
|
||||
Let's have a closer look at the provided PopCorn statistics files themselves.
|
||||
|
||||
The files consist of a long list of packages which have been reported to the
|
||||
central server that day, along with the number of package instances. The amount
|
||||
|
|
@ -286,6 +423,8 @@ kernel name including major version, minor version and any suffix.
|
|||
}
|
||||
```
|
||||
|
||||
When grouped by the packages and aggregated over all days, this results in a
|
||||
table, for example the following is the table for the package count list:
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import tab_pkg
|
||||
|
|
@ -293,6 +432,21 @@ outp, defs = tab_pkg.run()
|
|||
outp
|
||||
```
|
||||
|
||||
When taking a look at the file sizes of the PopCorn report files we did so for
|
||||
each day individually above. But we can also look at the cumulative growth
|
||||
instead: here we just add up all the files reported so far for each day, and
|
||||
show the resulting growth line.
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_filesize_cumulative
|
||||
outp, defs = plt_filesize_cumulative.run()
|
||||
outp
|
||||
```
|
||||
|
||||
A cumulative view gives a less granular look at the individual daily changes but
|
||||
provides a more macro-level view on how big the statistics have grown to be overall.
|
||||
We can see that, as each individually reported day adds up to 400KB nowadays, the
|
||||
cumulative size is up to almost 700MB currently.
|
||||
|
||||
There are some missing days in the statistics.
|
||||
|
||||
|
|
@ -303,6 +457,18 @@ outp, defs = tab_missing_days.run()
|
|||
outp
|
||||
```
|
||||
|
||||
### Packages monthwise and per weekday
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_weekday_packages
|
||||
pplot(plt_weekday_packages)
|
||||
```
|
||||
|
||||
```{python}
|
||||
from notebooks.popcorn import plt_month_packages
|
||||
pplot(plt_month_packages)
|
||||
```
|
||||
|
||||
## Outline
|
||||
|
||||
- intro
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue