diff --git a/popcorn.py b/popcorn.py index 04d8641..dc1cc61 100644 --- a/popcorn.py +++ b/popcorn.py @@ -75,28 +75,26 @@ def _(): r""" ## Daily statistics file size - The simplest operation we can do is look at the overall file size for each - of the daily statistics files over time. The files consist of a long list - of packages which have been downloaded from the repositories that day, - along with the number of downloads. It also consists of the same list - separated by specifically downloaded versions of packages, so if somebody - downloads v0.9.1 and somebody else downloads v0.9.3 this would count both - downloads separately. + The simplest operation we can do is look at the overall file size for each of the daily + statistics files over time. The files consist of a long list of packages which have been checked + from the repositories that day, along with the number of package instances. They also contain + the same list separated by specifically installed versions of packages, so if somebody has + v0.9.1 and somebody else has v0.9.3 instead, this would count both packages separately. - Another count is the number of different Kernels that have been used to - download (or downloaded?) from the repositories. + Another count is the number of different kernels that have been used on that day, with their + exact kernel name including major version, minor version and any suffix. - These are the major things that will lead to size increases in the file, - but not just for an increased amount of downloads --- we will get to those shortly. + These are the major things that will lead to size increases in the file, but not just for an + increased absolute number of users, packages or uploads --- we will get to those shortly. - No, an increase in file size here mainly suggests an increase in the - 'breadth' of files on offer in the repository, whether that be a wider - variety of program versions or more different packages that people are - interested in. 
+ No, an increase in file size here mainly suggests an increase in the 'breadth' of files on offer + in the repository, whether that be a wider variety of program versions or more different + packages that people are interested in, and those that the community chooses to use. + + So while the overall number of packages gives a general estimate of the interest in the + distribution, this can show a more 'distributor'-aligned view on how many different aisles of + the buffet people are eating from. - So while the overall amount of downloads gives a general estimate of the - interest in the distribution, this can show a more 'distributor'-aligned - view on how many different aisles of the buffet people are eating from. """ ) return @@ -122,13 +120,18 @@ def _(): mo.md( r""" - As we can see, the difference over time is massive. Especially early on, - between 2019 and the start of 2021, the amount of different stuff - downloaded grew rapidly, with the pace picking up again starting 2023. + As we can see, the difference over time is massive. Especially early on, between 2019 and the + start of 2021, the number of different packages and package versions used grew rapidly, with the + pace picking up once again starting in 2023. - There are a few outliers with a size of 0 kB, which we will remove from the - data. There are also a few days where the modification date of the file - does not correspond to the represented statistical date. + There are a few outlier days with a size of 0 kB, which we will remove from the data. In all + likelihood, those days were not reported correctly or there was some kind of issue on the + backend, so the stats for those days are lost. + + There are also a few days where the modification date of the file does not correspond to the + represented statistical date, but those are kept. This rather points to certain times when the + files have been moved on the backend or recreated externally, but it does not mean the data are + bad. 
""" ) @@ -159,14 +162,15 @@ def _(): def _(): mo.md( r""" - ## Download statistics + ## Package statistics - Now that we have an idea of how the overall interest in the distribution - has changed over time, let's look at the actual download statistics. + Now that we have an idea of how the overall interest in the distribution has changed over time, + let's look at the actual package statistics. + + The popcorn files contain two main pieces of information: the number of installs per package + (e.g. how many people have rsync installed) and the number of unique installs (i.e. unique + machines providing statistics). We will look at both of these in turn. - The popcorn files contain two main pieces of information: the number of - unique installs (i.e. unique machines downloading packages) and the number - of downloads per package. We will look at both of these in turn. """ ) return @@ -195,6 +199,18 @@ def _(df_pkg_lazy: pl.LazyFrame): return +@app.cell(hide_code=True) +def _(): + mo.md( + r""" + + The number of packages installed across all machines increases strongly over time. + + """ + ) + return + + @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _():