import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl

    LIMIT_ROWS = 500_000
    DATA_RAW_DIR = "data/raw"
    DATA_CLEAN_DIR = "data/cleaned"


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        # Void Linux 'Popcorn' package repository stat analysis

        This notebook analyses the daily package repository statistics files,
        colloquially known as 'popcorn' files, that are generated by the Void
        Linux package manager `xbps` and uploaded by users who have opted in to
        sharing them.
        """
    )
    return


# run data prep
@app.cell
def _():
    import clean

    clean.json_to_daily_pkg(
        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
    )
    clean.json_to_unique_csv(
        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
    )
    clean.json_to_daily_kernel_csv(
        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
    )
    return


@app.cell
def _():
    def parse_size(size_str):
        # Sizes come in as strings like "123.4 kB"; anything that does not
        # match becomes None.
        try:
            return float(re.search(r"(\d+\.?\d+) kB", size_str).group(1))  # pyright: ignore[reportOptionalMemberAccess]
        except AttributeError:
            return None

    sizes_df_raw = (
        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
        .with_columns(
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2})\.json", "${1}")
            .str.to_date()
            .alias("date"),
            pl.col("size")
            .map_elements(parse_size, return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        ## Daily statistics file size

        The simplest thing we can do is look at the overall size of each daily
        statistics file over time. Each file consists of a long list of
        packages that were checked against the repositories that day, along
        with the number of installed instances per package. It also contains
        the same list broken down by the specific installed version, so if one
        person has v0.9.1 and another has v0.9.3, the two are counted
        separately. A third count covers the different kernels in use on that
        day, with the exact kernel name including major version, minor version
        and any suffix.

        These are the main drivers of growth in file size, and note that the
        file does not simply grow with the absolute number of users, packages
        or uploads --- we will get to those shortly. Rather, an increase in
        file size here mainly suggests an increase in the 'breadth' of packages
        in use, whether that is a wider variety of program versions or a larger
        number of distinct packages the community chooses to install. So while
        the overall package count gives a general estimate of interest in the
        distribution, the file size shows a more 'distributor'-aligned view of
        how many different aisles of the buffet people are eating from.
        """
    )
    return


@app.cell
def _(sizes_df):
    (
        # plot the parsed numeric size, not the raw "... kB" string
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw json file size in kB, with no formatting applied and no markers, characters or newlines removed.",
        )
    )
    return


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        As we can see, the difference over time is massive.
        Especially early on, between 2019 and the start of 2021, the number of
        different packages and package versions in use grew rapidly, with the
        pace picking up once again from 2023 onwards.

        There are a few outlier days with a size of 0 kB, which we remove from
        the data. In all likelihood those days were not reported correctly, or
        there was some kind of issue on the backend, so the statistics for
        those days are lost. There are also a few days where the modification
        date of the file does not match the statistical date it represents;
        those are kept. This most likely points to times when the files were
        moved around on the backend or recreated externally, and does not mean
        the data themselves are bad.
        """
    )
    return


@app.cell
def _():
    df_pkg_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/daily/*.csv",
            include_file_paths="file",
            schema={
                "date": pl.Date,
                "package": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .drop("file")
        .fill_null(0)
        .head(LIMIT_ROWS)  # FIXME: take out after debug
    )
    # give a small df preview
    df_pkg_lazy.head(100).collect()
    return (df_pkg_lazy,)


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        ## Package statistics

        Now that we have an idea of how overall interest in the distribution
        has changed over time, let's look at the actual package statistics.
        The popcorn files contain two main pieces of information: the number of
        installs per package (e.g. how many people have `rsync` installed) and
        the number of unique installs (i.e. unique machines providing
        statistics). We will look at both of these in turn.
        """
    )
    return


@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        weekly_packages = (
            df_pkg_lazy.sort("date")
            .group_by_dynamic("date", every="1w")
            .agg(pl.col("downloads").sum())
            .sort("date")
        )
        return (
            lp.ggplot(weekly_packages.collect(), lp.aes("date", "downloads"))
            + lp.geom_line()
            + lp.geom_smooth(method="loess")
            + lp.labs(
                title="Weekly package ownership",
                caption="Count of all installed packages aggregated for each week",
            )
        )

    _()
    return


@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        The number of packages installed across all machines grows strongly
        over time.
        """
    )
    return
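@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        Part of that growth is simply more machines reporting. As a rough
        check, the next cell sketches the daily package count divided by the
        number of unique uploaders; it assumes the dates in
        `unique_installs.csv` line up with the daily package files.
        """
    )
    return


@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
    def _():
        daily_totals = df_pkg_lazy.group_by("date").agg(
            pl.col("downloads").sum()
        )
        uniques = pl.scan_csv(
            f"{DATA_CLEAN_DIR}/unique_installs.csv",
            schema={"date": pl.Date, "unique": pl.UInt16},
        )
        # normalise: packages per reporting machine, per day (sketch)
        per_machine = (
            daily_totals.join(uniques, on="date")
            .with_columns(
                (pl.col("downloads") / pl.col("unique")).alias("pkgs_per_machine")
            )
            .sort("date")
        )
        return (
            lp.ggplot(per_machine.collect(), lp.aes("date", "pkgs_per_machine"))
            + lp.geom_line()
            + lp.labs(
                title="Packages per reporting machine",
                caption="Daily package count divided by the number of unique uploaders (sketch)",
            )
        )

    _()
    return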
""" ) return @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _(): weekday_downloads = df_pkg_lazy.sort("date").with_columns( pl.col("date") .dt.weekday() .sort() .replace_strict( { 1: "Mon", 2: "Tue", 3: "Wed", 4: "Thu", 5: "Fri", 6: "Sat", 7: "Sun", } ) .alias("weekday") ) return ( lp.ggplot(weekday_downloads.collect(), lp.aes("weekday", "downloads")) + lp.geom_bar() + lp.labs( title="Weekday downloads", caption="Downloads aggregated per day of the week they took place.", ) ) _() return @app.cell def _(df_pkg_lazy: pl.LazyFrame): def _(): month_agg_downloads = df_pkg_lazy.sort("date").with_columns( pl.col("date").dt.month().alias("month") ) return ( lp.ggplot(month_agg_downloads.collect(), lp.aes("month", "downloads")) + lp.geom_bar() + lp.labs( title="Monthwise downloads", caption="Downloads aggregated per month of the year.", ) ) _() return @app.cell def _(): ( lp.ggplot( pl.read_csv( f"{DATA_CLEAN_DIR}/unique_installs.csv", schema={"date": pl.Date, "unique": pl.UInt16}, ), lp.aes("date", "unique"), ) + lp.geom_line() + lp.geom_smooth() + lp.labs( title="Unique daily uploads", caption="Daily number of unique providers for package update statistics opting in to popcorn.", ) ) return @app.cell def _(df_pkg_lazy: pl.LazyFrame): df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()) def _(): DISPLAY_LIMIT = 20 return lp.gggrid( [ lp.ggplot( df_pkg_dl.sort("downloads", descending=True) .head(DISPLAY_LIMIT) .collect(), lp.aes("package", "downloads"), ) + lp.geom_bar(stat="identity") + lp.labs( title="Top packages", caption="Most installed packages over all time", ), lp.ggplot( df_pkg_dl.sort("downloads", descending=False) # this seems arbitrary but gives a better result? .head(DISPLAY_LIMIT) .collect(), lp.aes("package", "downloads"), ) + lp.geom_bar(stat="identity") + lp.labs( title="Rare packages", caption="Least often installed packages", ), ], ncol=1, ) _() return @app.cell(hide_code=True) def _(df_pkg_dl: pl.LazyFrame): def _(): return ( lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads")) + lp.geom_freqpoly(stat="bin") + lp.labs( title="Package installation count distribution", ) ) _() return @app.cell(hide_code=True) def _(df_pkg_dl: pl.LazyFrame): def _(): def get_num(df: pl.LazyFrame) -> int: return df.count().collect().item(0, 0) one_install = df_pkg_dl.sort("downloads", descending=False).filter( pl.col("downloads") == 1 ) two_installs = df_pkg_dl.sort("downloads", descending=False).filter( (pl.col("downloads") >= 2) & (pl.col("downloads") < 10) ) three_installs = df_pkg_dl.sort("downloads", descending=False).filter( (pl.col("downloads") >= 10) & (pl.col("downloads") < 20) ) # TODO: Fix for new filters above return mo.md(rf""" There are {get_num(one_install)} packages which have exactly a single installation in the data, {get_num(two_installs)} packages with exactly two installations, and {get_num(three_installs)} packages with exactly three. """) _() return @app.cell(hide_code=True) def _(): mo.md(r""" ## Kernel Analysis """) return # - which kernels have been DL when? 
# - which kernels have been DL when?
#   (simplified for semver)
@app.cell
def _():
    kernel_df_lazy = (
        pl.scan_csv(
            f"{DATA_CLEAN_DIR}/kernels/*.csv",
            schema={
                "date": pl.Date,
                "kernel": pl.String,
                "downloads": pl.UInt16,
            },
        )
        .fill_null(0)
        .with_columns(pl.col("kernel").str.replace(r"(\d+\.\d+\.\d+).*", "${1}"))
        .with_columns(
            pl.col("kernel")
            .str.replace(r"(\d+).*", "${1}")
            .str.to_integer(dtype=pl.UInt8)
            .alias("major_ver"),
            pl.col("kernel").str.replace(r"(\d+\.\d+).*", "${1}").alias("minor_ver"),
        )
        .head(LIMIT_ROWS)  # FIXME: take out after debug
    )
    kernel_df_v99 = (
        kernel_df_lazy.filter(pl.col("major_ver") == 99).collect().select("date")
    )
    kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)

    (
        lp.ggplot(
            kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
            .group_by("major_ver")
            .agg(pl.col("downloads").sum())
            .sort("major_ver")
            .collect(),
            lp.aes("major_ver", "downloads"),
        )
        + lp.geom_bar(stat="identity")
        + lp.labs(
            title="Kernel versions used",
            caption="All daily kernel reports, summed per major version",
        )
    )
    return kernel_df_lazy, kernel_df_v99


@app.cell(hide_code=True)
def _(kernel_df_v99: pl.DataFrame):
    mo.md(
        rf"""
        When looking at the kernel versions used, we see a very strong jump
        between major kernel version 4 and major kernel version 5.

        For this analysis we had to exclude
        {kernel_df_v99.select(pl.len()).item()} rows which were apparently from
        the future, as they were running variations of major kernel version 99.
        In all likelihood there is a custom kernel out there which reports its
        own major version as 99. The strange version first appears on
        {kernel_df_v99.select("date").row(0)[0]} and shows up all the way until
        {kernel_df_v99.select("date").row(-1)[0]}.
        """
    )
    return


@app.cell
def _(kernel_df_lazy: pl.LazyFrame):
    weekly_kernel_df = (
        kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
        .select(["date", "major_ver", "downloads"])
        .sort("date")
        .group_by_dynamic("date", every="1w", group_by="major_ver")
        .agg(pl.col("downloads").sum())
        .collect()
    )
    (
        lp.ggplot(
            weekly_kernel_df,
            lp.aes("date", "downloads", color="major_ver"),
        )
        + lp.geom_line()
        + lp.labs(
            title="Kernels over time",
            caption="Weekly sum of kernel reports, per major version",
        )
    )
    return (weekly_kernel_df,)


@app.cell(hide_code=True)
def _(weekly_kernel_df: pl.DataFrame):
    from datetime import date

    last_kernel4: date = weekly_kernel_df.filter(pl.col("major_ver") == "4")[-1][
        "date"
    ].item()
    first_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[0][
        "date"
    ].item()
    last_kernel5: date = weekly_kernel_df.filter(pl.col("major_ver") == "5")[-1][
        "date"
    ].item()

    mo.md(
        rf"""
        A timeline analysis of the kernels used to report daily downloads shows
        that people generally adopt new major kernel versions at roughly the
        same time. The change is especially stark between major versions 5 and
        6, which seem to have traded places in usage almost overnight.

        The first time major version 5 of the kernel shows up is on
        {first_kernel5}. From there it took a long time for the last of the
        version 4 kernels to disappear, coinciding with the big switch from
        major version 5 to 6: the last major version 4 kernel is seen on
        {last_kernel4}, while major version 5 kernels still pop up as late as
        {last_kernel5}. It would seem, then, that the people still running
        kernel version 4 used the occasion of everybody switching to the stable
        version 6 to also upgrade their machines.
        """
    )
    return
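@app.cell(hide_code=True)
def _():
    mo.md(
        r"""
        The same first/last dates can be read off for every major version at
        once; the next cell is a small sketch of that, built on the weekly
        aggregation above.
        """
    )
    return


@app.cell
def _(weekly_kernel_df: pl.DataFrame):
    # Adoption window per major version: first and last week each version
    # appears in the weekly aggregation (a sketch generalising the per-version
    # lookups above).
    (
        weekly_kernel_df.group_by("major_ver")
        .agg(
            pl.col("date").min().alias("first_seen"),
            pl.col("date").max().alias("last_seen"),
        )
        .sort("major_ver")
    )
    return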
""" ) return @app.cell def _(sizes_df_raw): sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null()) sizes_df_null.select(["date", "size"]).style.tab_header( title="Missing Days", subtitle="Days with 0B size due to missing on the popcorn server.", ) return @app.cell def _(sizes_df): def _(): different_modification_date = sizes_df.with_columns( pl.col("modified") .str.to_datetime(format="%F %T %:z", strict=False) .alias("modified_dt"), ).filter(pl.col("date") != pl.col("modified_dt").dt.date()) # This does not work well what are we showing? # 'true' capture date on X but then what on Y - the # same date for each? the difference in dt? return ( lp.ggplot( different_modification_date, lp.aes("date", "modified_dt"), ) + lp.geom_freqpoly() ) _() return # further ideas: # # - daily download habits: # - are we downloading further spread of versions on specific days # - are there 'update' days, where things converge? specific weekday/on holidays/etc? # # - when did specific kernels enter the repos? # # - which arches are/were most prevalent over time? # - have the arches been mostly even relative to each other? # # - what does unique install mean? # # - which Packages had the most unique versions, least versions # - which pkg had the most download of a single version? # - for which pkg were the version dls the most spread out? if __name__ == "__main__": app.run()