import marimo

__generated_with = "0.16.2"
app = marimo.App(width="medium")

with app.setup:
    # Initialization code that runs before all other cells
    import json
    import re
    from pathlib import Path

    import lets_plot as lp
    import marimo as mo
    import polars as pl


@app.cell(hide_code=True)
def _():
    mo.md(r"""# Void Linux 'Popcorn' package repository stat analysis""")
    return


@app.cell
def _():
    def parse_size(size_str):
        """Return the number preceding " kB" in *size_str* as a float.

        Accepts integer ("5 kB", "120 kB") and decimal ("12.5 kB") values.
        Returns None when the string contains no "<number> kB" part, which
        the filter below uses to separate out rows without a usable size.
        """
        # The decimal part is optional and the dot is escaped, so
        # single-digit sizes parse and "." cannot match an arbitrary
        # character that float() would then reject.
        match = re.search(r"(\d+(?:\.\d+)?) kB", size_str)
        return float(match.group(1)) if match else None

    sizes_df_raw = (
        pl.read_csv("data/file_sizes.csv")
        .with_columns(
            # The file name carries the capture date: data/YYYY-MM-DD.json
            pl.col("name")
            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date"),
            pl.col("size")
            .map_elements(parse_size, return_dtype=pl.Float32)
            .alias("size_num"),
        )
        .select(["date", "size_num", "size", "modified"])
    )
    # Rows whose size could not be parsed are kept only in sizes_df_raw.
    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
    return sizes_df, sizes_df_raw


@app.cell
def _(sizes_df):
    (
        # Plot the parsed numeric size; the raw "size" column is text and
        # cannot carry a linear fit.
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Size growth",
            subtitle="Size of daily popcorn statistics files over time",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
        )
    )
    return


@app.cell
def _():
    # Long-format table with one row per (date, package) pair.
    df = (
        pl.scan_ndjson("data/daily/*", include_file_paths="file")
        .head(200)  # FIXME: take out after debug
        .with_columns(
            pl.col("file")
            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
            .str.to_date()
            .alias("date")
        )
        .select("date", pl.col("Packages").struct.unnest())
        .fill_null(0)
        .unpivot(index="date", variable_name="package", value_name="downloads")
        .collect()
    )
    df
    return


@app.cell
def _(sizes_df):
    # Work-in-progress variant of the size plot with a lowess smoother.
    # It lives in its own cell so it no longer sits behind an early
    # `return` in the cell above, and so `sizes_df` is a declared input.
    (
        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
        + lp.geom_point()
        + lp.geom_smooth(method="lowess")
        + lp.labs(
            title="",
        )
    )
    return


@app.cell
def _():
    mo.md(
        r"""
    ## Odds and Ends

    There are some missing days in the statistics.
    """
    )
    return


@app.cell
def _(sizes_df_raw):
    # Rows for which no "<number> kB" size could be parsed.
    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
    sizes_df_null.select(["date", "size"]).style.tab_header(
        title="Missing Days",
        subtitle="Days with 0B size due to missing on the popcorn server.",
    )
    return


@app.cell
def _(sizes_df):
    def _():
        # Keep only rows whose file modification date differs from the
        # date encoded in the file name.
        different_modification_date = sizes_df.with_columns(
            pl.col("modified")
            .str.to_datetime(format="%F %T %:z", strict=False)
            .alias("modified_dt"),
        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
        # This does not work well: what are we showing?
        # 'true' capture date on X but then what on Y - the
        # same date for each? the difference in dt?
        return (
            lp.ggplot(
                different_modification_date,
                lp.aes("date", "modified_dt"),
            )
            + lp.geom_freqpoly()
        )

    _()
    return


# further ideas:
# - which kernels have been DL when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does unique install mean?
#
# - which Packages had the most unique versions, least versions
# - which pkg had the most download of a single version?
# - for which pkg were the version dls the most spread out?

if __name__ == "__main__":
    app.run()