Parse JSON Package downloads

This commit is contained in:
Marty Oehme 2025-09-27 11:34:52 +02:00
parent 97e4d256bb
commit cbda8dca55
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -5,7 +5,9 @@ app = marimo.App(width="medium")
with app.setup:
# Initialization code that runs before all other cells
import json
import re
from pathlib import Path
import lets_plot as lp
import marimo as mo
@ -58,15 +60,31 @@ def _(sizes_df):
@app.cell
def _(sizes_df):
def _():
    # Build one long-format table of daily package downloads.
    #
    # Every file matching data/daily/* is scanned lazily as newline-delimited
    # JSON, with the originating path exposed in a "file" column. The path is
    # reduced to its date, the "Packages" struct is expanded into one column
    # per field, nulls become 0, and the frame is unpivoted into
    # date / package / downloads rows before being collected.
    #
    # NOTE(review): `pl` is presumably polars, imported outside this view.
    df = (
        pl.scan_ndjson("data/daily/*", include_file_paths="file")
        .head(200)  # FIXME: take out after debug
        .with_columns(
            pl.col("file")
            # The dot before "json" is escaped so it matches only a literal
            # "." instead of any character; "${1}" keeps just the date group.
            .str.replace(r"data/daily/(\d{4}-\d{2}-\d{2})\.json", "${1}")
            .str.to_date()
            .alias("date")
        )
        .select("date", pl.col("Packages").struct.unnest())
        .fill_null(0)
        .unpivot(index="date", variable_name="package", value_name="downloads")
        .collect()
    )
    # Bare expression kept as in the original; presumably this is what the
    # notebook renders as the cell output — confirm against marimo semantics.
    df
    return
(
lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
+ lp.geom_point()
+ lp.geom_smooth(method="lowess")
+ lp.labs(
title="",
subtitle="",
caption="",
)
)
return
@ -116,5 +134,18 @@ def _(sizes_df):
return
# further ideas:
# - which kernels have been DL when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?
# - have the arches been mostly even relative to each other?
#
# - what does unique install mean?
#
# - which Packages had the most unique versions, and which the fewest?
# - which pkg had the most downloads of a single version?
# - for which pkg were the version dls the most spread out?
# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()