Adapt functions to new csv data layout

This commit is contained in:
Marty Oehme 2025-09-29 16:17:27 +02:00
parent 91d64f428c
commit 4c9518cf67
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -6,12 +6,15 @@ app = marimo.App(width="medium")
with app.setup:
# Initialization code that runs before all other cells
import re
from pathlib import Path
import lets_plot as lp
import marimo as mo
import polars as pl
LIMIT_ROWS = 200
LIMIT_ROWS = 500_000
DATA_RAW_DIR = "data/raw"
DATA_CLEAN_DIR = "data/cleaned"
@app.cell(hide_code=True)
@ -25,6 +28,16 @@ def _():
return
# run data prep
@app.cell
def _():
import clean
clean.json_to_daily_pkg(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
)
@app.cell
def _():
def parse_size(size_str):
@ -34,7 +47,7 @@ def _():
return None
sizes_df_raw = (
pl.read_csv("data/file_sizes.csv")
pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
.with_columns(
pl.col("name")
.str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
@ -118,21 +131,21 @@ def _():
@app.cell
def _():
df_lazy = (
pl.scan_ndjson("data/daily/*", include_file_paths="file")
.head(LIMIT_ROWS) # FIXME: take out after debug
.with_columns(
pl.col("file")
.str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
.str.to_date()
.alias("date")
)
)
df_pkg_lazy = (
df_lazy.select("date", pl.col("Packages").struct.unnest())
pl.scan_csv(
f"{DATA_CLEAN_DIR}/daily/*.csv",
include_file_paths="file",
schema={
"date": pl.Date,
"package": pl.String,
"downloads": pl.UInt16,
},
)
.drop("file")
.fill_null(0)
.unpivot(index="date", variable_name="package", value_name="downloads")
.head(LIMIT_ROWS) # FIXME: take out after debug
)
df_pkg_lazy.collect()
return
@ -281,6 +294,20 @@ def _(df_pkg_lazy: pl.LazyFrame):
return
# - which kernels have been DL when? (simplified for semver)
@app.cell
def _(df_lazy):
kernel_df_lazy = df_lazy.select("date", "XuKernel")
kernel_df = (
kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest())
.fill_null(0)
# .unpivot(index="date", variable_name="kernel", value_name="downloads")
.collect()
)
df_lazy.collect()
@app.cell
def _():
mo.md(
@ -325,19 +352,12 @@ def _(sizes_df):
return
@app.cell
def _(df_lazy):
kernel_df = df_lazy.select("date", pl.col("Kernels").struct.unnest())
kernel_df
# further ideas:
#
# - daily download habits:
# - are we downloading further spread of versions on specific days
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
#
# - which kernels have been DL when? (simplified for semver)
# - when did specific kernels enter the repos?
#
# - which arches are/were most prevalent over time?