Modify popcorn notebook to run with new data structure

This commit is contained in:
Marty Oehme 2025-10-01 15:05:36 +02:00
parent 9628d7a4d8
commit 728ec37bda
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -12,10 +12,8 @@ with app.setup:
import marimo as mo
import polars as pl
LIMIT_ROWS = 50_000
DATA_RAW_DIR = "data/raw"
DATA_CLEAN_DIR = "data/cleaned"
DATA_PARQUET_DIR = "data/parquet"
LIMIT_ROWS = 500_000
DATA_DIR = "input/popcorn/output"
@app.cell(hide_code=True)
@ -29,45 +27,25 @@ def _():
return
# run data prep
@app.cell
def _():
import clean
clean.json_to_daily_pkg(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
sizes_df = (
pl.read_csv(
f"{DATA_DIR}/files.csv",
schema={
"date": pl.Date,
"filename": pl.String,
"mtime": pl.Float32,
"filesize": pl.UInt32,
},
)
clean.json_to_unique_csv(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
)
clean.json_to_daily_kernel_csv(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
)
@app.cell
def _():
def parse_size(size_str):
try:
return float(re.search(r"(\d+.?\d+) kB", size_str).group(1)) # pyright: ignore[reportOptionalMemberAccess]
except AttributeError:
return None
sizes_df_raw = (
pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
.with_columns(
pl.col("name")
.str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
.str.to_date()
.alias("date"),
pl.col("size")
.map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
.alias("size_num"),
(pl.col("filesize") / 1024).alias("filesize_kb"),
pl.from_epoch("mtime").alias("modified"),
)
.select(["date", "size_num", "size", "modified"])
.select(["date", "filesize", "filesize_kb", "modified"])
)
sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
return sizes_df, sizes_df_raw
return sizes_df
@app.cell(hide_code=True)
@ -104,13 +82,14 @@ def _():
@app.cell
def _(sizes_df):
(
lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb"))
+ lp.geom_point()
+ lp.geom_smooth(method="lm")
+ lp.labs(
title="Size growth",
subtitle="Size of daily popcorn statistics files over time",
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
y="filesize in kB",
)
)
return
@ -141,19 +120,15 @@ def _():
@app.cell
def _():
df_pkg_lazy = (
pl.scan_csv(
f"{DATA_CLEAN_DIR}/daily/*.csv",
include_file_paths="file",
df_pkg_lazy = pl.scan_csv(
f"{DATA_DIR}/packages.csv",
schema={
"date": pl.Date,
"package": pl.String,
"downloads": pl.UInt16,
"version": pl.String,
"count": pl.UInt16,
},
)
.drop("file")
.fill_null(0)
)
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
# give small df preview
@ -183,18 +158,22 @@ def _():
def _(df_pkg_lazy: pl.LazyFrame):
def _():
weekly_packages = (
df_pkg_lazy.sort("date")
df_pkg_lazy
# .sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("downloads").sum())
.agg(pl.col("count").sum())
.sort("date")
)
return (
lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads"))
lp.ggplot(
weekly_packages.collect(engine="streaming"), lp.aes("date", "count")
)
+ lp.geom_line()
+ lp.geom_smooth(method="loess")
+ lp.labs(
title="Weekly package ownership",
caption="Count of all installed packages aggregated for each week",
subtitle="Count of all installed packages aggregated for each week",
y="number of packages",
)
)
@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
.alias("weekday")
)
return (
lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
lp.ggplot(
weekday_downloads.collect(engine="streaming"),
lp.aes("weekday", "count"),
)
+ lp.geom_bar()
+ lp.labs(
title="Weekday downloads",
caption="Downloads aggregated per day of the week they took place.",
title="Ownership per weekday",
caption="Package ownership per day of the week over all time",
)
)
@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
pl.col("date").dt.month().alias("month")
)
return (
lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
lp.ggplot(
month_agg_downloads.collect(engine="streaming"),
lp.aes("month", "count"),
)
+ lp.geom_bar()
+ lp.labs(
title="Monthwise downloads",
caption="Downloads aggregated per month of the year.",
title="Monthwise ownership",
caption="Package ownership per month of the year over all time",
)
)
@ -271,7 +256,7 @@ def _():
(
lp.ggplot(
pl.read_csv(
f"{DATA_CLEAN_DIR}/unique_installs.csv",
f"{DATA_DIR}/unique_installs.csv",
schema={"date": pl.Date, "unique": pl.UInt16},
),
lp.aes("date", "unique"),
@ -288,7 +273,7 @@ def _():
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())
def _():
DISPLAY_LIMIT = 20
@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame):
return lp.gggrid(
[
lp.ggplot(
df_pkg_dl.sort("downloads", descending=True)
df_pkg_dl.sort("count", descending=True)
.head(DISPLAY_LIMIT)
.collect(engine="streaming"),
lp.aes("package", "downloads"),
lp.aes("package", "count"),
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame):
caption="Most installed packages over all time",
),
lp.ggplot(
df_pkg_dl.sort("downloads", descending=False)
df_pkg_dl.sort("count", descending=False)
# this seems arbitrary but gives a better result?
.head(DISPLAY_LIMIT)
.collect(engine="streaming"),
lp.aes("package", "downloads"),
lp.aes("package", "count"),
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
def _(df_pkg_dl: pl.LazyFrame):
def _():
return (
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads"))
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
+ lp.geom_freqpoly(stat="bin")
+ lp.labs(
title="Package installation count distribution",
@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame):
def get_num(df: pl.LazyFrame) -> int:
return df.count().collect(engine="streaming").item(0, 0)
one_install = df_pkg_dl.sort("downloads", descending=False).filter(
pl.col("downloads") == 1
one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
(pl.col("count") >= 1) & (pl.col("count") < 10)
)
two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
(pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
(pl.col("count") >= 10) & (pl.col("count") < 20)
)
three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
(pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
(pl.col("count") >= 20) & (pl.col("count") < 30)
)
thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
(pl.col("count") >= 30)
)
# TODO: Fix for new filters above
return mo.md(rf"""
There are {get_num(one_install)} packages which have exactly a single
installation in the data, {get_num(two_installs)} packages with exactly
two installations, and {get_num(three_installs)} packages with exactly
three.
There are {get_num(one_ten_installs):,} packages which have between one
and nine installations in the data, {get_num(ten_twenty_installs):,}
packages between 10 and 19 installations, and
{get_num(twenty_thirty):,} packages between 20 and 29 installations.
{get_num(thirty_plus):,} packages have 30 or more installations.
""")
@ -381,7 +370,7 @@ def _():
def _():
kernel_df_lazy = (
pl.scan_csv(
f"{DATA_CLEAN_DIR}/kernels/*.csv",
f"{DATA_DIR}/kernels.csv",
schema={
"date": pl.Date,
"kernel": pl.String,
@ -400,7 +389,9 @@ def _():
)
kernel_df_v99 = (
kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date")
kernel_df_lazy.filter(pl.col("major_ver") == 99)
.collect(engine="streaming")
.select("date")
)
kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
@ -509,9 +500,14 @@ def _():
@app.cell
def _(sizes_df_raw):
sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
sizes_df_null.select(["date", "size"]).style.tab_header(
def _(sizes_df: pl.DataFrame):
date_range = pl.date_range(
sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
)
pl.DataFrame().select(date_range).filter(
~date_range.is_in(sizes_df["date"].implode())
).style.tab_header(
title="Missing Days",
subtitle="Days with 0B size due to missing on the popcorn server.",
)
@ -521,18 +517,16 @@ def _(sizes_df_raw):
@app.cell
def _(sizes_df):
def _():
different_modification_date = sizes_df.with_columns(
pl.col("modified")
.str.to_datetime(format="%F %T %:z", strict=False)
.alias("modified_dt"),
).filter(pl.col("date") != pl.col("modified_dt").dt.date())
different_modification_date = sizes_df.filter(
pl.col("date") != pl.col("modified").dt.date()
)
# This does not work well: what are we actually showing?
# The 'true' capture date goes on X, but then what goes on Y -
# the same date for each? The difference in dt?
return (
lp.ggplot(
different_modification_date,
lp.aes("date", "modified_dt"),
lp.aes("date", "modified"),
)
+ lp.geom_freqpoly()
)
@ -543,6 +537,8 @@ def _(sizes_df):
# further ideas:
#
# _relative_ package amounts: absolute package counts / absolute unique installs
#
# - daily download habits:
# - are we downloading a wider spread of versions on specific days?
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?