Modify popcorn notebook to run with new data structure
parent
9628d7a4d8
commit
728ec37bda
1 changed file with 87 additions and 91 deletions
@@ -12,10 +12,8 @@ with app.setup:
     import marimo as mo
     import polars as pl

-    LIMIT_ROWS = 50_000
-    DATA_RAW_DIR = "data/raw"
-    DATA_CLEAN_DIR = "data/cleaned"
-    DATA_PARQUET_DIR = "data/parquet"
+    LIMIT_ROWS = 500_000
+    DATA_DIR = "input/popcorn/output"


 @app.cell(hide_code=True)
@@ -29,45 +27,25 @@ def _():
     return


-# run data prep
-@app.cell
-def _():
-    import clean
-
-    clean.json_to_daily_pkg(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
-    )
-    clean.json_to_unique_csv(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
-    )
-    clean.json_to_daily_kernel_csv(
-        Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
-    )
-
-
 @app.cell
 def _():
-    def parse_size(size_str):
-        try:
-            return float(re.search(r"(\d+.?\d+) kB", size_str).group(1))  # pyright: ignore[reportOptionalMemberAccess]
-        except AttributeError:
-            return None
-
-    sizes_df_raw = (
-        pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
-        .with_columns(
-            pl.col("name")
-            .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
-            .str.to_date()
-            .alias("date"),
-            pl.col("size")
-            .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
-            .alias("size_num"),
-        )
-        .select(["date", "size_num", "size", "modified"])
-    )
-    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
-    return sizes_df, sizes_df_raw
+    sizes_df = (
+        pl.read_csv(
+            f"{DATA_DIR}/files.csv",
+            schema={
+                "date": pl.Date,
+                "filename": pl.String,
+                "mtime": pl.Float32,
+                "filesize": pl.UInt32,
+            },
+        )
+        .with_columns(
+            (pl.col("filesize") / 1024).alias("filesize_kb"),
+            pl.from_epoch("mtime").alias("modified"),
+        )
+        .select(["date", "filesize", "filesize_kb", "modified"])
+    )
+    return sizes_df


 @app.cell(hide_code=True)
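A note on the replacement cell above: pl.from_epoch interprets the numeric mtime column as Unix epoch seconds by default and yields a datetime column, and the / 1024 division gives kilobytes as floats. A minimal sketch with made-up values (this frame is illustrative, not from the data):

    import polars as pl

    df = pl.DataFrame({"mtime": [1_700_000_000], "filesize": [2048]})
    out = df.with_columns(
        pl.from_epoch("mtime").alias("modified"),          # epoch seconds -> pl.Datetime
        (pl.col("filesize") / 1024).alias("filesize_kb"),  # bytes -> kB, here 2.0
    )
    print(out)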
@@ -104,13 +82,14 @@ def _():
 @app.cell
 def _(sizes_df):
     (
-        lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
+        lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb"))
         + lp.geom_point()
         + lp.geom_smooth(method="lm")
         + lp.labs(
             title="Size growth",
             subtitle="Size of daily popcorn statistics files over time",
             caption="Raw JSON file size, before any formatting or removal of markers, characters, or newlines.",
+            y="filesize in kB",
         )
     )
     return
@@ -141,20 +120,16 @@ def _():

 @app.cell
 def _():
-    df_pkg_lazy = (
-        pl.scan_csv(
-            f"{DATA_CLEAN_DIR}/daily/*.csv",
-            include_file_paths="file",
-            schema={
-                "date": pl.Date,
-                "package": pl.String,
-                "downloads": pl.UInt16,
-            },
-        )
-        .drop("file")
-        .fill_null(0)
-    )
+    df_pkg_lazy = pl.scan_csv(
+        f"{DATA_DIR}/packages.csv",
+        schema={
+            "date": pl.Date,
+            "package": pl.String,
+            "version": pl.String,
+            "count": pl.UInt16,
+        },
+    )
     if LIMIT_ROWS:  # NOTE: this is only for debugging purposes
         df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
     # give small df preview
     df_pkg_lazy.head(100).collect(engine="streaming")
@@ -183,18 +158,22 @@ def _():
 def _(df_pkg_lazy: pl.LazyFrame):
     def _():
         weekly_packages = (
-            df_pkg_lazy.sort("date")
-            .group_by_dynamic("date", every="1w")
-            .agg(pl.col("downloads").sum())
+            df_pkg_lazy
+            # .sort("date")
+            .group_by_dynamic("date", every="1w")
+            .agg(pl.col("count").sum())
+            .sort("date")
         )
         return (
-            lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads"))
+            lp.ggplot(
+                weekly_packages.collect(engine="streaming"), lp.aes("date", "count")
+            )
             + lp.geom_line()
             + lp.geom_smooth(method="loess")
             + lp.labs(
                 title="Weekly package ownership",
-                caption="Count of all installed packages aggregated for each week",
+                subtitle="Count of all installed packages aggregated for each week",
                 y="number of packages",
             )
         )
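One caveat with this hunk: polars' group_by_dynamic assumes its index column is sorted ascending, so commenting out the leading .sort("date") and sorting only after the aggregation is safe only if packages.csv is already written in date order. A minimal sketch of the safe pattern, on a hypothetical frame:

    import polars as pl
    from datetime import date

    lf = pl.LazyFrame({
        "date": [date(2024, 1, 8), date(2024, 1, 1), date(2024, 1, 2)],
        "count": [3, 1, 2],
    })
    weekly = (
        lf.sort("date")  # group_by_dynamic expects a sorted index column
        .group_by_dynamic("date", every="1w")
        .agg(pl.col("count").sum())
        .collect()
    )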
@@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
             .alias("weekday")
         )
         return (
-            lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
+            lp.ggplot(
+                weekday_downloads.collect(engine="streaming"),
+                lp.aes("weekday", "count"),
+            )
             + lp.geom_bar()
             + lp.labs(
-                title="Weekday downloads",
-                caption="Downloads aggregated per day of the week they took place.",
+                title="Ownership per weekday",
+                caption="Package ownership per day of the week over all time",
             )
         )
@@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
             pl.col("date").dt.month().alias("month")
         )
         return (
-            lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
+            lp.ggplot(
+                month_agg_downloads.collect(engine="streaming"),
+                lp.aes("month", "count"),
+            )
             + lp.geom_bar()
             + lp.labs(
-                title="Monthwise downloads",
-                caption="Downloads aggregated per month of the year.",
+                title="Monthwise ownership",
+                caption="Package ownership per month of the year over all time",
             )
         )
@@ -271,7 +256,7 @@ def _():
     (
         lp.ggplot(
             pl.read_csv(
-                f"{DATA_CLEAN_DIR}/unique_installs.csv",
+                f"{DATA_DIR}/unique_installs.csv",
                 schema={"date": pl.Date, "unique": pl.UInt16},
             ),
             lp.aes("date", "unique"),
@@ -288,7 +273,7 @@ def _():

 @app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
-    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
+    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())

     def _():
         DISPLAY_LIMIT = 20
@@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame):
         return lp.gggrid(
             [
                 lp.ggplot(
-                    df_pkg_dl.sort("downloads", descending=True)
+                    df_pkg_dl.sort("count", descending=True)
                     .head(DISPLAY_LIMIT)
                     .collect(engine="streaming"),
-                    lp.aes("package", "downloads"),
+                    lp.aes("package", "count"),
                 )
                 + lp.geom_bar(stat="identity")
                 + lp.labs(
@@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame):
                     caption="Most installed packages over all time",
                 ),
                 lp.ggplot(
-                    df_pkg_dl.sort("downloads", descending=False)
+                    df_pkg_dl.sort("count", descending=False)
                     # this seems arbitrary but gives a better result?
                     .head(DISPLAY_LIMIT)
                     .collect(engine="streaming"),
-                    lp.aes("package", "downloads"),
+                    lp.aes("package", "count"),
                 )
                 + lp.geom_bar(stat="identity")
                 + lp.labs(
@@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
 def _(df_pkg_dl: pl.LazyFrame):
     def _():
         return (
-            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads"))
+            lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
             + lp.geom_freqpoly(stat="bin")
             + lp.labs(
                 title="Package installation count distribution",
@@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame):
     def get_num(df: pl.LazyFrame) -> int:
         return df.count().collect(engine="streaming").item(0, 0)

-    one_install = df_pkg_dl.sort("downloads", descending=False).filter(
-        pl.col("downloads") == 1
-    )
-    two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
-        (pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
-    )
-    three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
-        (pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
-    )
+    one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 1) & (pl.col("count") < 10)
+    )
+    ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 10) & (pl.col("count") < 20)
+    )
+    twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
+        (pl.col("count") >= 20) & (pl.col("count") < 30)
+    )
+    thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
+        pl.col("count") >= 30
+    )
     return mo.md(rf"""

-    There are {get_num(one_install)} packages which have exactly a single
-    installation in the data, {get_num(two_installs)} packages with exactly
-    two installations, and {get_num(three_installs)} packages with exactly
-    three.
+    There are {get_num(one_ten_installs):,} packages with between 1 and 9
+    installations in the data, {get_num(ten_twenty_installs):,} packages
+    with between 10 and 19 installations, and {get_num(twenty_thirty):,}
+    packages with between 20 and 29 installations.
+    {get_num(thirty_plus):,} packages have 30 or more installations.

     """)
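Each get_num call above collects separately, so the four buckets cost four passes over df_pkg_dl. If that ever matters, a single-pass alternative is Expr.cut with the same bucket edges; a sketch assuming the count column from this cell:

    bucket_counts = (
        df_pkg_dl.with_columns(
            pl.col("count")
            .cut(breaks=[10, 20, 30], labels=["1-9", "10-19", "20-29", "30+"], left_closed=True)
            .alias("bucket")
        )
        .group_by("bucket")
        .agg(pl.len())
        .collect(engine="streaming")
    )

left_closed=True makes each interval closed on the left ([10, 20) and so on), matching the >=/< filters above.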
@@ -381,7 +370,7 @@ def _():
     def _():
         kernel_df_lazy = (
             pl.scan_csv(
-                f"{DATA_CLEAN_DIR}/kernels/*.csv",
+                f"{DATA_DIR}/kernels.csv",
                 schema={
                     "date": pl.Date,
                     "kernel": pl.String,
@@ -400,7 +389,9 @@ def _():
         )

         kernel_df_v99 = (
-            kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date")
+            kernel_df_lazy.filter(pl.col("major_ver") == 99)
+            .collect(engine="streaming")
+            .select("date")
         )
         kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
@@ -509,9 +500,14 @@ def _():


 @app.cell
-def _(sizes_df_raw):
-    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
-    sizes_df_null.select(["date", "size"]).style.tab_header(
+def _(sizes_df: pl.DataFrame):
+    date_range = pl.date_range(
+        sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
+    )
+
+    pl.DataFrame().select(date_range).filter(
+        ~date_range.is_in(sizes_df["date"].implode())
+    ).style.tab_header(
         title="Missing Days",
-        subtitle="Days with 0B size due to missing on the popcorn server.",
+        subtitle="Days missing on the popcorn server (0 B file size).",
     )
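The same missing-days set can also be written as an anti-join, avoiding the implode/is_in detour; a sketch assuming sizes_df has one row per present date:

    all_days = pl.DataFrame().select(
        pl.date_range(sizes_df["date"].min(), sizes_df["date"].max()).alias("date")
    )
    missing = all_days.join(sizes_df.select("date"), on="date", how="anti")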
@@ -521,18 +517,16 @@ def _(sizes_df):
 @app.cell
 def _(sizes_df):
     def _():
-        different_modification_date = sizes_df.with_columns(
-            pl.col("modified")
-            .str.to_datetime(format="%F %T %:z", strict=False)
-            .alias("modified_dt"),
-        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
+        different_modification_date = sizes_df.filter(
+            pl.col("date") != pl.col("modified").dt.date()
+        )
         # This does not work well; what are we showing? The 'true' capture
         # date on X, but then what on Y: the same date for each? The
         # difference in dt?
         return (
             lp.ggplot(
                 different_modification_date,
-                lp.aes("date", "modified_dt"),
+                lp.aes("date", "modified"),
             )
             + lp.geom_freqpoly()
         )
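One possible answer to the question in those comments: plot the lag between the capture date and the modification time, which gives one number per file for the Y axis. A hedged sketch (lag_df and lag_days are hypothetical names):

    lag_df = different_modification_date.with_columns(
        (pl.col("modified").dt.date() - pl.col("date")).dt.total_days().alias("lag_days")
    )
    plot = lp.ggplot(lag_df, lp.aes("date", "lag_days")) + lp.geom_point()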
@@ -543,6 +537,8 @@ def _(sizes_df):

 # further ideas:
 #
+# _relative_ package amounts: absolute package counts / absolute unique installs
+#
 # - daily download habits:
 #   - are we downloading further spread of versions on specific days
 #   - are there 'update' days, where things converge? specific weekday/on holidays/etc?
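A sketch of the relative-package-amounts idea, wiring together pieces that already exist in this commit (the daily_totals and relative names are hypothetical): divide the daily package totals from df_pkg_lazy by the daily unique installs.

    daily_totals = df_pkg_lazy.group_by("date").agg(pl.col("count").sum())
    uniques = pl.scan_csv(
        f"{DATA_DIR}/unique_installs.csv",
        schema={"date": pl.Date, "unique": pl.UInt16},
    )
    relative = daily_totals.join(uniques, on="date").with_columns(
        (pl.col("count") / pl.col("unique")).alias("relative_count")
    )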