Modify popcorn notebook to run with new data structure
This commit is contained in:
parent
9628d7a4d8
commit
728ec37bda
1 changed files with 87 additions and 91 deletions
|
|
@ -12,10 +12,8 @@ with app.setup:
|
||||||
import marimo as mo
|
import marimo as mo
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
LIMIT_ROWS = 50_000
|
LIMIT_ROWS = 500_000
|
||||||
DATA_RAW_DIR = "data/raw"
|
DATA_DIR = "input/popcorn/output"
|
||||||
DATA_CLEAN_DIR = "data/cleaned"
|
|
||||||
DATA_PARQUET_DIR = "data/parquet"
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
|
|
@ -29,45 +27,25 @@ def _():
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# run data prep
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _():
|
||||||
import clean
|
sizes_df = (
|
||||||
|
pl.read_csv(
|
||||||
clean.json_to_daily_pkg(
|
f"{DATA_DIR}/files.csv",
|
||||||
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
|
schema={
|
||||||
|
"date": pl.Date,
|
||||||
|
"filename": pl.String,
|
||||||
|
"mtime": pl.Float32,
|
||||||
|
"filesize": pl.UInt32,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
clean.json_to_unique_csv(
|
|
||||||
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
|
|
||||||
)
|
|
||||||
clean.json_to_daily_kernel_csv(
|
|
||||||
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _():
|
|
||||||
def parse_size(size_str):
|
|
||||||
try:
|
|
||||||
return float(re.search(r"(\d+.?\d+) kB", size_str).group(1)) # pyright: ignore[reportOptionalMemberAccess]
|
|
||||||
except AttributeError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
sizes_df_raw = (
|
|
||||||
pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
|
|
||||||
.with_columns(
|
.with_columns(
|
||||||
pl.col("name")
|
(pl.col("filesize") / 1024).alias("filesize_kb"),
|
||||||
.str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
|
pl.from_epoch("mtime").alias("modified"),
|
||||||
.str.to_date()
|
|
||||||
.alias("date"),
|
|
||||||
pl.col("size")
|
|
||||||
.map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
|
|
||||||
.alias("size_num"),
|
|
||||||
)
|
)
|
||||||
.select(["date", "size_num", "size", "modified"])
|
.select(["date", "filesize", "filesize_kb", "modified"])
|
||||||
)
|
)
|
||||||
sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
|
return sizes_df
|
||||||
return sizes_df, sizes_df_raw
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
|
|
@ -104,13 +82,14 @@ def _():
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(sizes_df):
|
def _(sizes_df):
|
||||||
(
|
(
|
||||||
lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
|
lp.ggplot(sizes_df, lp.aes(x="date", y="filesize_kb"))
|
||||||
+ lp.geom_point()
|
+ lp.geom_point()
|
||||||
+ lp.geom_smooth(method="lm")
|
+ lp.geom_smooth(method="lm")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Size growth",
|
title="Size growth",
|
||||||
subtitle="Size of daily popcorn statistics files over time",
|
subtitle="Size of daily popcorn statistics files over time",
|
||||||
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
|
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
|
||||||
|
y="filesize in kB",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
@ -141,19 +120,15 @@ def _():
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _():
|
||||||
df_pkg_lazy = (
|
df_pkg_lazy = pl.scan_csv(
|
||||||
pl.scan_csv(
|
f"{DATA_DIR}/packages.csv",
|
||||||
f"{DATA_CLEAN_DIR}/daily/*.csv",
|
|
||||||
include_file_paths="file",
|
|
||||||
schema={
|
schema={
|
||||||
"date": pl.Date,
|
"date": pl.Date,
|
||||||
"package": pl.String,
|
"package": pl.String,
|
||||||
"downloads": pl.UInt16,
|
"version": pl.String,
|
||||||
|
"count": pl.UInt16,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.drop("file")
|
|
||||||
.fill_null(0)
|
|
||||||
)
|
|
||||||
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
|
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
|
||||||
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
|
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
|
||||||
# give small df preview
|
# give small df preview
|
||||||
|
|
@ -183,18 +158,22 @@ def _():
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
weekly_packages = (
|
weekly_packages = (
|
||||||
df_pkg_lazy.sort("date")
|
df_pkg_lazy
|
||||||
|
# .sort("date")
|
||||||
.group_by_dynamic("date", every="1w")
|
.group_by_dynamic("date", every="1w")
|
||||||
.agg(pl.col("downloads").sum())
|
.agg(pl.col("count").sum())
|
||||||
.sort("date")
|
.sort("date")
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(weekly_packages.collect(engine="streaming"), lp.aes("date", "downloads"))
|
lp.ggplot(
|
||||||
|
weekly_packages.collect(engine="streaming"), lp.aes("date", "count")
|
||||||
|
)
|
||||||
+ lp.geom_line()
|
+ lp.geom_line()
|
||||||
+ lp.geom_smooth(method="loess")
|
+ lp.geom_smooth(method="loess")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Weekly package ownership",
|
title="Weekly package ownership",
|
||||||
caption="Count of all installed packages aggregated for each week",
|
subtitle="Count of all installed packages aggregated for each week",
|
||||||
|
y="number of packages",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -235,11 +214,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
.alias("weekday")
|
.alias("weekday")
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(weekday_downloads.collect(engine="streaming"), lp.aes("weekday", "downloads"))
|
lp.ggplot(
|
||||||
|
weekday_downloads.collect(engine="streaming"),
|
||||||
|
lp.aes("weekday", "count"),
|
||||||
|
)
|
||||||
+ lp.geom_bar()
|
+ lp.geom_bar()
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Weekday downloads",
|
title="Ownership per weekday",
|
||||||
caption="Downloads aggregated per day of the week they took place.",
|
caption="Package ownership per day of the week over all time",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -254,11 +236,14 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
pl.col("date").dt.month().alias("month")
|
pl.col("date").dt.month().alias("month")
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(month_agg_downloads.collect(engine="streaming"), lp.aes("month", "downloads"))
|
lp.ggplot(
|
||||||
|
month_agg_downloads.collect(engine="streaming"),
|
||||||
|
lp.aes("month", "count"),
|
||||||
|
)
|
||||||
+ lp.geom_bar()
|
+ lp.geom_bar()
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Monthwise downloads",
|
title="Monthwise ownership",
|
||||||
caption="Downloads aggregated per month of the year.",
|
caption="Package ownership per month of the year over all time",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -271,7 +256,7 @@ def _():
|
||||||
(
|
(
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
pl.read_csv(
|
pl.read_csv(
|
||||||
f"{DATA_CLEAN_DIR}/unique_installs.csv",
|
f"{DATA_DIR}/unique_installs.csv",
|
||||||
schema={"date": pl.Date, "unique": pl.UInt16},
|
schema={"date": pl.Date, "unique": pl.UInt16},
|
||||||
),
|
),
|
||||||
lp.aes("date", "unique"),
|
lp.aes("date", "unique"),
|
||||||
|
|
@ -288,7 +273,7 @@ def _():
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
|
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())
|
||||||
|
|
||||||
def _():
|
def _():
|
||||||
DISPLAY_LIMIT = 20
|
DISPLAY_LIMIT = 20
|
||||||
|
|
@ -296,10 +281,10 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
return lp.gggrid(
|
return lp.gggrid(
|
||||||
[
|
[
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
df_pkg_dl.sort("downloads", descending=True)
|
df_pkg_dl.sort("count", descending=True)
|
||||||
.head(DISPLAY_LIMIT)
|
.head(DISPLAY_LIMIT)
|
||||||
.collect(engine="streaming"),
|
.collect(engine="streaming"),
|
||||||
lp.aes("package", "downloads"),
|
lp.aes("package", "count"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar(stat="identity")
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
|
|
@ -307,11 +292,11 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
caption="Most installed packages over all time",
|
caption="Most installed packages over all time",
|
||||||
),
|
),
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
df_pkg_dl.sort("downloads", descending=False)
|
df_pkg_dl.sort("count", descending=False)
|
||||||
# this seems arbitrary but gives a better result?
|
# this seems arbitrary but gives a better result?
|
||||||
.head(DISPLAY_LIMIT)
|
.head(DISPLAY_LIMIT)
|
||||||
.collect(engine="streaming"),
|
.collect(engine="streaming"),
|
||||||
lp.aes("package", "downloads"),
|
lp.aes("package", "count"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar(stat="identity")
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
|
|
@ -330,7 +315,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _(df_pkg_dl: pl.LazyFrame):
|
def _(df_pkg_dl: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
return (
|
return (
|
||||||
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("downloads"))
|
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
|
||||||
+ lp.geom_freqpoly(stat="bin")
|
+ lp.geom_freqpoly(stat="bin")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Package installation count distribution",
|
title="Package installation count distribution",
|
||||||
|
|
@ -347,22 +332,26 @@ def _(df_pkg_dl: pl.LazyFrame):
|
||||||
def get_num(df: pl.LazyFrame) -> int:
|
def get_num(df: pl.LazyFrame) -> int:
|
||||||
return df.count().collect(engine="streaming").item(0, 0)
|
return df.count().collect(engine="streaming").item(0, 0)
|
||||||
|
|
||||||
one_install = df_pkg_dl.sort("downloads", descending=False).filter(
|
one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
pl.col("downloads") == 1
|
(pl.col("count") >= 1) & (pl.col("count") < 10)
|
||||||
)
|
)
|
||||||
two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
|
ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
(pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
|
(pl.col("count") >= 10) & (pl.col("count") < 20)
|
||||||
)
|
)
|
||||||
three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
|
twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
(pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
|
(pl.col("count") >= 20) & (pl.col("count") < 30)
|
||||||
|
)
|
||||||
|
thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
|
(pl.col("count") >= 30)
|
||||||
)
|
)
|
||||||
# TODO: Fix for new filters above
|
# TODO: Fix for new filters above
|
||||||
return mo.md(rf"""
|
return mo.md(rf"""
|
||||||
|
|
||||||
There are {get_num(one_install)} packages which have exactly a single
|
There are {get_num(one_ten_installs):,} packages which have between one
|
||||||
installation in the data, {get_num(two_installs)} packages with exactly
|
and ten installations in the data, {get_num(ten_twenty_installs):,}
|
||||||
two installations, and {get_num(three_installs)} packages with exactly
|
packages between eleven and 20 installations, and
|
||||||
three.
|
{get_num(twenty_thirty):,} packages between 21 and 30 installations.
|
||||||
|
{get_num(thirty_plus):,} packages have over 30 installations.
|
||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
|
@ -381,7 +370,7 @@ def _():
|
||||||
def _():
|
def _():
|
||||||
kernel_df_lazy = (
|
kernel_df_lazy = (
|
||||||
pl.scan_csv(
|
pl.scan_csv(
|
||||||
f"{DATA_CLEAN_DIR}/kernels/*.csv",
|
f"{DATA_DIR}/kernels.csv",
|
||||||
schema={
|
schema={
|
||||||
"date": pl.Date,
|
"date": pl.Date,
|
||||||
"kernel": pl.String,
|
"kernel": pl.String,
|
||||||
|
|
@ -400,7 +389,9 @@ def _():
|
||||||
)
|
)
|
||||||
|
|
||||||
kernel_df_v99 = (
|
kernel_df_v99 = (
|
||||||
kernel_df_lazy.filter(pl.col("major_ver") == 99).collect(engine="streaming").select("date")
|
kernel_df_lazy.filter(pl.col("major_ver") == 99)
|
||||||
|
.collect(engine="streaming")
|
||||||
|
.select("date")
|
||||||
)
|
)
|
||||||
kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
|
kernel_df_lazy = kernel_df_lazy.filter(pl.col("major_ver") != 99)
|
||||||
|
|
||||||
|
|
@ -509,9 +500,14 @@ def _():
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(sizes_df_raw):
|
def _(sizes_df: pl.DataFrame):
|
||||||
sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
|
date_range = pl.date_range(
|
||||||
sizes_df_null.select(["date", "size"]).style.tab_header(
|
sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
|
||||||
|
)
|
||||||
|
|
||||||
|
pl.DataFrame().select(date_range).filter(
|
||||||
|
~date_range.is_in(sizes_df["date"].implode())
|
||||||
|
).style.tab_header(
|
||||||
title="Missing Days",
|
title="Missing Days",
|
||||||
subtitle="Days with 0B size due to missing on the popcorn server.",
|
subtitle="Days with 0B size due to missing on the popcorn server.",
|
||||||
)
|
)
|
||||||
|
|
@ -521,18 +517,16 @@ def _(sizes_df_raw):
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(sizes_df):
|
def _(sizes_df):
|
||||||
def _():
|
def _():
|
||||||
different_modification_date = sizes_df.with_columns(
|
different_modification_date = sizes_df.filter(
|
||||||
pl.col("modified")
|
pl.col("date") != pl.col("modified").dt.date()
|
||||||
.str.to_datetime(format="%F %T %:z", strict=False)
|
)
|
||||||
.alias("modified_dt"),
|
|
||||||
).filter(pl.col("date") != pl.col("modified_dt").dt.date())
|
|
||||||
# This does not work well what are we showing?
|
# This does not work well what are we showing?
|
||||||
# 'true' capture date on X but then what on Y - the
|
# 'true' capture date on X but then what on Y - the
|
||||||
# same date for each? the difference in dt?
|
# same date for each? the difference in dt?
|
||||||
return (
|
return (
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
different_modification_date,
|
different_modification_date,
|
||||||
lp.aes("date", "modified_dt"),
|
lp.aes("date", "modified"),
|
||||||
)
|
)
|
||||||
+ lp.geom_freqpoly()
|
+ lp.geom_freqpoly()
|
||||||
)
|
)
|
||||||
|
|
@ -543,6 +537,8 @@ def _(sizes_df):
|
||||||
|
|
||||||
# further ideas:
|
# further ideas:
|
||||||
#
|
#
|
||||||
|
# _relative_ package amounts: absolute packages counts / absolute unique installs
|
||||||
|
#
|
||||||
# - daily download habits:
|
# - daily download habits:
|
||||||
# - are we downloading further spread of versions on specific days
|
# - are we downloading further spread of versions on specific days
|
||||||
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
|
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue