Finish package stats section

This commit is contained in:
Marty Oehme 2025-10-08 15:09:08 +02:00
parent 707632fb7d
commit 9687eb662b
Signed by: Marty
GPG key ID: 4E535BC19C61886E
2 changed files with 255 additions and 32 deletions

View file

@ -82,10 +82,31 @@ def plt_filesize(sizes_df):
+ lp.geom_point()
+ lp.geom_smooth(method="lm")
+ lp.labs(
title="Size growth",
subtitle="Cumulative filesize of daily popcorn statistics over time",
title="Report size",
subtitle="Filesize of popcorn statistics reports each day",
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
y="filesize in kB",
y="filesize in KB",
)
)
return
@app.cell
def plt_filesize_cumulative(sizes_df: pl.DataFrame):
    # Line plot of the running total of report file sizes over time.
    # Reads the "date" and "filesize" columns of sizes_df. The running total
    # is divided by 1024 twice before plotting and labelled as MB
    # (presumably "filesize" is in bytes -- confirm against the loader).
    (
        lp.ggplot(
            # cum_sum() accumulates in row order, so put the rows in
            # chronological order first; otherwise the running total would
            # not line up with the date axis.
            sizes_df.sort("date").with_columns(
                (pl.col("filesize").cum_sum() / 1024 / 1024).alias("filesize_cum")
            ),
            lp.aes(x="date", y="filesize_cum"),
        )
        + lp.geom_line()
        # + lp.geom_smooth(method="lm")
        + lp.labs(
            title="Report size growth",
            subtitle="Cumulative filesize of all popcorn statistics reports up to that day",
            caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
            y="filesize in MB",
        )
    )
    return
@ -293,14 +314,17 @@ def plt_unique_installs():
)
(
lp.ggplot(
df_unique_installs,
df_unique_installs.sort("date")
.group_by_dynamic("date", every="1w")
.agg(pl.col("unique").mean()),
lp.aes("date", "unique"),
)
+ lp.geom_line()
+ lp.geom_smooth()
+ lp.geom_smooth(method="loess")
+ lp.labs(
title="Unique daily uploads",
caption="Daily number of unique providers for package update statistics opting in to popcorn.",
title="Unique installations",
subtitle="Weekly statistics upload averages",
caption="Daily number of unique providers for package update statistics opting in to data collection.",
)
)
return
@ -317,6 +341,7 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
[
lp.ggplot(
df_pkg_dl.sort("count", descending=True)
.filter(pl.col("package") != "PopCorn")
.head(DISPLAY_LIMIT)
.collect(engine="streaming"),
lp.aes("package", "count"),
@ -346,6 +371,16 @@ def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
return
@app.cell
def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
    # Table of the rows of df_pkg_dl whose "count" is exactly 1.
    # Underscore-prefixed names keep the intermediates local to this cell.
    _ascending = df_pkg_dl.sort("count")
    # NOTE(review): the cut-off of exactly 1 was flagged by the author as
    # seemingly arbitrary but giving a better result; kept unchanged.
    _singletons = _ascending.filter(pl.col("count").eq(1))
    _singletons.collect(engine="streaming")
    return
@app.cell(hide_code=True)
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
def _():
@ -360,6 +395,28 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
_()
return
@app.cell
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
    # Table of how many rows of df_pkg_dl fall into each "count" bucket:
    # 1-9, 10-19, 20-29 and 30 or more.

    def get_num(df: pl.LazyFrame) -> int:
        # Number of rows in df. pl.len() counts rows directly, whereas
        # LazyFrame.count() reports non-null values per column and would
        # undercount if the first column contained nulls.
        return df.select(pl.len()).collect(engine="streaming").item(0, 0)

    # No sorting is needed: only the bucket sizes are used, and a row count
    # does not depend on row order.
    one_ten_installs = df_pkg_dl.filter(
        (pl.col("count") >= 1) & (pl.col("count") < 10)
    )
    ten_twenty_installs = df_pkg_dl.filter(
        (pl.col("count") >= 10) & (pl.col("count") < 20)
    )
    twenty_thirty = df_pkg_dl.filter(
        (pl.col("count") >= 20) & (pl.col("count") < 30)
    )
    thirty_plus = df_pkg_dl.filter(pl.col("count") >= 30)

    # Label every bucket so the displayed table is readable on its own.
    # NOTE(review): the "packages" heading assumes one row per package in
    # df_pkg_dl -- confirm against the cell that builds it.
    pl.DataFrame(
        {
            "installs": ["1-9", "10-19", "20-29", "30+"],
            "packages": [
                get_num(one_ten_installs),
                get_num(ten_twenty_installs),
                get_num(twenty_thirty),
                get_num(thirty_plus),
            ],
        }
    )
    return
@app.cell(hide_code=True)