Add package install count distribution

This commit is contained in:
Marty Oehme 2025-09-30 08:14:46 +02:00
parent ad50b19631
commit 9d64e93486
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -293,32 +293,85 @@ def _():
# Cell: sums "downloads" per "package" and renders two bar charts in a
# single-column grid: the packages with the highest totals and the packages
# with the lowest totals.
# NOTE(review): this span appears to interleave removed and added lines of a
# diff hunk whose +/- markers and indentation were lost (two `df_pkg_dl = ...`
# assignments, an unterminated `lp.gggrid(` call before the nested `def _():`).
# Confirm against the actual notebook file before editing any code here.
@app.cell
def _(df_pkg_lazy: pl.LazyFrame):
DISPLAY_TOP = 20
# Eager variant: the aggregation is collected immediately.
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()
# Lazy variant: the aggregation stays lazy; `.collect()` is called per plot below.
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
lp.gggrid(
[
lp.ggplot(
df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
lp.aes("package", "downloads"),
)
+ lp.geom_bar(stat="identity")
# Nested helper that builds and returns the two-plot grid; the row limit is
# a local constant of the helper.
def _():
DISPLAY_LIMIT = 20
return lp.gggrid(
[
# Plot 1: rows with the largest download totals (sorted descending).
lp.ggplot(
df_pkg_dl.sort("downloads", descending=True)
.head(DISPLAY_LIMIT)
.collect(),
lp.aes("package", "downloads"),
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Top packages",
caption="Most installed packages over all time",
),
# Plot 2: rows with the smallest download totals (sorted ascending).
lp.ggplot(
df_pkg_dl.sort("downloads", descending=False)
# this seems arbitrary but gives a better result?
.head(DISPLAY_LIMIT)
.collect(),
lp.aes("package", "downloads"),
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Rare packages",
caption="Least often installed packages",
),
],
# Stack the two plots vertically.
ncol=1,
)
_()
return
# Cell: plots the distribution of per-package download totals as a binned
# frequency polygon titled "Package installation count distribution".
# NOTE(review): this span appears to interleave removed and added lines of a
# diff hunk whose +/- markers and indentation were lost. The bar-chart pieces
# below ("Top packages" / "Rarest packages" labels, `DISPLAY_TOP`, `ncol=1`)
# look like leftovers of the replaced code — confirm against the actual
# notebook file before editing any code here.
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
# Nested helper that builds and returns the plot object.
def _():
return (
# The lazy frame is collected here; only "downloads" is mapped to an aesthetic.
lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
+ lp.geom_freqpoly(stat="bin")
+ lp.labs(
title="Top packages",
caption="Most updated packages over all time",
),
lp.ggplot(
df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
lp.aes("package", "downloads"),
title="Package installation count distribution",
)
+ lp.geom_bar(stat="identity")
+ lp.labs(
title="Rarest packages",
caption="Least updated packages over all time",
),
],
ncol=1,
)
)
_()
return
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
    def _():
        """Render a markdown summary of how many packages are rarely installed.

        Packages are bucketed by their summed "downloads" value into three
        ranges: exactly 1, 2 to 9, and 10 to 19 (all bounds inclusive).
        """

        def get_num(df: pl.LazyFrame) -> int:
            """Return the count reported for the first column of *df*."""
            return df.count().collect().item(0, 0)

        downloads = pl.col("downloads")

        # No sorting here: only the number of matching rows is used, and row
        # order does not influence a count.
        single_install = df_pkg_dl.filter(downloads == 1)
        two_to_nine = df_pkg_dl.filter((downloads >= 2) & (downloads < 10))
        ten_to_nineteen = df_pkg_dl.filter((downloads >= 10) & (downloads < 20))

        # The wording below mirrors the filter bounds above; keep both in sync
        # when changing either one.
        return mo.md(rf"""
        There are {get_num(single_install)} packages which have exactly a single
        installation in the data, {get_num(two_to_nine)} packages with between
        two and nine installations, and {get_num(ten_to_nineteen)} packages with
        between ten and nineteen installations.
        """)

    _()
    return