From 9d64e93486cffc75190df2b88e935ccf5643458e Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 30 Sep 2025 08:14:46 +0200
Subject: [PATCH] Add package install count distribution

---
 popcorn.py | 99 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/popcorn.py b/popcorn.py
index b1ea51e..6ad3057 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -293,32 +293,85 @@ def _():
 
 @app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
-    DISPLAY_TOP = 20
-    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()
+    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
 
-    lp.gggrid(
-        [
-            lp.ggplot(
-                df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
-                lp.aes("package", "downloads"),
-            )
-            + lp.geom_bar(stat="identity")
+    def _():
+        DISPLAY_LIMIT = 20
+
+        return lp.gggrid(
+            [
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=True)
+                    .head(DISPLAY_LIMIT)
+                    .collect(),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Top packages",
+                    caption="Most installed packages over all time",
+                ),
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=False)
+                    # this seems arbitrary but gives a better result?
+                    .head(DISPLAY_LIMIT)
+                    .collect(),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Rare packages",
+                    caption="Least often installed packages",
+                ),
+            ],
+            ncol=1,
+        )
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        return (
+            lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
+            + lp.geom_freqpoly(stat="bin")
             + lp.labs(
-                title="Top packages",
-                caption="Most updated packages over all time",
-            ),
-            lp.ggplot(
-                df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
-                lp.aes("package", "downloads"),
+                title="Package installation count distribution",
             )
-            + lp.geom_bar(stat="identity")
-            + lp.labs(
-                title="Rarest packages",
-                caption="Least updated packages over all time",
-            ),
-        ],
-        ncol=1,
-    )
+        )
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        def get_num(df: pl.LazyFrame) -> int:
+            return df.count().collect().item(0, 0)
+
+        one_install = df_pkg_dl.sort("downloads", descending=False).filter(
+            pl.col("downloads") == 1
+        )
+        low_installs = df_pkg_dl.sort("downloads", descending=False).filter(
+            (pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
+        )
+        mid_installs = df_pkg_dl.sort("downloads", descending=False).filter(
+            (pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
+        )
+        # Buckets reported below: exactly 1, 2-9, and 10-19 installations.
+        return mo.md(rf"""
+
+        There are {get_num(one_install)} packages which have exactly a single
+        installation in the data, {get_num(low_installs)} packages with two to
+        nine installations, and {get_num(mid_installs)} packages with ten to
+        nineteen.
+
+        """)
+
+    _()
     return
 
 