Add package install count distribution

2025-09-30 08:14:46 +02:00 · 2025-09-30 08:14:46 +02:00 · 9d64e93486
commit 9d64e93486
parent ad50b19631
1 changed files with 76 additions and 23 deletions
--- a/popcorn.py
+++ b/popcorn.py
@ -293,32 +293,85 @@ def _():

@app.cell
 def _(df_pkg_lazy: pl.LazyFrame):
-    DISPLAY_TOP = 20
-    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()
+    df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())

-    lp.gggrid(
-        [
-            lp.ggplot(
-                df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
-                lp.aes("package", "downloads"),
-            )
-            + lp.geom_bar(stat="identity")
+    def _():
+        DISPLAY_LIMIT = 20
+
+        return lp.gggrid(
+            [
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=True)
+                    .head(DISPLAY_LIMIT)
+                    .collect(),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Top packages",
+                    caption="Most installed packages over all time",
+                ),
+                lp.ggplot(
+                    df_pkg_dl.sort("downloads", descending=False)
+                    # this seems arbitrary but gives a better result?
+                    .head(DISPLAY_LIMIT)
+                    .collect(),
+                    lp.aes("package", "downloads"),
+                )
+                + lp.geom_bar(stat="identity")
+                + lp.labs(
+                    title="Rare packages",
+                    caption="Least often installed packages",
+                ),
+            ],
+            ncol=1,
+        )
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        return (
+            lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
+            + lp.geom_freqpoly(stat="bin")
            + lp.labs(
-                title="Top packages",
-                caption="Most updated packages over all time",
-            ),
-            lp.ggplot(
-                df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
-                lp.aes("package", "downloads"),
+                title="Package installation count distribution",
            )
-            + lp.geom_bar(stat="identity")
-            + lp.labs(
-                title="Rarest packages",
-                caption="Least updated packages over all time",
-            ),
-        ],
-        ncol=1,
-    )
+        )
+
+    _()
+    return
+
+
+@app.cell(hide_code=True)
+def _(df_pkg_dl: pl.LazyFrame):
+    def _():
+        def get_num(df: pl.LazyFrame) -> int:
+            return df.count().collect().item(0, 0)
+
+        one_install = df_pkg_dl.sort("downloads", descending=False).filter(
+            pl.col("downloads") == 1
+        )
+        two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
+            (pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
+        )
+        three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
+            (pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
+        )
+        # TODO: Fix for new filters above
+        return mo.md(rf"""
+
+        There are {get_num(one_install)} packages which have exactly a single
+        installation in the data, {get_num(two_installs)} packages with exactly
+        two installations, and {get_num(three_installs)} packages with exactly
+        three.
+
+        """)
+
+    _()
    return