Add package install count distribution
This commit is contained in:
parent
ad50b19631
commit
9d64e93486
1 changed file with 76 additions and 23 deletions
99
popcorn.py
99
popcorn.py
|
|
@@ -293,32 +293,85 @@ def _():
|
|||
|
||||
@app.cell
|
||||
def _(df_pkg_lazy: pl.LazyFrame):
|
||||
DISPLAY_TOP = 20
|
||||
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()
|
||||
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
|
||||
|
||||
lp.gggrid(
|
||||
[
|
||||
lp.ggplot(
|
||||
df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
|
||||
lp.aes("package", "downloads"),
|
||||
)
|
||||
+ lp.geom_bar(stat="identity")
|
||||
def _():
|
||||
DISPLAY_LIMIT = 20
|
||||
|
||||
return lp.gggrid(
|
||||
[
|
||||
lp.ggplot(
|
||||
df_pkg_dl.sort("downloads", descending=True)
|
||||
.head(DISPLAY_LIMIT)
|
||||
.collect(),
|
||||
lp.aes("package", "downloads"),
|
||||
)
|
||||
+ lp.geom_bar(stat="identity")
|
||||
+ lp.labs(
|
||||
title="Top packages",
|
||||
caption="Most installed packages over all time",
|
||||
),
|
||||
lp.ggplot(
|
||||
df_pkg_dl.sort("downloads", descending=False)
|
||||
# this seems arbitrary but gives a better result?
|
||||
.head(DISPLAY_LIMIT)
|
||||
.collect(),
|
||||
lp.aes("package", "downloads"),
|
||||
)
|
||||
+ lp.geom_bar(stat="identity")
|
||||
+ lp.labs(
|
||||
title="Rare packages",
|
||||
caption="Least often installed packages",
|
||||
),
|
||||
],
|
||||
ncol=1,
|
||||
)
|
||||
|
||||
_()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(df_pkg_dl: pl.LazyFrame):
|
||||
def _():
|
||||
return (
|
||||
lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
|
||||
+ lp.geom_freqpoly(stat="bin")
|
||||
+ lp.labs(
|
||||
title="Top packages",
|
||||
caption="Most updated packages over all time",
|
||||
),
|
||||
lp.ggplot(
|
||||
df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
|
||||
lp.aes("package", "downloads"),
|
||||
title="Package installation count distribution",
|
||||
)
|
||||
+ lp.geom_bar(stat="identity")
|
||||
+ lp.labs(
|
||||
title="Rarest packages",
|
||||
caption="Least updated packages over all time",
|
||||
),
|
||||
],
|
||||
ncol=1,
|
||||
)
|
||||
)
|
||||
|
||||
_()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(df_pkg_dl: pl.LazyFrame):
    # Summarise how many packages fall into the lowest install-count buckets.
    # `df_pkg_dl` is a lazy frame with a summed "downloads" column; one row is
    # treated as one package here.
    def _():
        def get_num(df: pl.LazyFrame) -> int:
            """Return the count reported for the first column of ``df``."""
            # LazyFrame.count() yields a single row of per-column counts;
            # item(0, 0) picks the value for the first column.
            return df.count().collect().item(0, 0)

        downloads = pl.col("downloads")

        # Sorting is irrelevant for counting, so the frames are only filtered.
        single_install = df_pkg_dl.filter(downloads == 1)
        few_installs = df_pkg_dl.filter((downloads >= 2) & (downloads < 10))
        some_installs = df_pkg_dl.filter((downloads >= 10) & (downloads < 20))

        # The wording below mirrors the filter bounds above (2-9 and 10-19);
        # keep both in sync when changing the buckets.
        return mo.md(rf"""

        There are {get_num(single_install)} packages which have exactly a single
        installation in the data, {get_num(few_installs)} packages with between
        2 and 9 installations, and {get_num(some_installs)} packages with between
        10 and 19 installations.

        """)

    # Last expression of the cell: marimo renders its value as the cell output.
    _()
    return
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue