Add package install count distribution
This commit is contained in:
parent
ad50b19631
commit
9d64e93486
1 changed file with 76 additions and 23 deletions
69
popcorn.py
69
popcorn.py
|
|
@ -293,32 +293,85 @@ def _():
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
DISPLAY_TOP = 20
|
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum())
|
||||||
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("downloads").sum()).collect()
|
|
||||||
|
|
||||||
lp.gggrid(
|
def _():
|
||||||
|
DISPLAY_LIMIT = 20
|
||||||
|
|
||||||
|
return lp.gggrid(
|
||||||
[
|
[
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
df_pkg_dl.sort("downloads", descending=True).head(DISPLAY_TOP),
|
df_pkg_dl.sort("downloads", descending=True)
|
||||||
|
.head(DISPLAY_LIMIT)
|
||||||
|
.collect(),
|
||||||
lp.aes("package", "downloads"),
|
lp.aes("package", "downloads"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar(stat="identity")
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Top packages",
|
title="Top packages",
|
||||||
caption="Most updated packages over all time",
|
caption="Most installed packages over all time",
|
||||||
),
|
),
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
df_pkg_dl.sort("downloads", descending=False).head(DISPLAY_TOP),
|
df_pkg_dl.sort("downloads", descending=False)
|
||||||
|
# this seems arbitrary but gives a better result?
|
||||||
|
.head(DISPLAY_LIMIT)
|
||||||
|
.collect(),
|
||||||
lp.aes("package", "downloads"),
|
lp.aes("package", "downloads"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar(stat="identity")
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Rarest packages",
|
title="Rare packages",
|
||||||
caption="Least updated packages over all time",
|
caption="Least often installed packages",
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
ncol=1,
|
ncol=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(df_pkg_dl: pl.LazyFrame):
|
||||||
|
def _():
|
||||||
|
return (
|
||||||
|
lp.ggplot(df_pkg_dl.collect(), lp.aes("downloads"))
|
||||||
|
+ lp.geom_freqpoly(stat="bin")
|
||||||
|
+ lp.labs(
|
||||||
|
title="Package installation count distribution",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
_()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(df_pkg_dl: pl.LazyFrame):
|
||||||
|
def _():
|
||||||
|
def get_num(df: pl.LazyFrame) -> int:
|
||||||
|
return df.count().collect().item(0, 0)
|
||||||
|
|
||||||
|
one_install = df_pkg_dl.sort("downloads", descending=False).filter(
|
||||||
|
pl.col("downloads") == 1
|
||||||
|
)
|
||||||
|
two_installs = df_pkg_dl.sort("downloads", descending=False).filter(
|
||||||
|
(pl.col("downloads") >= 2) & (pl.col("downloads") < 10)
|
||||||
|
)
|
||||||
|
three_installs = df_pkg_dl.sort("downloads", descending=False).filter(
|
||||||
|
(pl.col("downloads") >= 10) & (pl.col("downloads") < 20)
|
||||||
|
)
|
||||||
|
# TODO: update the text below to match the filters above (1, 2-9 and 10-19 installs); it still says "exactly" one/two/three
|
||||||
|
return mo.md(rf"""
|
||||||
|
|
||||||
|
There are {get_num(one_install)} packages which have exactly a single
|
||||||
|
installation in the data, {get_num(two_installs)} packages with exactly
|
||||||
|
two installations, and {get_num(three_installs)} packages with exactly
|
||||||
|
three.
|
||||||
|
|
||||||
|
""")
|
||||||
|
|
||||||
|
_()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue