Start transferring to quarto file
This commit is contained in:
parent
c51970507d
commit
cfc8ecc4fd
2 changed files with 225 additions and 52 deletions
|
|
@ -83,7 +83,7 @@ def plt_filesize(sizes_df):
|
|||
+ lp.geom_smooth(method="lm")
|
||||
+ lp.labs(
|
||||
title="Size growth",
|
||||
subtitle="Size of daily popcorn statistics files over time",
|
||||
subtitle="Cumulative filesize of daily popcorn statistics over time",
|
||||
caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
|
||||
y="filesize in kB",
|
||||
)
|
||||
|
|
@ -115,7 +115,7 @@ def _():
|
|||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
def tab_pkg():
|
||||
df_versions_lazy = pl.scan_csv(
|
||||
f"{DATA_DIR}/packages.csv",
|
||||
schema={
|
||||
|
|
@ -157,7 +157,7 @@ def _():
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(df_pkg_lazy: pl.LazyFrame):
|
||||
def plt_weekly_packages(df_pkg_lazy: pl.LazyFrame):
|
||||
pkg_per_day = df_pkg_lazy.group_by("date").agg(pl.col("count").sum()).sort("date")
|
||||
|
||||
def _():
|
||||
|
|
@ -222,7 +222,7 @@ def _():
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(df_pkg_lazy: pl.LazyFrame):
|
||||
def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame):
|
||||
def _():
|
||||
weekday_downloads = (
|
||||
df_pkg_lazy.with_columns(
|
||||
|
|
@ -261,7 +261,8 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(df_pkg_lazy: pl.LazyFrame):
|
||||
def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
|
||||
# FIXME: should be cut off after exactly 12 months; otherwise some months are counted twice
|
||||
def _():
|
||||
month_agg_downloads = (
|
||||
df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
|
||||
|
|
@ -285,7 +286,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
|||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
def plt_unique_installs():
|
||||
df_unique_installs = pl.read_csv(
|
||||
f"{DATA_DIR}/unique_installs.csv",
|
||||
schema={"date": pl.Date, "unique": pl.UInt16},
|
||||
|
|
@ -306,7 +307,7 @@ def _():
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(df_pkg_lazy: pl.LazyFrame):
|
||||
def plt_top_packages(df_pkg_lazy: pl.LazyFrame):
|
||||
df_pkg_dl = df_pkg_lazy.group_by("package").agg(pl.col("count").sum())
|
||||
|
||||
def _():
|
||||
|
|
@ -346,7 +347,7 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
|||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(df_pkg_dl: pl.LazyFrame):
|
||||
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
||||
def _():
|
||||
return (
|
||||
lp.ggplot(df_pkg_dl.collect(engine="streaming"), lp.aes("count"))
|
||||
|
|
@ -360,39 +361,6 @@ def _(df_pkg_dl: pl.LazyFrame):
|
|||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(df_pkg_dl: pl.LazyFrame):
|
||||
# TODO: this is horrible performance-wise
|
||||
def _():
|
||||
def get_num(df: pl.LazyFrame) -> int:
|
||||
return df.count().collect(engine="streaming").item(0, 0)
|
||||
|
||||
one_ten_installs = df_pkg_dl.sort("count", descending=False).filter(
|
||||
(pl.col("count") >= 1) & (pl.col("count") < 10)
|
||||
)
|
||||
ten_twenty_installs = df_pkg_dl.sort("count", descending=False).filter(
|
||||
(pl.col("count") >= 10) & (pl.col("count") < 20)
|
||||
)
|
||||
twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
|
||||
(pl.col("count") >= 20) & (pl.col("count") < 30)
|
||||
)
|
||||
thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
|
||||
(pl.col("count") >= 30)
|
||||
)
|
||||
# TODO: Fix for new filters above
|
||||
return mo.md(rf"""
|
||||
|
||||
There are {get_num(one_ten_installs):,} packages which have between one
|
||||
and ten installations in the data, {get_num(ten_twenty_installs):,}
|
||||
packages between eleven and 20 installations, and
|
||||
{get_num(twenty_thirty):,} packages between 21 and 30 installations.
|
||||
{get_num(thirty_plus):,} packages have over 30 installations.
|
||||
|
||||
""")
|
||||
|
||||
_()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
|
|
@ -402,7 +370,7 @@ def _():
|
|||
|
||||
# - which kernels have been downloaded when? (simplified for semver)
|
||||
@app.cell
|
||||
def _():
|
||||
def plt_kernel_versions():
|
||||
kernel_df_lazy = (
|
||||
pl.scan_csv(
|
||||
f"{DATA_DIR}/kernels.csv",
|
||||
|
|
@ -468,7 +436,7 @@ def _(kernel_df_v99: pl.DataFrame):
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(kernel_df_lazy: pl.LazyFrame):
|
||||
def plt_kernel_timeline(kernel_df_lazy: pl.LazyFrame):
|
||||
weekly_kernel_df = (
|
||||
kernel_df_lazy.with_columns(pl.col("major_ver").cast(pl.String))
|
||||
.select(["date", "major_ver", "downloads"])
|
||||
|
|
@ -535,7 +503,7 @@ def _():
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(sizes_df: pl.DataFrame):
|
||||
def tab_missing_days(sizes_df: pl.DataFrame):
|
||||
date_range = pl.date_range(
|
||||
sizes_df.select("date").min().item(), sizes_df.select("date").max().item()
|
||||
)
|
||||
|
|
@ -550,7 +518,7 @@ def _(sizes_df: pl.DataFrame):
|
|||
|
||||
|
||||
@app.cell
|
||||
def _(sizes_df):
|
||||
def plt_modified_times(sizes_df):
|
||||
def _():
|
||||
different_modification_date = sizes_df.filter(
|
||||
pl.col("date") != pl.col("modified").dt.date()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue