Major performance improvements for weekly and monthly plots
Doing most of the aggregation in Polars with the streaming engine prevents memory overload (compared to letting ggplot perform the aggregation).
This commit is contained in:
parent
728ec37bda
commit
443c4c98cd
1 changed file with 35 additions and 31 deletions
|
|
@ -4,15 +4,11 @@ __generated_with = "0.16.2"
|
||||||
app = marimo.App(width="medium")
|
app = marimo.App(width="medium")
|
||||||
|
|
||||||
with app.setup:
|
with app.setup:
|
||||||
# Initialization code that runs before all other cells
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import lets_plot as lp
|
import lets_plot as lp
|
||||||
import marimo as mo
|
import marimo as mo
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
LIMIT_ROWS = 500_000
|
LIMIT_ROWS = False
|
||||||
DATA_DIR = "input/popcorn/output"
|
DATA_DIR = "input/popcorn/output"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -120,7 +116,7 @@ def _():
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _():
|
||||||
df_pkg_lazy = pl.scan_csv(
|
df_versions_lazy = pl.scan_csv(
|
||||||
f"{DATA_DIR}/packages.csv",
|
f"{DATA_DIR}/packages.csv",
|
||||||
schema={
|
schema={
|
||||||
"date": pl.Date,
|
"date": pl.Date,
|
||||||
|
|
@ -129,6 +125,12 @@ def _():
|
||||||
"count": pl.UInt16,
|
"count": pl.UInt16,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
df_pkg_lazy = (
|
||||||
|
df_versions_lazy.drop("version")
|
||||||
|
.group_by(["date", "package"])
|
||||||
|
.agg(pl.sum("count"))
|
||||||
|
.sort("date")
|
||||||
|
)
|
||||||
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
|
if LIMIT_ROWS: # NOTE: this is only for debugging purposes
|
||||||
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
|
df_pkg_lazy = df_pkg_lazy.head(LIMIT_ROWS)
|
||||||
# give small df preview
|
# give small df preview
|
||||||
|
|
@ -157,12 +159,8 @@ def _():
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
weekly_packages = (
|
weekly_packages = df_pkg_lazy.group_by_dynamic("date", every="1w").agg(
|
||||||
df_pkg_lazy
|
pl.col("count").sum()
|
||||||
# .sort("date")
|
|
||||||
.group_by_dynamic("date", every="1w")
|
|
||||||
.agg(pl.col("count").sum())
|
|
||||||
.sort("date")
|
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
|
|
@ -196,10 +194,10 @@ def _():
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
weekday_downloads = df_pkg_lazy.sort("date").with_columns(
|
weekday_downloads = (
|
||||||
|
df_pkg_lazy.with_columns(
|
||||||
pl.col("date")
|
pl.col("date")
|
||||||
.dt.weekday()
|
.dt.weekday()
|
||||||
.sort()
|
|
||||||
.replace_strict(
|
.replace_strict(
|
||||||
{
|
{
|
||||||
1: "Mon",
|
1: "Mon",
|
||||||
|
|
@ -213,12 +211,15 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
)
|
)
|
||||||
.alias("weekday")
|
.alias("weekday")
|
||||||
)
|
)
|
||||||
|
.group_by("weekday")
|
||||||
|
.agg(pl.col("count").sum())
|
||||||
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
weekday_downloads.collect(engine="streaming"),
|
weekday_downloads.collect(engine="streaming"),
|
||||||
lp.aes("weekday", "count"),
|
lp.aes("weekday", "count"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar()
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Ownership per weekday",
|
title="Ownership per weekday",
|
||||||
caption="Package ownership per day of the week over all time",
|
caption="Package ownership per day of the week over all time",
|
||||||
|
|
@ -232,15 +233,17 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_pkg_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
month_agg_downloads = df_pkg_lazy.sort("date").with_columns(
|
month_agg_downloads = (
|
||||||
pl.col("date").dt.month().alias("month")
|
df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
|
||||||
|
.group_by("month")
|
||||||
|
.agg(pl.col("count").sum())
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
month_agg_downloads.collect(engine="streaming"),
|
month_agg_downloads.collect(engine="streaming"),
|
||||||
lp.aes("month", "count"),
|
lp.aes("month", "count"),
|
||||||
)
|
)
|
||||||
+ lp.geom_bar()
|
+ lp.geom_bar(stat="identity")
|
||||||
+ lp.labs(
|
+ lp.labs(
|
||||||
title="Monthwise ownership",
|
title="Monthwise ownership",
|
||||||
caption="Package ownership per month of the year over all time",
|
caption="Package ownership per month of the year over all time",
|
||||||
|
|
@ -328,6 +331,7 @@ def _(df_pkg_dl: pl.LazyFrame):
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(df_pkg_dl: pl.LazyFrame):
|
def _(df_pkg_dl: pl.LazyFrame):
|
||||||
|
# TODO: this is horrible performance-wise
|
||||||
def _():
|
def _():
|
||||||
def get_num(df: pl.LazyFrame) -> int:
|
def get_num(df: pl.LazyFrame) -> int:
|
||||||
return df.count().collect(engine="streaming").item(0, 0)
|
return df.count().collect(engine="streaming").item(0, 0)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue