Provide raw lazyframe
This commit is contained in:
parent
499a819c90
commit
4984289f69
1 changed file with 11 additions and 20 deletions
31
popcorn.py
31
popcorn.py
|
|
@ -118,7 +118,9 @@ def _():
|
||||||
.str.to_date()
|
.str.to_date()
|
||||||
.alias("date")
|
.alias("date")
|
||||||
)
|
)
|
||||||
.select("date", pl.col("Packages").struct.unnest())
|
)
|
||||||
|
df_pkg_lazy = (
|
||||||
|
df_lazy.select("date", pl.col("Packages").struct.unnest())
|
||||||
.fill_null(0)
|
.fill_null(0)
|
||||||
.unpivot(index="date", variable_name="package", value_name="downloads")
|
.unpivot(index="date", variable_name="package", value_name="downloads")
|
||||||
)
|
)
|
||||||
|
|
@ -126,10 +128,10 @@ def _():
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
weekly_downloads = (
|
weekly_downloads = (
|
||||||
df_lazy.sort("date")
|
df_pkg_lazy.sort("date")
|
||||||
.group_by_dynamic("date", every="1w")
|
.group_by_dynamic("date", every="1w")
|
||||||
.agg(pl.col("downloads").sum())
|
.agg(pl.col("downloads").sum())
|
||||||
.sort("date")
|
.sort("date")
|
||||||
|
|
@ -149,10 +151,10 @@ def _(df_lazy: pl.LazyFrame):
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
weekday_downloads = (
|
weekday_downloads = (
|
||||||
df_lazy.sort("date")
|
df_pkg_lazy.sort("date")
|
||||||
.with_columns(
|
.with_columns(
|
||||||
pl.col("date")
|
pl.col("date")
|
||||||
.dt.weekday()
|
.dt.weekday()
|
||||||
|
|
@ -187,10 +189,10 @@ def _(df_lazy: pl.LazyFrame):
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(df_lazy: pl.LazyFrame):
|
def _(df_pkg_lazy: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
month_agg_downloads = (
|
month_agg_downloads = (
|
||||||
df_lazy.sort("date")
|
df_pkg_lazy.sort("date")
|
||||||
.with_columns(pl.col("date").dt.month().alias("month"))
|
.with_columns(pl.col("date").dt.month().alias("month"))
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
|
|
@ -208,19 +210,8 @@ def _(df_lazy: pl.LazyFrame):
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _(df_lazy:pl.LazyFrame):
|
||||||
df_unique_downloads = (
|
df_unique_downloads = df_lazy.select(["date", "UniqueInstalls"]).collect()
|
||||||
pl.scan_ndjson("data/daily/*", include_file_paths="file")
|
|
||||||
.head(LIMIT_ROWS) # FIXME: take out after debug
|
|
||||||
.with_columns(
|
|
||||||
pl.col("file")
|
|
||||||
.str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
|
|
||||||
.str.to_date()
|
|
||||||
.alias("date")
|
|
||||||
)
|
|
||||||
.select(["date", "UniqueInstalls"])
|
|
||||||
.collect()
|
|
||||||
)
|
|
||||||
(
|
(
|
||||||
lp.ggplot(df_unique_downloads, lp.aes("date", "UniqueInstalls"))
|
lp.ggplot(df_unique_downloads, lp.aes("date", "UniqueInstalls"))
|
||||||
+ lp.geom_line()
|
+ lp.geom_line()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue