Adapt functions to new csv data layout
This commit is contained in:
parent
91d64f428c
commit
4c9518cf67
1 changed file with 41 additions and 21 deletions
62
popcorn.py
62
popcorn.py
|
|
@ -6,12 +6,15 @@ app = marimo.App(width="medium")
|
|||
with app.setup:
|
||||
# Initialization code that runs before all other cells
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import lets_plot as lp
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
|
||||
LIMIT_ROWS = 200
|
||||
LIMIT_ROWS = 500_000
|
||||
DATA_RAW_DIR = "data/raw"
|
||||
DATA_CLEAN_DIR = "data/cleaned"
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
|
|
@ -25,6 +28,16 @@ def _():
|
|||
return
|
||||
|
||||
|
||||
# run data prep
|
||||
@app.cell
|
||||
def _():
|
||||
import clean
|
||||
|
||||
clean.json_to_daily_pkg(
|
||||
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
|
||||
)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
def parse_size(size_str):
|
||||
|
|
@ -34,7 +47,7 @@ def _():
|
|||
return None
|
||||
|
||||
sizes_df_raw = (
|
||||
pl.read_csv("data/file_sizes.csv")
|
||||
pl.read_csv(f"{DATA_CLEAN_DIR}/file_sizes.csv")
|
||||
.with_columns(
|
||||
pl.col("name")
|
||||
.str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
|
||||
|
|
@ -118,21 +131,21 @@ def _():
|
|||
|
||||
@app.cell
|
||||
def _():
|
||||
df_lazy = (
|
||||
pl.scan_ndjson("data/daily/*", include_file_paths="file")
|
||||
.head(LIMIT_ROWS) # FIXME: take out after debug
|
||||
.with_columns(
|
||||
pl.col("file")
|
||||
.str.replace(r"data/daily/(\d{4}-\d{2}-\d{2}).json", "${1}")
|
||||
.str.to_date()
|
||||
.alias("date")
|
||||
)
|
||||
)
|
||||
df_pkg_lazy = (
|
||||
df_lazy.select("date", pl.col("Packages").struct.unnest())
|
||||
pl.scan_csv(
|
||||
f"{DATA_CLEAN_DIR}/daily/*.csv",
|
||||
include_file_paths="file",
|
||||
schema={
|
||||
"date": pl.Date,
|
||||
"package": pl.String,
|
||||
"downloads": pl.UInt16,
|
||||
},
|
||||
)
|
||||
.drop("file")
|
||||
.fill_null(0)
|
||||
.unpivot(index="date", variable_name="package", value_name="downloads")
|
||||
.head(LIMIT_ROWS) # FIXME: take out after debug
|
||||
)
|
||||
df_pkg_lazy.collect()
|
||||
return
|
||||
|
||||
|
||||
|
|
@ -281,6 +294,20 @@ def _(df_pkg_lazy: pl.LazyFrame):
|
|||
return
|
||||
|
||||
|
||||
# - which kernels have been DL when? (simplified for semver)
|
||||
@app.cell
|
||||
def _(df_lazy):
|
||||
kernel_df_lazy = df_lazy.select("date", "XuKernel")
|
||||
kernel_df = (
|
||||
kernel_df_lazy.with_columns(pl.col("XuKernel").struct.unnest())
|
||||
.fill_null(0)
|
||||
# .unpivot(index="date", variable_name="kernel", value_name="downloads")
|
||||
.collect()
|
||||
)
|
||||
|
||||
df_lazy.collect()
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(
|
||||
|
|
@ -325,19 +352,12 @@ def _(sizes_df):
|
|||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(df_lazy):
|
||||
kernel_df = df_lazy.select("date", pl.col("Kernels").struct.unnest())
|
||||
kernel_df
|
||||
|
||||
|
||||
# further ideas:
|
||||
#
|
||||
# - daily download habits:
|
||||
# - are we downloading a wider spread of versions on specific days?
|
||||
# - are there 'update' days, where things converge? specific weekday/on holidays/etc?
|
||||
#
|
||||
# - which kernels have been DL when? (simplified for semver)
|
||||
# - when did specific kernels enter the repos?
|
||||
#
|
||||
# - which arches are/were most prevalent over time?
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue