diff --git a/popcorn.py b/popcorn.py
index 3fbb9f5..29e9192 100644
--- a/popcorn.py
+++ b/popcorn.py
@@ -5,10 +5,11 @@ app = marimo.App(width="medium")
 
 with app.setup:
     # Initialization code that runs beimpofore all other cells
+    import re
+
     import lets_plot as lp
     import marimo as mo
     import polars as pl
-    import re
 
 
 @app.cell(hide_code=True)
@@ -25,28 +26,21 @@ def _():
         except AttributeError:
             return None
 
-    sizes_df = (
+    sizes_df_raw = (
         pl.read_csv("data/file_sizes.csv")
         .with_columns(
             pl.col("name")
             .str.replace(r"data/(\d{4}-\d{2}-\d{2}).json", "${1}")
             .str.to_date()
             .alias("date"),
-            pl.col("size").map_elements(lambda x: parse_size(x), return_dtype=pl.Float32).alias("size_num")
+            pl.col("size")
+            .map_elements(lambda x: parse_size(x), return_dtype=pl.Float32)
+            .alias("size_num"),
         )
         .select(["date", "size_num", "size", "modified"])
     )
-    sizes_df_null = sizes_df.filter(pl.col("size_num").is_null())
-    sizes_df = sizes_df.filter(pl.col("size").is_not_null())
-    return
-
-
-@app.cell
-def _(sizes_df_null):
-    sizes_df_null.select(["date", "size"]).style.tab_header(
-        title="Missing Days",
-        subtitle="Days with 0B size due to missing on the popcorn server.",
-    )
+    sizes_df = sizes_df_raw.filter(pl.col("size_num").is_not_null())
+    return sizes_df, sizes_df_raw
 
 
 @app.cell
@@ -54,9 +48,10 @@ def _(sizes_df):
     (
-        lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
+        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
         + lp.geom_point()
+        + lp.geom_smooth(method="lm")
         + lp.labs(
-            title="File sizes",
-            subtitle="Size of daily popcorn files over time",
+            title="Size growth",
+            subtitle="Size of daily popcorn statistics files over time",
             caption="Raw json file size, without any formatting, removal of markers, characters or newlines.",
         )
     )
@@ -66,13 +61,59 @@ def _(sizes_df):
 def _(sizes_df):
     (
-        lp.ggplot(sizes_df, lp.aes(x="date", y="size"))
-        + lp.geom_dotplot()
+        lp.ggplot(sizes_df, lp.aes(x="date", y="size_num"))
+        + lp.geom_point()
+        + lp.geom_smooth(method="loess")
         + lp.labs(
             title="",
             subtitle="",
             caption="",
         )
     )
+    return
+
+
+@app.cell
+def _():
+    mo.md(
+        r"""
+    ## Odds and Ends
+    There are some missing days in the statistics.
+    """
+    )
+    return
+
+
+@app.cell
+def _(sizes_df_raw):
+    sizes_df_null = sizes_df_raw.filter(pl.col("size_num").is_null())
+    sizes_df_null.select(["date", "size"]).style.tab_header(
+        title="Missing Days",
+        subtitle="Days with 0B size due to missing on the popcorn server.",
+    )
+    return
+
+
+@app.cell
+def _(sizes_df):
+    def _():
+        different_modification_date = sizes_df.with_columns(
+            pl.col("modified")
+            .str.to_datetime(format="%F %T %:z", strict=False)
+            .alias("modified_dt"),
+        ).filter(pl.col("date") != pl.col("modified_dt").dt.date())
+        # TODO: this plot is not meaningful yet. The 'true' capture date goes
+        # on X, but Y is undecided: the modification date of each row, or the
+        # difference between the capture date and the modification datetime?
+        return (
+            lp.ggplot(
+                different_modification_date,
+                lp.aes("date", "modified_dt"),
+            )
+            + lp.geom_freqpoly()
+        )
+
+    _()
+    return
 
 
 if __name__ == "__main__":