Fix monthwise package accumulation
Cut off double-counted months from year.
This commit is contained in:
parent
9e3726402d
commit
06ee312c80
1 changed file with 30 additions and 11 deletions
|
|
@ -283,10 +283,13 @@ def plt_weekday_packages(df_pkg_lazy: pl.LazyFrame):
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
|
def plt_month_packages(df_pkg_lazy: pl.LazyFrame):
|
||||||
# FIXME: should be cut off after exact 12 months, or counts some months double
|
|
||||||
def _():
|
def _():
|
||||||
month_agg_downloads = (
|
month_agg_downloads = (
|
||||||
df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
|
df_pkg_lazy.with_columns(pl.col("date").dt.month().alias("month"))
|
||||||
|
.filter(
|
||||||
|
(pl.col("date") >= pl.datetime(2018, 10, 1))
|
||||||
|
& (pl.col("date") < pl.datetime(2025, 10, 1))
|
||||||
|
)
|
||||||
.group_by("month")
|
.group_by("month")
|
||||||
.agg(pl.col("count").sum())
|
.agg(pl.col("count").sum())
|
||||||
)
|
)
|
||||||
|
|
@ -381,6 +384,7 @@ def tab_rarest_packages(df_pkg_dl: pl.LazyFrame):
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
||||||
def _():
|
def _():
|
||||||
|
|
@ -395,6 +399,7 @@ def plt_package_distribution(df_pkg_dl: pl.LazyFrame):
|
||||||
_()
|
_()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
|
def tab_percentiles(df_pkg_dl: pl.LazyFrame):
|
||||||
def get_num(df: pl.LazyFrame) -> int:
|
def get_num(df: pl.LazyFrame) -> int:
|
||||||
|
|
@ -409,13 +414,17 @@ def tab_percentiles(df_pkg_dl: pl.LazyFrame):
|
||||||
twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
|
twenty_thirty = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
(pl.col("count") >= 20) & (pl.col("count") < 30)
|
(pl.col("count") >= 20) & (pl.col("count") < 30)
|
||||||
)
|
)
|
||||||
thirty_plus = df_pkg_dl.sort("count", descending=False).filter((pl.col("count") >= 30))
|
thirty_plus = df_pkg_dl.sort("count", descending=False).filter(
|
||||||
pl.DataFrame([
|
(pl.col("count") >= 30)
|
||||||
|
)
|
||||||
|
pl.DataFrame(
|
||||||
|
[
|
||||||
get_num(one_ten_installs),
|
get_num(one_ten_installs),
|
||||||
get_num(ten_twenty_installs),
|
get_num(ten_twenty_installs),
|
||||||
get_num(twenty_thirty),
|
get_num(twenty_thirty),
|
||||||
get_num(thirty_plus),
|
get_num(thirty_plus),
|
||||||
])
|
]
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -576,13 +585,23 @@ def tab_missing_days(sizes_df: pl.DataFrame):
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def plt_modified_times(sizes_df):
|
def plt_modified_times(sizes_df):
|
||||||
|
# Disregard this cell.
|
||||||
|
# It was originally used to find days where the given date (i.e. filename)
|
||||||
|
# diverged from the modification date (i.e. Unix mtime). However, in my
|
||||||
|
# opinion it adds little information beyond what the missing days above
|
||||||
|
# already provide, aside from many files being externally modified on one day,
|
||||||
|
# pointing to being moved/changed all at once as part of a server migration
|
||||||
|
# or update.
|
||||||
|
#
|
||||||
|
# With the current data, this information is unfortunately lost, as the
|
||||||
|
# files have now also been modified (attributes) within the dataset itself.
|
||||||
|
# An updated dataset could make use of this information as part of its records,
|
||||||
|
# potentially.
|
||||||
def _():
|
def _():
|
||||||
different_modification_date = sizes_df.filter(
|
different_modification_date = sizes_df.filter(
|
||||||
pl.col("date") != pl.col("modified").dt.date()
|
pl.col("date") != pl.col("modified").dt.date()
|
||||||
)
|
)
|
||||||
# This does not work well: what are we showing?
|
return different_modification_date
|
||||||
# 'true' capture date on X but then what on Y - the
|
|
||||||
# same date for each? the difference in dt?
|
|
||||||
return (
|
return (
|
||||||
lp.ggplot(
|
lp.ggplot(
|
||||||
different_modification_date,
|
different_modification_date,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue