Add kernel cleaning function

This commit is contained in:
Marty Oehme 2025-09-29 18:31:43 +02:00
parent 9513e6544e
commit 4d5aa73de7
Signed by: Marty
GPG key ID: 4E535BC19C61886E
2 changed files with 83 additions and 0 deletions

View file

@ -41,6 +41,79 @@ def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) ->
# print(output_file, p_date, p_name, p_count)
def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None:
    """Aggregate per-day raw JSON dumps into one CSV of unique install counts.

    Reads every ``*.json`` file in ``input_dir`` (file stem is taken as the
    date), extracts its ``UniqueInstalls`` value, and writes all rows to
    ``output_dir / "unique_installs.csv"`` with header ``date,unique``.

    Args:
        input_dir: Directory containing one JSON file per day.
        output_dir: Directory the CSV is written to (created if missing).
        force: When True, regenerate even if no raw file is newer than the
            existing CSV output.

    Raises:
        ValueError: If ``input_dir`` is not an existing directory.
    """
    if not input_dir.is_dir():
        raise ValueError(f"input_dir is not a directory: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
    # only clean if raw files have been updated since last time
    if not force and not newer_than_last_clean(
        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs"
    ):
        return
    output_file = output_dir / "unique_installs.csv"
    with open(output_file, "w") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "unique"])
        for j in input_dir.glob("*.json"):
            date = j.stem
            with open(j) as fr:
                try:
                    data = json.load(fr)
                except json.JSONDecodeError:
                    print(f"WARN: Could not decode JSON data for file {j}")
                    # BUG FIX: without this `continue`, `data` was undefined
                    # (or stale from the previous file) on the lines below.
                    continue
            if "UniqueInstalls" not in data:
                print(
                    f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
                )
                # BUG FIX: original fell through and raised KeyError here.
                continue
            writer.writerow([date, data["UniqueInstalls"]])
def json_to_daily_kernel_csv(
    input_dir: Path, output_dir: Path, force: bool = False
) -> None:
    """Convert per-day raw JSON kernel counts into one CSV per day.

    For every ``*.json`` file in ``input_dir`` (file stem is taken as the
    date), reads the ``XuKernel`` mapping of kernel name -> download count and
    writes ``output_dir / "<date>.csv"`` with header ``date,kernel,downloads``.

    Args:
        input_dir: Directory containing one JSON file per day.
        output_dir: Directory the per-day CSVs are written to (created if
            missing).
        force: When True, regenerate even if no raw file is newer than the
            existing CSV outputs.

    Raises:
        ValueError: If ``input_dir`` is not an existing directory.
    """
    if not input_dir.is_dir():
        raise ValueError(f"input_dir is not a directory: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
    # only clean if raw files have been updated since last time
    if not force and not newer_than_last_clean(
        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels"
    ):
        return
    for j in input_dir.glob("*.json"):
        date = j.stem
        with open(j) as fr:
            try:
                data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")
                # BUG FIX: without this `continue`, `data` was undefined
                # (or stale from the previous file) on the lines below.
                continue
        # BUG FIX: the original tested `"Packages" not in data` but reads
        # data["XuKernel"] below (the warning text already named 'XuKernel'),
        # and fell through to a KeyError when the field was missing.
        if "XuKernel" not in data:
            print(
                f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
            )
            continue
        output_file = output_dir / f"{date}.csv"
        with open(output_file, "w") as fw:
            writer = csv.writer(fw)
            writer.writerow(["date", "kernel", "downloads"])
            for kernel, downloads in data["XuKernel"].items():
                writer.writerow([date, kernel, downloads])
def newer_than_last_clean(
input_glob: Generator, output_glob: Generator, desc: str | None = None
) -> bool:
@ -66,6 +139,10 @@ def newer_than_last_clean(
def main(input: str, output: str) -> None:
    """Run all cleaning steps on the raw daily data, forcing regeneration.

    Args:
        input: Root directory of the raw data (expects a ``daily`` subdir).
        output: Root directory the cleaned CSVs are written under.
    """
    # NOTE: `input` shadows the builtin, but the parameter name is kept
    # for interface compatibility with existing callers.
    raw_daily = Path(input) / "daily"
    clean_root = Path(output)
    json_to_daily_pkg(raw_daily, clean_root / "daily", force=True)
    json_to_unique_csv(raw_daily, clean_root, force=True)
    json_to_daily_kernel_csv(raw_daily, clean_root / "kernels", force=True)
if __name__ == "__main__":

View file

@ -36,6 +36,12 @@ def _():
clean.json_to_daily_pkg(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "daily", force=False
)
clean.json_to_unique_csv(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR), force=False
)
clean.json_to_daily_kernel_csv(
Path(DATA_RAW_DIR) / "daily", Path(DATA_CLEAN_DIR) / "kernels", force=False
)
@app.cell