Extract cleaning date checking

This commit is contained in:
Marty Oehme 2025-09-29 18:08:18 +02:00
parent 4c9518cf67
commit e00217f2fe
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -2,32 +2,18 @@ import csv
import json import json
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Generator
def json_to_daily_pkg( def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None:
input_dir: Path, output_dir: Path, force: bool = False
) -> int | None:
if not input_dir.is_dir(): if not input_dir.is_dir():
raise ValueError raise ValueError
output_dir.mkdir(exist_ok=True, parents=True) output_dir.mkdir(exist_ok=True, parents=True)
# only clean if raw files have been updated since last time # only clean if raw files have been updated since last time
if not force: if not force and not newer_than_last_clean(
last_raw_update: datetime = datetime(1, 1, 1) input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages"
oldest_clean: datetime = datetime.now() ):
for file in input_dir.glob("*.json"):
mdate = datetime.fromtimestamp(file.stat().st_mtime)
if mdate > last_raw_update:
last_raw_update = mdate
for file in output_dir.glob("*.csv"):
mdate = datetime.fromtimestamp(file.stat().st_mtime)
if mdate < oldest_clean:
oldest_clean = mdate
if oldest_clean > last_raw_update:
print(
"INFO: Skip creating cleaned data,"
f" last processing {oldest_clean} newer than {last_raw_update}"
)
return return
for j in input_dir.glob("*.json"): for j in input_dir.glob("*.json"):
@ -55,6 +41,29 @@ def json_to_daily_pkg(
# print(output_file, p_date, p_name, p_count) # print(output_file, p_date, p_name, p_count)
def newer_than_last_clean(
input_glob: Generator, output_glob: Generator, desc: str | None = None
) -> bool:
last_raw_update: datetime = datetime(1, 1, 1)
oldest_clean: datetime = datetime.now()
for file in input_glob:
mdate = datetime.fromtimestamp(file.stat().st_mtime)
if mdate > last_raw_update:
last_raw_update = mdate
for file in output_glob:
mdate = datetime.fromtimestamp(file.stat().st_mtime)
if mdate < oldest_clean:
oldest_clean = mdate
if oldest_clean > last_raw_update:
print(
"INFO: Skip creating cleaned data"
f"{f' for {desc}' if desc else ''}"
f", last processing {oldest_clean} newer than {last_raw_update}"
)
return False
return True
def main(input: str, output: str) -> None: def main(input: str, output: str) -> None:
json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True) json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True)