Extract cleaning date checking
This commit is contained in:
parent
4c9518cf67
commit
e00217f2fe
1 changed file with 29 additions and 20 deletions
49
clean.py
49
clean.py
|
|
@ -2,33 +2,19 @@ import csv
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
|
||||||
def json_to_daily_pkg(
|
def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None:
|
||||||
input_dir: Path, output_dir: Path, force: bool = False
|
|
||||||
) -> int | None:
|
|
||||||
if not input_dir.is_dir():
|
if not input_dir.is_dir():
|
||||||
raise ValueError
|
raise ValueError
|
||||||
output_dir.mkdir(exist_ok=True, parents=True)
|
output_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
# only clean if raw files have been updated since last time
|
# only clean if raw files have been updated since last time
|
||||||
if not force:
|
if not force and not newer_than_last_clean(
|
||||||
last_raw_update: datetime = datetime(1, 1, 1)
|
input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages"
|
||||||
oldest_clean: datetime = datetime.now()
|
):
|
||||||
for file in input_dir.glob("*.json"):
|
return
|
||||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
|
||||||
if mdate > last_raw_update:
|
|
||||||
last_raw_update = mdate
|
|
||||||
for file in output_dir.glob("*.csv"):
|
|
||||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
|
||||||
if mdate < oldest_clean:
|
|
||||||
oldest_clean = mdate
|
|
||||||
if oldest_clean > last_raw_update:
|
|
||||||
print(
|
|
||||||
"INFO: Skip creating cleaned data,"
|
|
||||||
f" last processing {oldest_clean} newer than {last_raw_update}"
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
for j in input_dir.glob("*.json"):
|
for j in input_dir.glob("*.json"):
|
||||||
with open(j) as fr:
|
with open(j) as fr:
|
||||||
|
|
@ -55,6 +41,29 @@ def json_to_daily_pkg(
|
||||||
# print(output_file, p_date, p_name, p_count)
|
# print(output_file, p_date, p_name, p_count)
|
||||||
|
|
||||||
|
|
||||||
|
def newer_than_last_clean(
|
||||||
|
input_glob: Generator, output_glob: Generator, desc: str | None = None
|
||||||
|
) -> bool:
|
||||||
|
last_raw_update: datetime = datetime(1, 1, 1)
|
||||||
|
oldest_clean: datetime = datetime.now()
|
||||||
|
for file in input_glob:
|
||||||
|
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
||||||
|
if mdate > last_raw_update:
|
||||||
|
last_raw_update = mdate
|
||||||
|
for file in output_glob:
|
||||||
|
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
||||||
|
if mdate < oldest_clean:
|
||||||
|
oldest_clean = mdate
|
||||||
|
if oldest_clean > last_raw_update:
|
||||||
|
print(
|
||||||
|
"INFO: Skip creating cleaned data"
|
||||||
|
f"{f' for {desc}' if desc else ''}"
|
||||||
|
f", last processing {oldest_clean} newer than {last_raw_update}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def main(input: str, output: str) -> None:
    """Clean the raw daily downloads under *input* into CSV packages under *output*."""
    raw_daily = Path(input) / "daily"
    cleaned_daily = Path(output) / "daily"
    json_to_daily_pkg(raw_daily, cleaned_daily, force=True)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue