add generation script for file stats
This commit is contained in:
parent
9c88194369
commit
fa3473cdc2
2 changed files with 72 additions and 2 deletions
67
code/files.py
Normal file
67
code/files.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import csv
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def filesize_csv(input_dir: Path, output_dir: Path) -> None:
    """Write ``files.csv`` into *output_dir* describing the JSON files in *input_dir*.

    One row is written per file matching ``*.json`` directly inside
    *input_dir* (the glob is not recursive). The columns are:

    - ``date``: the file name without its suffix (files are presumably named
      after a date; this is not validated here),
    - ``filename``: the file name including its suffix,
    - ``mtime``: the modification time as reported by ``stat().st_mtime``,
    - ``filesize``: the size in bytes as reported by ``stat().st_size``.

    Args:
        input_dir: Directory that is searched for ``*.json`` files.
        output_dir: Existing directory that receives ``files.csv``; an
            existing file of that name is overwritten.
    """
    output_file = output_dir / "files.csv"
    # newline="" hands line-ending control to the csv module; without it,
    # platforms that translate "\n" (Windows) end up with blank rows.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "filename", "mtime", "filesize"])

        # glob() yields entries in arbitrary order; sorting keeps the
        # generated CSV reproducible between runs and machines.
        for json_file in sorted(input_dir.glob("*.json")):
            stat = json_file.stat()
            writer.writerow(
                [json_file.stem, json_file.name, stat.st_mtime, stat.st_size]
            )
|
||||
|
||||
|
||||
def newer_than_last_clean(
|
||||
input_glob: Generator[Path], output_glob: Generator[Path], desc: str | None = None
|
||||
) -> bool:
|
||||
last_raw_update: datetime = datetime(1, 1, 1)
|
||||
oldest_clean: datetime = datetime.now()
|
||||
for file in input_glob:
|
||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
||||
if mdate > last_raw_update:
|
||||
last_raw_update = mdate
|
||||
for file in output_glob:
|
||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
||||
if mdate < oldest_clean:
|
||||
oldest_clean = mdate
|
||||
if oldest_clean > last_raw_update:
|
||||
print(
|
||||
"INFO: Skip creating cleaned data"
|
||||
+ f"{f' for {desc}' if desc else ''}"
|
||||
+ f", last processing {oldest_clean} newer than {last_raw_update}"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate the input directory and create the output directory.

    Args:
        input_dir: Directory that must already exist.
        output_dir: Directory that is created, including missing parents,
            when it does not exist yet.

    Raises:
        ValueError: If *input_dir* does not exist or is not a directory.
    """
    if not input_dir.is_dir():
        # Name the offending path so the failure is actionable from the CLI.
        raise ValueError(
            f"input directory does not exist or is not a directory: {input_dir}"
        )
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def main(input: str, output: str) -> None:
    """Generate the file statistics CSV for one input/output directory pair.

    Args:
        input: Path of the directory holding the source files.
        output: Path of the directory that receives the generated CSV.
    """
    input_dir, output_dir = Path(input), Path(output)

    # Validate/create the directories first, then produce the statistics.
    ensure_dirs(input_dir, output_dir)
    filesize_csv(input_dir, output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Expect the script name plus exactly two arguments: input and output dir.
    if len(sys.argv) != 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    main(*sys.argv[1:])
|
||||
7
justfile
7
justfile
|
|
@ -1,7 +1,10 @@
|
|||
default: kernels unique
|
||||
default: files kernels unique
|
||||
|
||||
files:
|
||||
python code/files.py input output
|
||||
|
||||
kernels:
|
||||
python code/kernels.py input output
|
||||
|
||||
unique:
|
||||
python ./code/unique.py input output
|
||||
python code/unique.py input output
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue