add generation script for package stats
This commit is contained in:
parent
fa3473cdc2
commit
28eb0a39ae
3 changed files with 69 additions and 24 deletions
|
|
@ -19,29 +19,6 @@ def filesize_csv(input_dir: Path, output_dir: Path) -> None:
|
||||||
writer.writerow([p_date, p_fname, p_mtime, p_size])
|
writer.writerow([p_date, p_fname, p_mtime, p_size])
|
||||||
|
|
||||||
|
|
||||||
def newer_than_last_clean(
|
|
||||||
input_glob: Generator[Path], output_glob: Generator[Path], desc: str | None = None
|
|
||||||
) -> bool:
|
|
||||||
last_raw_update: datetime = datetime(1, 1, 1)
|
|
||||||
oldest_clean: datetime = datetime.now()
|
|
||||||
for file in input_glob:
|
|
||||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
|
||||||
if mdate > last_raw_update:
|
|
||||||
last_raw_update = mdate
|
|
||||||
for file in output_glob:
|
|
||||||
mdate = datetime.fromtimestamp(file.stat().st_mtime)
|
|
||||||
if mdate < oldest_clean:
|
|
||||||
oldest_clean = mdate
|
|
||||||
if oldest_clean > last_raw_update:
|
|
||||||
print(
|
|
||||||
"INFO: Skip creating cleaned data"
|
|
||||||
+ f"{f' for {desc}' if desc else ''}"
|
|
||||||
+ f", last processing {oldest_clean} newer than {last_raw_update}"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dirs(input_dir: Path, output_dir: Path):
|
def ensure_dirs(input_dir: Path, output_dir: Path):
|
||||||
if not input_dir.is_dir():
|
if not input_dir.is_dir():
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
|
||||||
65
code/packages.py
Normal file
65
code/packages.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import cast
|
||||||
|
|
||||||
|
|
||||||
|
def packages_csv(input_dir: Path, output_dir: Path) -> None:
    """Flatten per-date package statistics from JSON files into one CSV.

    Every ``*.json`` file directly inside ``input_dir`` is read and its file
    stem is written to the ``date`` column. A file is expected to contain a
    JSON object with a ``"Versions"`` mapping of
    ``package name -> {version: count}``; one CSV row is written per
    (package, version) pair.

    Files that cannot be decoded, or that do not have this structure, are
    skipped with a ``WARN:`` line on stdout. A package whose value is not a
    mapping is skipped the same way while the rest of the file is still used.

    Args:
        input_dir: Directory containing the JSON files.
        output_dir: Existing directory in which ``packages.csv`` is written
            (an existing file is overwritten).
    """
    output_file = output_dir / "packages.csv"
    # The csv module requires newline="" so that it controls line endings
    # itself; without it extra blank lines appear on some platforms.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "package", "version", "count"])

        # Sorted so the row order is reproducible between runs.
        for j in sorted(input_dir.glob("*.json")):
            try:
                with j.open(encoding="utf-8") as fr:
                    data: object = json.load(fr)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print(f"WARN: Could not decode JSON data for file {j}")
                continue

            # Valid JSON may also be a list, number or string at the top
            # level; those must be rejected here rather than crash below.
            if not isinstance(data, dict) or not isinstance(
                data.get("Versions"), dict
            ):
                print(
                    f"WARN: No correct json structure containing 'Versions' field in file {j}"
                )
                continue

            date = j.stem
            data_versions = cast(dict[str, dict[str, int]], data["Versions"])
            for package_name, package_versions in data_versions.items():
                if not isinstance(package_versions, dict):
                    print(
                        f"WARN: No correct json version structure containing versions in the Version field in file {j}"
                    )
                    continue
                writer.writerows(
                    [date, package_name, version, count]
                    for version, count in package_versions.items()
                )
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate the input directory and create the output directory.

    Args:
        input_dir: Directory that must already exist.
        output_dir: Directory to create, including missing parents; an
            already existing directory is left untouched.

    Raises:
        ValueError: If ``input_dir`` does not exist or is not a directory.
    """
    if not input_dir.is_dir():
        # Name the offending path so the failure is diagnosable.
        raise ValueError(
            f"Input directory does not exist or is not a directory: {input_dir}"
        )
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main(input: str, output: str) -> None:
    """Run the package statistics export for the given directory names.

    Both arguments are converted to ``Path`` objects, passed to
    ``ensure_dirs`` and then to ``packages_csv``.

    Args:
        input: Name of the input directory.
        output: Name of the output directory.
    """
    source, target = Path(input), Path(output)
    ensure_dirs(source, target)
    packages_csv(source, target)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import sys

    # Expect exactly two arguments after the script name: input and output.
    if len(sys.argv) != 3:
        # A string passed to sys.exit is written to stderr and the process
        # exits with status 1, keeping the usage error out of stdout.
        sys.exit("Please provide exactly one input directory and one output directory.")
    main(sys.argv[1], sys.argv[2])
|
||||||
5
justfile
5
justfile
|
|
@ -1,4 +1,4 @@
|
||||||
default: files kernels unique
|
default: files kernels unique packages
|
||||||
|
|
||||||
files:
|
files:
|
||||||
python code/files.py input output
|
python code/files.py input output
|
||||||
|
|
@ -8,3 +8,6 @@ kernels:
|
||||||
|
|
||||||
unique:
|
unique:
|
||||||
python code/unique.py input output
|
python code/unique.py input output
|
||||||
|
|
||||||
|
packages:
|
||||||
|
python code/packages.py input output
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue