Remove redundant cleaning files
parent 0618814c49
commit 66b0464809
3 changed files with 0 additions and 169 deletions
clean.py (156 deletions)

@@ -1,156 +0,0 @@
import csv
import json
from datetime import datetime
from pathlib import Path
from typing import Generator


def json_to_daily_pkg(input_dir: Path, output_dir: Path, force: bool = False) -> None:
    if not input_dir.is_dir():
        raise ValueError
    output_dir.mkdir(exist_ok=True, parents=True)

    # only clean if raw files have been updated since last time
    if not force and not newer_than_last_clean(
        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily packages"
    ):
        return

    for j in input_dir.glob("*.json"):
        with open(j) as fr:
            date = j.stem
            output_file = output_dir / f"{date}.csv"
            try:
                data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")

            if "Packages" not in data:
                print(
                    f"WARN: No correct json structure containing 'Packages' field in file {j}"
                )

            with open(output_file, "w") as fw:
                writer = csv.writer(fw)
                writer.writerow(["date", "package", "downloads"])
                for entry in data["Packages"]:
                    p_name = entry
                    p_count = data["Packages"][entry]
                    p_date = date
                    writer.writerow([p_date, p_name, p_count])
                    # print(output_file, p_date, p_name, p_count)


def json_to_unique_csv(input_dir: Path, output_dir: Path, force: bool = False) -> None:
    if not input_dir.is_dir():
        raise ValueError
    output_dir.mkdir(exist_ok=True, parents=True)

    # only clean if raw files have been updated since last time
    if not force and not newer_than_last_clean(
        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily unique installs"
    ):
        return

    output_file = output_dir / "unique_installs.csv"
    with open(output_file, "w") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "unique"])

        for j in input_dir.glob("*.json"):
            with open(j) as fr:
                date = j.stem
                try:
                    data = json.load(fr)
                except json.JSONDecodeError:
                    print(f"WARN: Could not decode JSON data for file {j}")

                if "UniqueInstalls" not in data:
                    print(
                        f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
                    )

                p_date = date
                p_count = data["UniqueInstalls"]
                writer.writerow([p_date, p_count])
                # print(output_file, p_date, p_count)


def json_to_daily_kernel_csv(
    input_dir: Path, output_dir: Path, force: bool = False
) -> None:
    if not input_dir.is_dir():
        raise ValueError
    output_dir.mkdir(exist_ok=True, parents=True)

    # only clean if raw files have been updated since last time
    if not force and not newer_than_last_clean(
        input_dir.glob("*.json"), output_dir.glob("*.csv"), desc="daily kernels"
    ):
        return

    for j in input_dir.glob("*.json"):
        with open(j) as fr:
            date = j.stem
            output_file = output_dir / f"{date}.csv"
            try:
                data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")

            if "Packages" not in data:
                print(
                    f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
                )

            with open(output_file, "w") as fw:
                writer = csv.writer(fw)
                writer.writerow(["date", "kernel", "downloads"])
                for entry in data["XuKernel"]:
                    p_name = entry
                    p_count = data["XuKernel"][entry]
                    p_date = date
                    writer.writerow([p_date, p_name, p_count])
                    # print(output_file, p_date, p_name, p_count)


def newer_than_last_clean(
    input_glob: Generator, output_glob: Generator, desc: str | None = None
) -> bool:
    last_raw_update: datetime = datetime(1, 1, 1)
    oldest_clean: datetime = datetime.now()
    for file in input_glob:
        mdate = datetime.fromtimestamp(file.stat().st_mtime)
        if mdate > last_raw_update:
            last_raw_update = mdate
    for file in output_glob:
        mdate = datetime.fromtimestamp(file.stat().st_mtime)
        if mdate < oldest_clean:
            oldest_clean = mdate
    if oldest_clean > last_raw_update:
        print(
            "INFO: Skip creating cleaned data"
            f"{f' for {desc}' if desc else ''}"
            f", last processing {oldest_clean} newer than {last_raw_update}"
        )
        return False
    return True


def main(input: str, output: str) -> None:
    json_to_daily_pkg(Path(input) / "daily", Path(output) / "daily", force=True)
    json_to_unique_csv(Path(input) / "daily", Path(output), force=True)
    json_to_daily_kernel_csv(
        Path(input) / "daily", Path(output) / "kernels", force=True
    )


if __name__ == "__main__":
    import sys

    if not len(sys.argv) == 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2]
    main(inp, out)
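For reference, the deleted clean.py took exactly two positional arguments, a raw-data directory and an output directory, as its __main__ block shows. A hypothetical invocation would look like the following (the directory names are illustrative, not taken from the repository):

    python clean.py raw_data/ cleaned_data/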
grab_all.sh (13 deletions)

@@ -1,13 +0,0 @@
#!/usr/bin/env bash

base_url="https://popcorn.voidlinux.org/popcorn_"
ft=".json"

start_date="2018-05-08"
end_date="2025-09-25"
while [[ "$start_date" != "$end_date" ]]; do
    echo "$start_date"
    start_date=$(date --date "$start_date + 1 day" +"%Y-%m-%d")
    wget -O "${start_date}.json" "${base_url}${start_date}${ft}"
done