Validate CSV output schemas
Also moved the code directory to `src/`. There are reasons to do standard things in standard ways. While it is possible to make the `code/` directory work and have it recognized as a package path, this requires wrangling the pyproject.toml file. Additionally, any import from the `code.something` path shadows the Python stdlib `code` module. Even if that never causes a problem in practice, it is still good not to shadow standard-library modules.
This commit is contained in:
parent
de96b67fac
commit
2faeda87c3
14 changed files with 111 additions and 7 deletions
1
code/.gitattributes
vendored
1
code/.gitattributes
vendored
|
|
@ -1 +0,0 @@
|
|||
* annex.largefiles=nothing
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
# Popcorn dataset code
|
||||
|
||||
Each script can be run stand-alone like `python code/files.py <input-dir> <output-dir>`,
|
||||
exchanging the script file for the one intended.
|
||||
|
||||
It is suggested, however, to run the scripts using the `just` command runner from the
|
||||
dataset root, such as `just files` for the same effect as above.
|
||||
This will automatically populate the correct input and output directories.
|
||||
|
||||
To create new `datalad` versioned output data, run `just versioned` or `just` without any arguments.
|
||||
A new commit containing the updated data will be created,
|
||||
and an automatic entry in the CHANGELOG made.
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def filesize_csv(input_dir: Path, output_dir: Path) -> None:
    """Write files.csv listing (date, filename, mtime, filesize) for every
    *.json file in input_dir.

    The date column is the file stem, i.e. files are assumed to be named
    <date>.json — TODO confirm against the data producer.
    """
    output_file = output_dir / "files.csv"
    # newline="" is required when handing a file object to csv.writer,
    # otherwise row terminators are doubled on Windows.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "filename", "mtime", "filesize"])

        # sorted() gives a deterministic row order regardless of filesystem,
        # avoiding spurious diffs in versioned output data.
        for j in sorted(input_dir.glob("*.json")):
            stat = j.stat()
            writer.writerow([j.stem, j.name, stat.st_mtime, stat.st_size])
|
||||
|
||||
|
||||
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate that input_dir exists and create output_dir if needed.

    Raises:
        ValueError: if input_dir is not an existing directory.
    """
    if not input_dir.is_dir():
        # Include the offending path so the failure is actionable.
        raise ValueError(f"Input directory does not exist: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def main(input: str, output: str) -> None:
    """Entry point: resolve the CLI path strings and build files.csv."""
    paths = (Path(input), Path(output))
    ensure_dirs(*paths)
    filesize_csv(*paths)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Require exactly two positional arguments: input dir and output dir.
    if len(sys.argv) != 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2]
    main(inp, out)
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, cast
|
||||
|
||||
|
||||
def package_kernel_csv(input_dir: Path, output_dir: Path) -> None:
    """Write kernels.csv with one (date, kernel, downloads) row per kernel
    found in each daily *.json file in input_dir.

    The date column is the file stem, i.e. files are assumed to be named
    <date>.json. Files with invalid JSON or without a dict-valued
    'XuKernel' field are skipped with a warning.
    """
    output_file = output_dir / "kernels.csv"
    # newline="" is required when handing a file object to csv.writer,
    # otherwise row terminators are doubled on Windows.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "kernel", "downloads"])

        # sorted() gives a deterministic row order regardless of filesystem,
        # avoiding spurious diffs in versioned output data.
        for j in sorted(input_dir.glob("*.json")):
            try:
                with j.open(encoding="utf-8") as fr:
                    data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")
                continue

            # Also guard against a non-dict top-level document; the previous
            # check could raise TypeError on e.g. a top-level list.
            if not isinstance(data, dict) or not isinstance(data.get("XuKernel"), dict):
                print(
                    f"WARN: No correct json structure containing 'XuKernel' field in file {j}"
                )
                continue

            kernels = cast(dict[str, int], data["XuKernel"])
            # Iterate items() instead of re-indexing the dict per key.
            for kernel_name, downloads in kernels.items():
                writer.writerow([j.stem, kernel_name, downloads])
|
||||
|
||||
|
||||
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate that input_dir exists and create output_dir if needed.

    Raises:
        ValueError: if input_dir is not an existing directory.
    """
    if not input_dir.is_dir():
        # Include the offending path so the failure is actionable.
        raise ValueError(f"Input directory does not exist: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def main(input: str, output: str) -> None:
    """Entry point: resolve the CLI path strings and build kernels.csv."""
    in_path, out_path = Path(input), Path(output)
    ensure_dirs(in_path, out_path)
    package_kernel_csv(in_path, out_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Require exactly two positional arguments: input dir and output dir.
    if len(sys.argv) != 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2]
    main(inp, out)
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
|
||||
def packages_csv(input_dir: Path, output_dir: Path) -> None:
    """Write packages.csv with one (date, package, version, count) row per
    package version found in each daily *.json file in input_dir.

    The date column is the file stem, i.e. files are assumed to be named
    <date>.json. Files with invalid JSON or without a dict-valued
    'Versions' field are skipped with a warning, as are packages whose
    version entry is not itself a dict.
    """
    output_file = output_dir / "packages.csv"
    # newline="" is required when handing a file object to csv.writer,
    # otherwise row terminators are doubled on Windows.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "package", "version", "count"])

        # sorted() gives a deterministic row order regardless of filesystem,
        # avoiding spurious diffs in versioned output data.
        for j in sorted(input_dir.glob("*.json")):
            try:
                with j.open(encoding="utf-8") as fr:
                    data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")
                continue

            # Also guard against a non-dict top-level document; the previous
            # check could raise TypeError on e.g. a top-level list.
            if not isinstance(data, dict) or not isinstance(data.get("Versions"), dict):
                print(
                    f"WARN: No correct json structure containing 'Versions' field in file {j}"
                )
                continue

            data_versions = cast(dict[str, dict[str, int]], data["Versions"])
            for package_name, package_versions in data_versions.items():
                if not isinstance(package_versions, dict):
                    print(
                        f"WARN: No correct json version structure containing versions in the Version field in file {j}"
                    )
                    continue
                for version, count in package_versions.items():
                    writer.writerow([j.stem, package_name, version, count])
|
||||
|
||||
|
||||
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate that input_dir exists and create output_dir if needed.

    Raises:
        ValueError: if input_dir is not an existing directory.
    """
    if not input_dir.is_dir():
        # Include the offending path so the failure is actionable.
        raise ValueError(f"Input directory does not exist: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def main(input: str, output: str) -> None:
    """Entry point: resolve the CLI path strings and build packages.csv."""
    src, dst = Path(input), Path(output)
    ensure_dirs(src, dst)
    packages_csv(src, dst)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Require exactly two positional arguments: input dir and output dir.
    if len(sys.argv) != 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2]
    main(inp, out)
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def unique_install_csv(input_dir: Path, output_dir: Path) -> None:
    """Write unique_installs.csv with one (date, unique) row per daily
    *.json file in input_dir that carries an integer 'UniqueInstalls' field.

    The date column is the file stem, i.e. files are assumed to be named
    <date>.json. Files with invalid JSON or without an integer
    'UniqueInstalls' field are skipped with a warning.
    """
    output_file = output_dir / "unique_installs.csv"
    # Path.open for consistency with the sibling scripts; newline="" is
    # required when handing a file object to csv.writer.
    with output_file.open("w", newline="", encoding="utf-8") as fw:
        writer = csv.writer(fw)
        writer.writerow(["date", "unique"])

        # sorted() gives a deterministic row order regardless of filesystem,
        # avoiding spurious diffs in versioned output data.
        for j in sorted(input_dir.glob("*.json")):
            try:
                with j.open(encoding="utf-8") as fr:
                    data = json.load(fr)
            except json.JSONDecodeError:
                print(f"WARN: Could not decode JSON data for file {j}")
                continue

            count = data.get("UniqueInstalls") if isinstance(data, dict) else None
            # NOTE: isinstance(..., int) also accepts bool — this matches the
            # original behavior and is left unchanged.
            if not isinstance(count, int):
                print(
                    f"WARN: No correct json structure containing 'UniqueInstalls' field in file {j}"
                )
                continue

            writer.writerow([j.stem, count])
|
||||
|
||||
|
||||
def ensure_dirs(input_dir: Path, output_dir: Path) -> None:
    """Validate that input_dir exists and create output_dir if needed.

    Raises:
        ValueError: if input_dir is not an existing directory.
    """
    if not input_dir.is_dir():
        # Include the offending path so the failure is actionable.
        raise ValueError(f"Input directory does not exist: {input_dir}")
    output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
||||
def main(input: str, output: str) -> None:
    """Entry point: resolve the CLI path strings and build unique_installs.csv."""
    in_path = Path(input)
    out_path = Path(output)
    ensure_dirs(in_path, out_path)
    unique_install_csv(in_path, out_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Require exactly two positional arguments: input dir and output dir.
    if len(sys.argv) != 3:
        print("Please provide exactly one input directory and one output directory.")
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2]
    main(inp, out)
|
||||
Loading…
Add table
Add a link
Reference in a new issue