wow-inequalities/src/data.py

50 lines
1.4 KiB
Python
Raw Normal View History

import io
from pathlib import Path
import sys
try:
import src.load_yaml as yaml # for quarto document scripts
except ModuleNotFoundError:
import load_yaml as yaml # for directly running the package
from pandas import DataFrame, read_csv
# Path used by from_yml() when the caller does not supply one. It is relative,
# so from_yml() resolves it against the current working directory.
DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
def to_tsv(studies: list[dict]) -> str:
    """Flatten studies and their observations into tab-separated text.

    The header row consists of the keys of the first study followed by the
    keys of the first observation that can be found. Every observation then
    yields one data row: the stringified values of its study (including the
    study's own ``"observation"`` entry) followed by the stringified values
    of the observation itself. Studies without observations yield no rows.

    Args:
        studies: Study mappings, each holding a list of observation mappings
            under the ``"observation"`` key.

    Returns:
        The TSV text, one newline-terminated line per row, or an empty
        string when ``studies`` is empty.
    """
    if not studies:
        return ""

    tab = "\t"
    # Line breaks are dropped and tabs become spaces so that a cell value can
    # neither split its row nor shift the columns of the resulting TSV.
    cell_cleanup = str.maketrans({"\n": "", "\r": "", "\t": " "})

    def clean(value: object) -> str:
        return str(value).translate(cell_cleanup)

    # Use the first observation available anywhere for the header; the first
    # study may legitimately have none, which must not abort the export.
    first_obs = next(
        (study["observation"][0] for study in studies if study["observation"]),
        None,
    )
    header = tab.join(studies[0].keys())
    if first_obs is not None:
        header = f"{header}{tab}{tab.join(first_obs.keys())}"

    lines = [header]
    for study in studies:
        study_cells = tab.join(clean(val) for val in study.values())
        for obs in study["observation"]:
            obs_cells = tab.join(clean(val) for val in obs.values())
            lines.append(f"{study_cells}{tab}{obs_cells}")

    # Collect rows in a list and join once instead of growing a string.
    return "\n".join(lines) + "\n"
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load studies from *yml_path* and return them as a DataFrame.

    The path is resolved to an absolute one and handed to the project's
    ``load_yaml`` module (imported here as ``yaml``; this is not PyYAML).
    Whatever it returns is flattened with ``to_tsv`` and parsed back through
    ``pandas.read_csv``, so column types are inferred by pandas from text.

    Args:
        yml_path: Location of the study files; defaults to
            ``DEFAULT_YAML_PATH``.

    Returns:
        A DataFrame parsed from the TSV text that ``to_tsv`` produces.

    Raises:
        ValueError: If the loader returns nothing for the given path.
    """
    source_dir = Path(yml_path).resolve()
    loaded = yaml.load(source_dir)
    if loaded:
        buffer = io.StringIO(to_tsv(loaded))
        return read_csv(buffer, sep="\t")
    raise ValueError(f"No studies found in directory {source_dir.resolve()}")
if __name__ == "__main__":
    # Exactly one command-line argument is forwarded to from_yml() as the
    # path; with any other argument count the default path is used.
    cli_args = sys.argv[1:]
    frame = from_yml(Path(cli_args[0])) if len(cli_args) == 1 else from_yml()

    # Render the frame as CSV in memory, then print it to stdout.
    buffer = io.StringIO()
    frame.to_csv(buffer)
    print(buffer.getvalue())