"""Flatten study records loaded from YAML into a pandas DataFrame via TSV."""

import io
from pathlib import Path
import sys

try:
    import src.load_yaml as yaml  # for quarto document scripts
except ModuleNotFoundError:
    import load_yaml as yaml  # for directly running the package

from pandas import DataFrame, read_csv

DEFAULT_YAML_PATH = Path("02-data/processed/relevant")

# Characters that would break the "one record per line, tab-separated" layout:
# line breaks are dropped, tabs become a space so words stay separated.
_CELL_SANITIZER = str.maketrans({"\n": None, "\r": None, "\t": " "})


def _clean_cell(value: object) -> str:
    """Return ``value`` as text that is safe to place in a single TSV cell."""
    return str(value).translate(_CELL_SANITIZER)


def to_tsv(studies: list[dict]) -> str:
    """Serialize studies to TSV text, one line per (study, observation) pair.

    Each study is a dict that must contain an ``"observation"`` entry holding
    an iterable of observation dicts. The header is the keys of the first
    study followed by the keys of the first observation found in any study.
    Every data line repeats the study's values followed by one observation's
    values.

    NOTE(review): the study's own ``"observation"`` value is also written as a
    (stringified) study-level column, exactly as before; confirm whether that
    column is wanted downstream.

    Args:
        studies: Study dicts to flatten.

    Returns:
        The TSV text (newline-terminated lines), or ``""`` if ``studies`` is
        empty. If no study has any observation, only the study-level header
        line is returned.

    Raises:
        KeyError: If a study has no ``"observation"`` key.
    """
    if not studies:
        return ""

    # Use the first observation available anywhere for the header, so an
    # empty observation list on the first study does not raise IndexError.
    first_obs = next(
        (obs for study in studies for obs in study["observation"]),
        None,
    )

    header = "\t".join(studies[0].keys())
    if first_obs is not None:
        header = f"{header}\t{chr(9).join(first_obs.keys())}"

    lines = [header]
    for study in studies:
        study_cells = "\t".join(_clean_cell(val) for val in study.values())
        for obs in study["observation"]:
            obs_cells = "\t".join(_clean_cell(val) for val in obs.values())
            lines.append(f"{study_cells}\t{obs_cells}")

    # Join once instead of growing a string with += in the loop.
    return "\n".join(lines) + "\n"


def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load studies from ``yml_path`` and return them as a flat DataFrame.

    The path is resolved to an absolute path and handed to the project's
    ``load_yaml.load`` helper (presumably returning a list of study dicts;
    verify against that module). The result is serialized with ``to_tsv`` and
    parsed back with ``pandas.read_csv``.

    Args:
        yml_path: Location passed to the YAML loader.

    Returns:
        DataFrame parsed from the TSV representation of the studies.

    Raises:
        ValueError: If the loader returns nothing (empty or falsy result).
    """
    yml_path = Path(yml_path).resolve()
    studies = yaml.load(yml_path)
    if not studies:
        raise ValueError(f"No studies found in directory {yml_path}")
    tsv = to_tsv(studies)
    return read_csv(io.StringIO(tsv), sep="\t")


if __name__ == "__main__":
    # Optional single CLI argument: the path to load instead of the default.
    if len(sys.argv) == 2:
        res = from_yml(Path(sys.argv[1]))
    else:
        res = from_yml()

    # Render the frame as CSV in memory and print it to stdout.
    output = io.StringIO()
    res.to_csv(output)
    print(output.getvalue())