"""Flatten study YAML files into one tab-separated table.

Each YAML document is treated as a "study" mapping whose ``observation``
entry is a list of mappings; the TSV gets one row per observation, prefixed
with the values of the study it belongs to.
"""

import sys
from pathlib import Path
from typing import cast

import yaml

DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")

# File suffixes accepted as YAML documents.
_YAML_SUFFIXES = frozenset({".yml", ".yaml"})

# Newlines are dropped and tabs become spaces, so a single value can never
# break the row/column layout of the generated TSV.
_CELL_TRANSLATION = str.maketrans({"\n": None, "\t": " "})


def _get_all_yml(path: Path) -> list[Path]:
    """Return every ``.yml``/``.yaml`` file below *path*, sorted by path.

    The glob ``*.y*ml`` alone would also accept suffixes such as ``.yxml``
    and directories with a matching name, so candidates are filtered again.
    Sorting keeps the row order of the final table reproducible, because
    ``rglob`` yields entries in arbitrary order.
    """
    return sorted(
        candidate
        for candidate in path.rglob(r"*.y*ml")
        if candidate.is_file() and candidate.suffix.lower() in _YAML_SUFFIXES
    )


def _read_yml(path: Path) -> dict | None:
    """Parse the YAML file at *path* with ``yaml.safe_load``.

    Returns the parsed document, or ``None`` when the file does not exist.
    ``safe_load`` itself also yields ``None`` for an empty document, so
    callers must be prepared for ``None`` in both cases.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            # safe_load is untyped; the cast records the expected shape only.
            return cast(dict | None, yaml.safe_load(f))
    except FileNotFoundError as e:
        # Report on stderr: stdout carries the TSV when run as a script.
        print(e, file=sys.stderr)
        return None


def from_yml(
    yml_path: Path | str = DEFAULT_YAML_PATH,
) -> list[dict]:
    """Load all studies found below *yml_path*.

    Every YAML file is parsed; documents that could not be read, are empty,
    or are not mappings are skipped. The ``annotation`` entry is removed
    from each remaining study when present.

    Args:
        yml_path: Directory that is searched recursively for YAML files.

    Returns:
        The study mappings without their ``annotation`` entry; an empty
        list when nothing usable was found.
    """
    documents = (_read_yml(source) for source in _get_all_yml(Path(yml_path)))
    studies = [doc for doc in documents if isinstance(doc, dict)]
    for study in studies:
        # pop() instead of del: a study without annotation is not an error.
        study.pop("annotation", None)
    return studies


def _clean_cell(value: object) -> str:
    """Render *value* as text that is safe to place in a single TSV cell."""
    return str(value).translate(_CELL_TRANSLATION)


def to_tsv(studies: list[dict]) -> str:
    """Render *studies* as tab-separated text, one row per observation.

    The header consists of the keys of the first study followed by the keys
    of the first observation found in any study. Each data row repeats the
    study's values (including the stringified ``observation`` list itself)
    followed by the values of one observation. Studies without observations
    produce no rows.

    Args:
        studies: Study mappings, each optionally holding a list of
            observation mappings under ``"observation"``.

    Returns:
        The table with a trailing newline after every line, or an empty
        string when *studies* is empty.
    """
    if not studies:
        return ""

    tab = "\t"
    first_observation = next(
        (obs for study in studies for obs in study.get("observation") or []),
        None,
    )

    header = tab.join(str(key) for key in studies[0])
    if first_observation is not None:
        observation_header = tab.join(str(key) for key in first_observation)
        header = f"{header}{tab}{observation_header}"

    lines = [header]
    for study in studies:
        study_cells = tab.join(_clean_cell(val) for val in study.values())
        for obs in study.get("observation") or []:
            obs_cells = tab.join(_clean_cell(val) for val in obs.values())
            lines.append(f"{study_cells}{tab}{obs_cells}")

    # Collect lines and join once instead of growing a string with +=.
    return "".join(f"{line}\n" for line in lines)


def main() -> None:
    """Print the TSV for the directory given as the only CLI argument.

    Falls back to ``DEFAULT_YAML_PATH`` unless exactly one argument is given.
    """
    if len(sys.argv) == 2:
        res = from_yml(Path(sys.argv[1]))
    else:
        res = from_yml()
    print(to_tsv(res))


if __name__ == "__main__":
    main()