From c7533e01d6d42c8302f75c48e04c7eda740c2935 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 5 Dec 2023 16:24:11 +0100
Subject: [PATCH] feat(script): Add simple yaml to dataframe loader

Ingests all yaml files in the directory, and loads the data for each
study.

Currently creates a dataframe row for each *observation* in the pool
(can be multiple per study, if a study has multiple analyses for
different independent/dependent vars). This is to follow the tidy data
paradigm of one observation per row.
---
 01-scripts/data.py      | 26 ++++++++++++++
 01-scripts/load_yaml.py | 75 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 01-scripts/data.py
 create mode 100644 01-scripts/load_yaml.py

diff --git a/01-scripts/data.py b/01-scripts/data.py
new file mode 100644
index 0000000..b2575ad
--- /dev/null
+++ b/01-scripts/data.py
@@ -0,0 +1,26 @@
+import io
+from pathlib import Path
+import sys
+import load_yaml
+from pandas import DataFrame, read_csv
+
+DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
+
+
+def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
+    """Returns all studies below yml_path with one observation per row."""
+    studies = load_yaml.from_yml(yml_path)
+    tsv = load_yaml.to_tsv(studies)
+    if not tsv:
+        # read_csv raises EmptyDataError when there is nothing to parse.
+        return DataFrame()
+    return read_csv(io.StringIO(tsv), sep="\t")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 2:
+        res = load(Path(sys.argv[1]))
+    else:
+        res = load()
+
+    print(res)
diff --git a/01-scripts/load_yaml.py b/01-scripts/load_yaml.py
new file mode 100644
index 0000000..4b31409
--- /dev/null
+++ b/01-scripts/load_yaml.py
@@ -0,0 +1,75 @@
+import sys
+from typing import cast
+import yaml
+from pathlib import Path
+
+DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
+
+
+def _get_all_yml(path: Path) -> list:
+    """Returns sorted list of all yml files."""
+    return sorted(path.rglob(r"*.y*ml"))
+
+
+def _read_yml(path: Path) -> dict | None:
+    """Returns the parsed yaml file, or None if the file is missing."""
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return yaml.safe_load(f)
+    except FileNotFoundError as e:
+        # Report on stderr so the tsv written to stdout stays clean.
+        print(e, file=sys.stderr)
+        return None
+
+
+def _to_cell(val: object) -> str:
+    """Returns val as text which fits into a single tsv cell."""
+    return str(val).replace("\n", "").replace("\t", " ")
+
+
+def from_yml(
+    yml_path: Path | str = DEFAULT_YAML_PATH,
+) -> list[dict]:
+    """Main data process routine.
+
+    Extracts all necessary data from the yaml files and returns it.
+    Files which are missing, empty or not a mapping are skipped.
+    """
+    contents = [_read_yml(source) for source in _get_all_yml(Path(yml_path))]
+    # _read_yml returns None for missing or empty files, which would
+    # otherwise break the annotation removal below.
+    studies = cast(list[dict], [c for c in contents if isinstance(c, dict)])
+    for study in studies:
+        # Tolerate studies which have no annotation entry.
+        study.pop("annotation", None)
+    return studies
+
+
+def to_tsv(studies: list[dict]) -> str:
+    """Returns the studies as tsv text with one observation per row.
+
+    The header row is built from the keys of the first study and of its
+    first observation. Without any studies an empty string is returned.
+    """
+    if not studies:
+        return ""
+    tab = "\t"
+    rows = [
+        f"{tab.join(studies[0].keys())}{tab}"
+        f"{tab.join(studies[0]['observation'][0].keys())}\n"
+    ]
+    for study in studies:
+        study_list = [_to_cell(val) for val in study.values()]
+        for obs in study["observation"]:
+            obs_list = [_to_cell(val) for val in obs.values()]
+            rows.append(f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n")
+    return "".join(rows)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 2:
+        res = from_yml(Path(sys.argv[1]))
+    else:
+        res = from_yml()
+
+    print(to_tsv(res))