feat(script): Add simple yaml to dataframe loader

Ingests all yaml files in the directory and loads the data for each study. Currently creates one dataframe row for each *observation* in the pool (there can be multiple per study if a study has multiple analyses for different independent/dependent vars). This follows the tidy data paradigm of one observation per row.
2023-12-05 16:24:11 +01:00 · 2023-12-05 16:24:11 +01:00 · c7533e01d6
commit c7533e01d6
parent 5b78a7fb1a
2 changed files with 81 additions and 0 deletions
--- a/01-scripts/data.py
+++ b/01-scripts/data.py
@ -0,0 +1,22 @@
+import io
+from pathlib import Path
+import sys
+import load_yaml
+from pandas import DataFrame, read_csv
+
+# Default directory handed to load_yaml.from_yml when load() gets no path.
+# The path is relative, so it is resolved against the current working directory.
+DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
+
+
+def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
+    """Load all studies found under *yml_path* into a DataFrame.
+
+    The studies are read with ``load_yaml.from_yml``, flattened to TSV text
+    with ``load_yaml.to_tsv`` and parsed back with pandas.
+
+    Args:
+        yml_path: Directory containing the study YAML files.
+
+    Returns:
+        The parsed DataFrame, or an empty DataFrame when no studies were
+        found.
+    """
+    studies = load_yaml.from_yml(yml_path)
+    if not studies:
+        # Without a first study there is nothing to derive a header from,
+        # so skip the TSV round trip entirely.
+        return DataFrame()
+    tsv = load_yaml.to_tsv(studies)
+    return read_csv(io.StringIO(tsv), sep="\t")
+
+
+if __name__ == "__main__":
+    # Exactly one CLI argument is taken as the YAML directory; any other
+    # argument count falls back to the default path.
+    cli_args = sys.argv[1:]
+    res = load(Path(cli_args[0])) if len(cli_args) == 1 else load()
+
+    print(res)
--- a/01-scripts/load_yaml.py
+++ b/01-scripts/load_yaml.py
@ -0,0 +1,59 @@
+import sys
+from typing import cast
+import yaml
+from pathlib import Path
+
+# Default directory searched for YAML files when from_yml() gets no path.
+# The path is relative, so it is resolved against the current working directory.
+DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
+
+
+def _get_all_yml(path: Path) -> list:
+    """Return all YAML files below *path*, searched recursively.
+
+    Only regular files ending in ``.yml`` or ``.yaml`` are returned; the broad
+    ``*.y*ml`` glob on its own would also pick up names such as
+    ``notes.yhtml`` or matching directories. The result is sorted so the
+    processing order does not depend on the filesystem.
+    """
+    yaml_suffixes = {".yml", ".yaml"}
+    return sorted(
+        candidate
+        for candidate in path.rglob(r"*.y*ml")
+        if candidate.suffix.lower() in yaml_suffixes and candidate.is_file()
+    )
+
+
+def _read_yml(path: Path) -> dict | None:
+    """Parse a single YAML file.
+
+    Args:
+        path: Location of the YAML file.
+
+    Returns:
+        Whatever ``yaml.safe_load`` produces for the file (``None`` for an
+        empty document), or ``None`` when the file does not exist.
+    """
+    try:
+        # Read as UTF-8 explicitly instead of relying on the platform default.
+        with open(path, "r", encoding="utf-8") as f:
+            return yaml.safe_load(f)
+    except FileNotFoundError as e:
+        # Report on stderr so the message cannot end up inside the TSV that
+        # the script prints on stdout.
+        print(e, file=sys.stderr)
+    return None
+
+
+def from_yml(
+    yml_path: Path | str = DEFAULT_YAML_PATH,
+) -> list[dict]:
+    """Load every study found under *yml_path*.
+
+    All YAML files located by ``_get_all_yml`` are parsed with ``_read_yml``.
+    Results that are not mappings are skipped, and the ``annotation`` entry
+    is removed from each study when present.
+
+    Args:
+        yml_path: Directory to search for YAML files.
+
+    Returns:
+        The list of study mappings; empty when nothing usable was found.
+    """
+    studies: list[dict] = []
+    for source in _get_all_yml(Path(yml_path)):
+        study = _read_yml(source)
+        if not isinstance(study, dict):
+            # ``_read_yml`` yields None for a missing file, and an empty or
+            # non-mapping document cannot be treated as a study either.
+            continue
+        # Tolerate studies that carry no annotation instead of raising.
+        study.pop("annotation", None)
+        studies.append(study)
+    return studies
+
+
+def to_tsv(studies: list[dict]) -> str:
+    """Flatten studies into TSV text with one line per observation.
+
+    The header is built from the keys of the first study followed by the keys
+    of that study's first observation. Every data line repeats the study-level
+    values (including the stringified ``observation`` entry itself) followed
+    by the values of one observation.
+
+    Args:
+        studies: Study mappings, each holding an ``observation`` list of
+            mappings.
+
+    Returns:
+        The TSV text including the header line, or an empty string when
+        *studies* is empty.
+    """
+    if not studies:
+        # No first study to derive the header from.
+        return ""
+
+    tab = "\t"
+
+    def clean(value: object) -> str:
+        # Newlines are dropped and tabs become spaces so that a value can
+        # never break the line/column structure of the output.
+        return str(value).replace("\n", "").replace(tab, " ")
+
+    first = studies[0]
+    lines = [tab.join(first.keys()) + tab + tab.join(first["observation"][0].keys())]
+    for study in studies:
+        study_part = tab.join(clean(val) for val in study.values())
+        for obs in study["observation"]:
+            obs_part = tab.join(clean(val) for val in obs.values())
+            lines.append(study_part + tab + obs_part)
+    # Every line, the header included, is newline-terminated.
+    return "".join(line + "\n" for line in lines)
+
+
+if __name__ == "__main__":
+    # Exactly one CLI argument is taken as the YAML directory; any other
+    # argument count falls back to the default path.
+    cli_args = sys.argv[1:]
+    res = from_yml(Path(cli_args[0])) if len(cli_args) == 1 else from_yml()
+
+    print(to_tsv(res))