feat(script): Add simple yaml to dataframe loader

Ingests all yaml files in the directory, and loads the data for each study.

Currently creates one dataframe row for each *observation* in the pool
(there can be multiple per study, if a study has multiple analyses for
different independent/dependent vars). This follows the tidy data
paradigm of one observation per row.
This commit is contained in:
Marty Oehme 2023-12-05 16:24:11 +01:00
parent 5b78a7fb1a
commit c7533e01d6
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 81 additions and 0 deletions

22
01-scripts/data.py Normal file
View file

@ -0,0 +1,22 @@
import io
from pathlib import Path
import sys
import load_yaml
from pandas import DataFrame, read_csv
# Directory used by load() when the caller passes no path. It is a relative
# path, so it is resolved against the current working directory.
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load all studies found under *yml_path* into one DataFrame.

    The studies are read with ``load_yaml.from_yml``, flattened to
    tab-separated text with ``load_yaml.to_tsv`` and parsed by pandas.

    Args:
        yml_path: Directory handed to ``load_yaml.from_yml``.

    Returns:
        The parsed table, or an empty DataFrame when no study was loaded.
    """
    studies = load_yaml.from_yml(yml_path)
    if not studies:
        # Without any study there is no header row to build or to parse,
        # so hand back an empty frame instead of failing further down.
        return DataFrame()
    tsv = load_yaml.to_tsv(studies)
    return read_csv(io.StringIO(tsv), sep="\t")
if __name__ == "__main__":
    # Exactly one command-line argument replaces the default YAML directory;
    # any other argument count falls back to the default.
    res = load(Path(sys.argv[1])) if len(sys.argv) == 2 else load()
    print(res)

59
01-scripts/load_yaml.py Normal file
View file

@ -0,0 +1,59 @@
import sys
from typing import cast
import yaml
from pathlib import Path
# Directory searched by from_yml() when the caller passes no path. It is a
# relative path, so it is resolved against the current working directory.
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
def _get_all_yml(path: Path) -> list[Path]:
    """Return all YAML files below *path*, searched recursively.

    Only regular files are returned (a directory whose name happens to match
    the pattern cannot be opened as a file), and the result is sorted because
    the order produced by ``rglob`` is not guaranteed to be stable.
    """
    return sorted(found for found in path.rglob(r"*.y*ml") if found.is_file())
def _read_yml(path: Path) -> dict | None:
try:
with open(path, "r") as f:
return yaml.safe_load(f)
except FileNotFoundError as e:
print(e)
return None
def from_yml(
    yml_path: Path | str = DEFAULT_YAML_PATH,
) -> list[dict]:
    """Load every study YAML file found below *yml_path*.

    Each file returned by ``_get_all_yml`` is parsed with ``_read_yml``.
    Results that are not mappings are skipped, and the ``annotation`` entry
    is removed from every remaining study when it is present.

    Args:
        yml_path: Directory to search for YAML files.

    Returns:
        One dict per loaded study; an empty list when nothing was loaded.
    """
    studies: list[dict] = []
    for source in _get_all_yml(Path(yml_path)):
        content = _read_yml(source)
        # _read_yml yields None for a missing file, and an empty document
        # also parses to None; neither can be handled as a study mapping.
        if not isinstance(content, dict):
            continue
        # pop() with a default tolerates studies without an annotation key.
        content.pop("annotation", None)
        studies.append(content)
    return studies
def to_tsv(studies: list[dict]) -> str:
    """Flatten studies into tab-separated text, one row per observation.

    The header consists of the keys of the first study followed by the keys
    of the first observation found. Each row repeats the study-level values
    and appends the values of one entry of the study's ``observation`` list.

    Args:
        studies: Study mappings, each optionally holding an ``observation``
            list of mappings.

    Returns:
        The table as a string with a trailing newline, or an empty string
        when *studies* is empty.
    """
    if not studies:
        return ""

    def _cell(value: object) -> str:
        # Newlines are removed as before; tabs are replaced by a space
        # because a literal tab inside a value would shift every following
        # column of that row.
        return str(value).replace("\n", "").replace("\t", " ")

    # Take the observation columns from the first observation available, so
    # a first study without observations no longer breaks the header.
    first_obs = next(
        (obs for study in studies for obs in study.get("observation") or []),
        None,
    )
    header = [str(key) for key in studies[0]]
    if first_obs is not None:
        header.extend(str(key) for key in first_obs)

    # Collect lines and join once instead of growing a string with +=.
    lines = ["\t".join(header)]
    for study in studies:
        # NOTE(review): the nested "observation" value is itself emitted as a
        # study-level column (stringified); presumably unintended - confirm.
        study_cells = [_cell(val) for val in study.values()]
        for obs in study.get("observation") or []:
            obs_cells = [_cell(val) for val in obs.values()]
            lines.append("\t".join(study_cells + obs_cells))
    return "\n".join(lines) + "\n"
if __name__ == "__main__":
    # Exactly one command-line argument replaces the default YAML directory;
    # the flattened studies are written to stdout as TSV.
    res = from_yml(Path(sys.argv[1])) if len(sys.argv) == 2 else from_yml()
    print(to_tsv(res))