chore(code): Refactor yml loading code

Update the default YAML path for the new data directory layout and
restructure the code so that the from_yml and to_tsv functions live in
the same data loading file.
Improve path handling by resolving all paths to absolute paths before
loading.
This commit is contained in:
Marty Oehme 2023-12-07 13:04:29 +01:00
parent d3a6b1296b
commit 446db080f6
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 33 additions and 31 deletions

View file

@ -4,19 +4,41 @@ import sys
import load_yaml import load_yaml
from pandas import DataFrame, read_csv from pandas import DataFrame, read_csv
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant") DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame: def to_tsv(studies: list[dict]) -> str:
studies = load_yaml.from_yml(yml_path) if not studies:
tsv = load_yaml.to_tsv(studies) return ""
tsv = ""
tab = "\t"
tsv += (
f"{tab.join(studies[0].keys())}{tab}"
f"{tab.join(studies[0]['observation'][0].keys())}\n"
)
for study in studies:
study_list = [str(val).replace("\n", "") for val in study.values()]
for obs in study["observation"]:
obs_list = [str(val).replace("\n", "") for val in obs.values()]
tsv += f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n"
return tsv
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
yml_path = Path(yml_path).resolve()
studies = load_yaml.load(yml_path)
if not studies:
raise ValueError(f"No studies found in directory {yml_path.resolve()}")
tsv = to_tsv(studies)
return read_csv(io.StringIO(tsv), sep="\t") return read_csv(io.StringIO(tsv), sep="\t")
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 2: if len(sys.argv) == 2:
res = load(Path(sys.argv[1])) res = from_yml(Path(sys.argv[1]))
else: else:
res = load() res = from_yml()
print(res) print(res)
# print out tsv file instead
# print(to_tsv(load_yaml.load(DEFAULT_YAML_PATH)))

View file

@ -3,12 +3,10 @@ from typing import cast
import yaml import yaml
from pathlib import Path from pathlib import Path
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
def _get_all_yml(path: Path) -> list: def _get_all_yml(path: Path) -> list:
"""Returns list of all yml files.""" """Returns list of all yml files."""
return list(path.rglob(r"*.y*ml")) return list(path.rglob(r"**/*.y*ml"))
def _read_yml(path: Path) -> dict | None: def _read_yml(path: Path) -> dict | None:
@ -20,9 +18,7 @@ def _read_yml(path: Path) -> dict | None:
return None return None
def from_yml( def load(yml_path: Path | str) -> list[dict]:
yml_path: Path | str = DEFAULT_YAML_PATH,
) -> list[dict]:
"""Main data process routine. """Main data process routine.
Extracts all necessary data from yaml files returns it. Extracts all necessary data from yaml files returns it.
""" """
@ -35,25 +31,9 @@ def from_yml(
return contents return contents
def to_tsv(studies: list[dict]) -> str:
tsv = ""
tab = "\t"
tsv += (
f"{tab.join(studies[0].keys())}{tab}"
f"{tab.join(studies[0]['observation'][0].keys())}\n"
)
for study in studies:
study_list = [str(val).replace("\n", "") for val in study.values()]
for obs in study["observation"]:
obs_list = [str(val).replace("\n", "") for val in obs.values()]
tsv += f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n"
return tsv
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 2: if len(sys.argv) == 2:
res = from_yml(Path(sys.argv[1])) res = load(Path(sys.argv[1]))
print(res)
else: else:
res = from_yml() print("Please provide path to yml files.")
print(to_tsv(res))