diff --git a/01-scripts/data.py b/01-scripts/data.py
index b2575ad..8ba8f99 100644
--- a/01-scripts/data.py
+++ b/01-scripts/data.py
@@ -4,19 +4,57 @@ import sys
 import load_yaml
 from pandas import DataFrame, read_csv
 
-DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
+DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
 
 
-def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
-    studies = load_yaml.from_yml(yml_path)
-    tsv = load_yaml.to_tsv(studies)
+def _cell(value: object) -> str:
+    """Render a value as text that is safe to use as one TSV cell."""
+    # Newlines are dropped; tabs become spaces so they cannot shift columns.
+    return str(value).replace("\n", "").replace("\t", " ")
+
+
+def to_tsv(studies: list[dict]) -> str:
+    """Flatten studies into TSV text with one row per observation.
+
+    Each row holds the study's values followed by the values of one entry
+    of its "observation" list. The header is built from the keys of the
+    first study and of the first observation found in any study. Returns
+    an empty string when ``studies`` is empty.
+    """
+    if not studies:
+        return ""
+    # Scan all studies for a sample observation so that a first study with
+    # an empty observation list does not raise IndexError.
+    first_obs = next((obs for study in studies for obs in study["observation"]), {})
+    lines = ["\t".join([*studies[0], *first_obs])]
+    for study in studies:
+        study_cells = [_cell(val) for val in study.values()]
+        for obs in study["observation"]:
+            obs_cells = [_cell(val) for val in obs.values()]
+            lines.append("\t".join(study_cells + obs_cells))
+    return "\n".join(lines) + "\n"
+
+
+def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
+    """Load all studies found under ``yml_path`` into a DataFrame.
+
+    Raises:
+        ValueError: If ``load_yaml.load`` returns no studies.
+    """
+    yml_path = Path(yml_path).resolve()
+    studies = load_yaml.load(yml_path)
+    if not studies:
+        raise ValueError(f"No studies found in directory {yml_path}")
+    tsv = to_tsv(studies)
     return read_csv(io.StringIO(tsv), sep="\t")
 
 
 if __name__ == "__main__":
     if len(sys.argv) == 2:
-        res = load(Path(sys.argv[1]))
+        res = from_yml(Path(sys.argv[1]))
     else:
-        res = load()
+        res = from_yml()
 
     print(res)
+    # print out tsv file instead
+    # print(to_tsv(load_yaml.load(DEFAULT_YAML_PATH)))
diff --git a/01-scripts/load_yaml.py b/01-scripts/load_yaml.py
index 4b31409..f58fcb1 100644
--- a/01-scripts/load_yaml.py
+++ b/01-scripts/load_yaml.py
@@ -3,8 +3,6 @@ from typing import cast
 import yaml
 from pathlib import Path
 
-DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
-
 
 def _get_all_yml(path: Path) -> list:
     """Returns list of all yml files."""
@@ -21,5 +19,3 @@ def _read_yml(path: Path) -> dict | None:
 
 
-def from_yml(
-    yml_path: Path | str = DEFAULT_YAML_PATH,
-) -> list[dict]:
+def load(yml_path: Path | str) -> list[dict]:
@@ -36,24 +32,8 @@ def from_yml(
 
 
-def to_tsv(studies: list[dict]) -> str:
-    tsv = ""
-    tab = "\t"
-    tsv += (
-        f"{tab.join(studies[0].keys())}{tab}"
-        f"{tab.join(studies[0]['observation'][0].keys())}\n"
-    )
-    for study in studies:
-        study_list = [str(val).replace("\n", "") for val in study.values()]
-        for obs in study["observation"]:
-            obs_list = [str(val).replace("\n", "") for val in obs.values()]
-            tsv += f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n"
-    return tsv
-
-
 if __name__ == "__main__":
     if len(sys.argv) == 2:
-        res = from_yml(Path(sys.argv[1]))
+        res = load(Path(sys.argv[1]))
+        print(res)
     else:
-        res = from_yml()
-
-    print(to_tsv(res))
+        sys.exit("Please provide path to yml files.")