chore(code): Refactor yml loading code
Fix the code for the new data paths and restructure it so that the from_yml and to_tsv functions live in the same data-loading file. Improve path handling by resolving every path to an absolute path before loading.
This commit is contained in:
parent
d3a6b1296b
commit
446db080f6
2 changed files with 33 additions and 31 deletions
|
@ -4,19 +4,41 @@ import sys
|
||||||
import load_yaml
|
import load_yaml
|
||||||
from pandas import DataFrame, read_csv
|
from pandas import DataFrame, read_csv
|
||||||
|
|
||||||
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
|
DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
|
||||||
|
|
||||||
|
|
||||||
def load(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
|
def to_tsv(studies: list[dict]) -> str:
|
||||||
studies = load_yaml.from_yml(yml_path)
|
if not studies:
|
||||||
tsv = load_yaml.to_tsv(studies)
|
return ""
|
||||||
|
tsv = ""
|
||||||
|
tab = "\t"
|
||||||
|
tsv += (
|
||||||
|
f"{tab.join(studies[0].keys())}{tab}"
|
||||||
|
f"{tab.join(studies[0]['observation'][0].keys())}\n"
|
||||||
|
)
|
||||||
|
for study in studies:
|
||||||
|
study_list = [str(val).replace("\n", "") for val in study.values()]
|
||||||
|
for obs in study["observation"]:
|
||||||
|
obs_list = [str(val).replace("\n", "") for val in obs.values()]
|
||||||
|
tsv += f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n"
|
||||||
|
return tsv
|
||||||
|
|
||||||
|
|
||||||
|
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
|
||||||
|
yml_path = Path(yml_path).resolve()
|
||||||
|
studies = load_yaml.load(yml_path)
|
||||||
|
if not studies:
|
||||||
|
raise ValueError(f"No studies found in directory {yml_path.resolve()}")
|
||||||
|
tsv = to_tsv(studies)
|
||||||
return read_csv(io.StringIO(tsv), sep="\t")
|
return read_csv(io.StringIO(tsv), sep="\t")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
res = load(Path(sys.argv[1]))
|
res = from_yml(Path(sys.argv[1]))
|
||||||
else:
|
else:
|
||||||
res = load()
|
res = from_yml()
|
||||||
|
|
||||||
print(res)
|
print(res)
|
||||||
|
# print out tsv file instead
|
||||||
|
# print(to_tsv(load_yaml.load(DEFAULT_YAML_PATH)))
|
||||||
|
|
|
@ -3,12 +3,10 @@ from typing import cast
|
||||||
import yaml
|
import yaml
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
DEFAULT_YAML_PATH = Path("../02-data/intermediate/relevant")
|
|
||||||
|
|
||||||
|
|
||||||
def _get_all_yml(path: Path) -> list:
|
def _get_all_yml(path: Path) -> list:
|
||||||
"""Returns list of all yml files."""
|
"""Returns list of all yml files."""
|
||||||
return list(path.rglob(r"*.y*ml"))
|
return list(path.rglob(r"**/*.y*ml"))
|
||||||
|
|
||||||
|
|
||||||
def _read_yml(path: Path) -> dict | None:
|
def _read_yml(path: Path) -> dict | None:
|
||||||
|
@ -20,9 +18,7 @@ def _read_yml(path: Path) -> dict | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def from_yml(
|
def load(yml_path: Path | str) -> list[dict]:
|
||||||
yml_path: Path | str = DEFAULT_YAML_PATH,
|
|
||||||
) -> list[dict]:
|
|
||||||
"""Main data process routine.
|
"""Main data process routine.
|
||||||
Extracts all necessary data from yaml files returns it.
|
Extracts all necessary data from yaml files returns it.
|
||||||
"""
|
"""
|
||||||
|
@ -35,25 +31,9 @@ def from_yml(
|
||||||
return contents
|
return contents
|
||||||
|
|
||||||
|
|
||||||
def to_tsv(studies: list[dict]) -> str:
|
|
||||||
tsv = ""
|
|
||||||
tab = "\t"
|
|
||||||
tsv += (
|
|
||||||
f"{tab.join(studies[0].keys())}{tab}"
|
|
||||||
f"{tab.join(studies[0]['observation'][0].keys())}\n"
|
|
||||||
)
|
|
||||||
for study in studies:
|
|
||||||
study_list = [str(val).replace("\n", "") for val in study.values()]
|
|
||||||
for obs in study["observation"]:
|
|
||||||
obs_list = [str(val).replace("\n", "") for val in obs.values()]
|
|
||||||
tsv += f"{tab.join(study_list)}{tab}{tab.join(obs_list)}\n"
|
|
||||||
return tsv
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
res = from_yml(Path(sys.argv[1]))
|
res = load(Path(sys.argv[1]))
|
||||||
|
print(res)
|
||||||
else:
|
else:
|
||||||
res = from_yml()
|
print("Please provide path to yml files.")
|
||||||
|
|
||||||
print(to_tsv(res))
|
|
||||||
|
|
Loading…
Reference in a new issue