Marty Oehme
446db080f6
Fix for new paths and restructure to have the from_yml and to_tsv functions in the same data loading file. Improved path behavior to resolve any paths to absolutes before loading.
44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
import io
|
|
from pathlib import Path
|
|
import sys
|
|
import load_yaml
|
|
from pandas import DataFrame, read_csv
|
|
|
|
# Path used by from_yml() when the caller does not supply one. It is written
# relative here; from_yml() resolves it to an absolute path before loading.
DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
|
|
|
|
|
|
def _tsv_cell(value: object) -> str:
    """Render one value as text that is safe to place in a single TSV cell."""
    # A literal newline would split the row and a literal tab would shift every
    # following column, so neither may survive inside a cell. Newlines are
    # dropped; tabs become a single space so adjacent words stay separated.
    return str(value).replace("\n", "").replace("\t", " ")


def to_tsv(studies: list[dict]) -> str:
    """Flatten studies and their observations into tab-separated text.

    The header row is the keys of the first study followed by the keys of that
    study's first observation. Every observation then yields one data row: the
    stringified values of its parent study followed by the observation's own
    stringified values. A study with an empty observation list contributes no
    rows.

    Args:
        studies: Study mappings, each holding an ``"observation"`` entry that
            is a list of observation mappings.

    Returns:
        The TSV text with every row (header included) newline-terminated, or
        an empty string when ``studies`` is empty.

    Raises:
        KeyError: If a study has no ``"observation"`` entry.
        IndexError: If the first study's observation list is empty.
    """
    if not studies:
        return ""

    tab = "\t"
    first = studies[0]
    study_header = tab.join(first.keys())
    obs_header = tab.join(first["observation"][0].keys())

    # Collect rows in a list and join once instead of growing a string with +=.
    lines = [f"{study_header}{tab}{obs_header}\n"]
    for study in studies:
        # The study columns are identical for all of its observations.
        study_cells = tab.join(_tsv_cell(val) for val in study.values())
        for obs in study["observation"]:
            obs_cells = tab.join(_tsv_cell(val) for val in obs.values())
            lines.append(f"{study_cells}{tab}{obs_cells}\n")
    return "".join(lines)
|
|
|
|
|
|
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load studies from a YAML location and return them as a DataFrame.

    The path is resolved to an absolute one, handed to ``load_yaml.load``, and
    the loaded studies are serialised with ``to_tsv`` and parsed back through
    ``pandas.read_csv`` using a tab separator.

    Args:
        yml_path: Location to load from, as a ``Path`` or string; defaults to
            ``DEFAULT_YAML_PATH``.

    Returns:
        The table parsed from the generated TSV text.

    Raises:
        ValueError: If ``load_yaml.load`` returns nothing for the path.
    """
    # Resolve up front so the loader and the error message use the same
    # absolute path.
    yml_path = Path(yml_path).resolve()
    # NOTE(review): presumably a list of study dicts, as to_tsv expects --
    # confirm against load_yaml.
    loaded = load_yaml.load(yml_path)
    if loaded:
        buffer = io.StringIO(to_tsv(loaded))
        return read_csv(buffer, sep="\t")
    raise ValueError(f"No studies found in directory {yml_path.resolve()}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Exactly one command-line argument is taken as the path to load; any
    # other argument count uses from_yml's default path.
    res = from_yml(Path(sys.argv[1])) if len(sys.argv) == 2 else from_yml()

    print(res)
    # To print the raw TSV text instead of the parsed table:
    # print(to_tsv(load_yaml.load(DEFAULT_YAML_PATH)))
|