2023-12-05 15:24:11 +00:00
|
|
|
import io
|
|
|
|
from pathlib import Path
|
|
|
|
import sys
|
2023-12-14 17:08:13 +00:00
|
|
|
try:
|
|
|
|
import src.load_yaml as yaml # for quarto document scripts
|
|
|
|
except ModuleNotFoundError:
|
|
|
|
import load_yaml as yaml # for directly running the package
|
2023-12-05 15:24:11 +00:00
|
|
|
from pandas import DataFrame, read_csv
|
|
|
|
|
2023-12-07 12:04:29 +00:00
|
|
|
# Path used by `from_yml` when the caller does not supply one. It is relative,
# and `from_yml` resolves it against the current working directory.
DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
|
2023-12-05 15:24:11 +00:00
|
|
|
|
|
|
|
|
2023-12-07 12:04:29 +00:00
|
|
|
def _tsv_cell(value: object) -> str:
    """Render *value* as text that fits inside a single TSV cell.

    Line breaks are removed and tabs are replaced by a space, so a value can
    neither start a new row nor shift the columns that follow it.
    """
    text = str(value).replace("\n", "").replace("\r", "")
    return text.replace("\t", " ")


def to_tsv(studies: list[dict]) -> str:
    """Flatten studies and their observations into tab-separated text.

    Every study is a mapping with an ``"observation"`` entry that holds a
    sequence of observation mappings. One row is written per observation:
    the study's own values (including the stringified ``"observation"``
    value) followed by the values of that observation. A study without
    observations contributes no rows.

    The header is built from the keys of the first study and the keys of the
    first observation found in any study; the remaining studies and
    observations are assumed to use the same keys in the same order.

    Args:
        studies: Study mappings to flatten.

    Returns:
        The TSV text, one line per row and terminated by a newline, or an
        empty string when ``studies`` is empty.

    Raises:
        KeyError: If a study has no ``"observation"`` entry.
    """
    if not studies:
        return ""

    tab = "\t"

    # Take the observation column names from the first observation available
    # anywhere, so an observation-less first study does not abort the export.
    first_obs = next(
        (obs for study in studies for obs in study["observation"]),
        None,
    )
    header = tab.join(studies[0].keys())
    if first_obs is not None:
        header = f"{header}{tab}{tab.join(first_obs.keys())}"

    # Collect the lines and join once instead of growing a string with `+=`.
    lines = [header]
    for study in studies:
        study_part = tab.join(_tsv_cell(val) for val in study.values())
        for obs in study["observation"]:
            obs_part = tab.join(_tsv_cell(val) for val in obs.values())
            lines.append(f"{study_part}{tab}{obs_part}")

    return "\n".join(lines) + "\n"
|
|
|
|
|
|
|
|
|
|
|
|
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load the studies found at *yml_path* into a pandas ``DataFrame``.

    The path is resolved to an absolute path and handed to ``yaml.load``.
    The studies it returns are flattened with ``to_tsv`` and the resulting
    text is parsed by pandas as tab-separated data.

    Args:
        yml_path: Location to load from; defaults to ``DEFAULT_YAML_PATH``.

    Returns:
        The frame pandas builds from the flattened studies.

    Raises:
        ValueError: If the loader returns nothing for the given location.
    """
    yml_path = Path(yml_path).resolve()
    studies = yaml.load(yml_path)

    if studies:
        # Go through an in-memory text buffer so pandas can parse the TSV.
        buffer = io.StringIO(to_tsv(studies))
        return read_csv(buffer, sep="\t")

    raise ValueError(f"No studies found in directory {yml_path.resolve()}")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line use: an optional single argument names the location to
    # load; with any other argument count the default location is used.
    if len(sys.argv) == 2:
        res = from_yml(Path(sys.argv[1]))
    else:
        res = from_yml()

    # With no target given, `to_csv` returns the CSV text itself, so no
    # intermediate buffer (or extra import) is needed before printing.
    print(res.to_csv())
|