Marty Oehme
ed3d09b3f7
For the first time we use the actual final extracted data from relevant studies to do analysis on instead of just the intermediate Zotero-provided metadata. We still inject the intermediate metadata where it may be useful (things like citation counts and keywords) but otherwise switch to the new data.
44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
import io
|
|
from pathlib import Path
|
|
import sys
|
|
from src import load_yaml
|
|
from pandas import DataFrame, read_csv
|
|
|
|
# Location read by from_yml() when the caller does not pass a path.
# It is a relative path; from_yml() resolves it before loading.
DEFAULT_YAML_PATH = Path("02-data/processed/relevant")
|
|
|
|
|
|
def _tsv_cell(value: object) -> str:
    """Render ``value`` as text that fits into a single TSV cell."""
    # Line breaks would start a new row and a tab would start a new column,
    # so neither may survive inside a cell. Line breaks are dropped (as the
    # function always did for "\n"); tabs become a plain space.
    return str(value).replace("\n", "").replace("\r", "").replace("\t", " ")


def to_tsv(studies: list[dict]) -> str:
    """Flatten studies and their observations into tab-separated text.

    Every observation of every study becomes one row: the study's own
    values (including the stringified ``observation`` entry itself)
    followed by the values of that observation. The header row is built
    from the keys of the first study plus the keys of the first
    observation found in any study.

    Args:
        studies: Study mappings; each must carry an ``"observation"`` key
            holding a sequence of observation mappings.

    Returns:
        The TSV text with every line terminated by a newline, or an empty
        string if ``studies`` is empty. Studies without observations
        contribute no rows.

    Raises:
        KeyError: If a study has no ``"observation"`` key.
    """
    if not studies:
        return ""

    # Take the observation column names from the first observation that
    # actually exists, so an observation-less first study no longer breaks
    # header creation.
    first_obs = next(
        (study["observation"][0] for study in studies if study["observation"]),
        {},
    )
    lines = ["\t".join([*studies[0], *first_obs])]

    for study in studies:
        study_cells = [_tsv_cell(val) for val in study.values()]
        lines.extend(
            "\t".join(study_cells + [_tsv_cell(val) for val in obs.values()])
            for obs in study["observation"] or ()
        )

    # Collect lines and join once instead of growing a string with "+=".
    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def from_yml(yml_path: Path | str = DEFAULT_YAML_PATH) -> DataFrame:
    """Load studies from ``yml_path`` and return them as a DataFrame.

    The path is resolved, handed to ``load_yaml.load``, and the result is
    serialised with ``to_tsv`` and parsed back through ``read_csv``.

    Args:
        yml_path: Location of the study data; defaults to
            ``DEFAULT_YAML_PATH``.

    Returns:
        The DataFrame that ``read_csv`` builds from the tab-separated text.

    Raises:
        ValueError: If ``load_yaml.load`` returns nothing for the path.
    """
    source_dir = Path(yml_path).resolve()
    # NOTE(review): load_yaml.load presumably returns a list of study
    # dicts, as to_tsv expects -- confirm against src/load_yaml.
    loaded = load_yaml.load(source_dir)
    if loaded:
        buffer = io.StringIO(to_tsv(loaded))
        return read_csv(buffer, sep="\t")
    raise ValueError(f"No studies found in directory {source_dir.resolve()}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Exactly one command-line argument is taken as the data path; with any
    # other argument count the default path is used.
    cli_args = sys.argv[1:]
    res = from_yml(Path(cli_args[0])) if len(cli_args) == 1 else from_yml()

    print(res)
    # print out tsv file instead
    # print(to_tsv(load_yaml.load(DEFAULT_YAML_PATH)))
|