import os
import re
from pathlib import Path

## standard imports
import bibtexparser
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, Markdown, display
from matplotlib import pyplot as plt
from tabulate import tabulate

## project-local modules
from src.extract import load_data as load
from src.model import validity
from src.process import add_metadata as meta

sns.set_style("whitegrid")

# Data locations, resolved relative to the Quarto project root; falls back to
# the current working directory when QUARTO_PROJECT_DIR is not set.
PROJECT_DIR = Path(os.getenv("QUARTO_PROJECT_DIR", "."))
DATA_DIR = PROJECT_DIR / "02-data"
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"

## Creates 3 important data structures:
# df: The main dataframe containing all final sample studies
# df_by_intervention: The same dataframe but split up by individual
#   interventions per study
# validities: The studies with their validities, containing only
#   quasi-/experimental studies

# Columns that together identify one study when collapsing observations.
_STUDY_KEYS = [
    "author",
    "year",
    "title",
    "design",
    "method",
    "representativeness",
    "citation",
]

# Deliberately stops at the first ")": a greedy r"\(.*\)" applied to the
# "; "-joined cell would delete everything between the first "(" and the
# last ")", silently dropping any interventions listed in between.
_PARENTHETICAL = re.compile(r"\([^)]*\)")


def _split_interventions(cell: str) -> list:
    """Return the distinct intervention names of a ';'-separated cell.

    Parenthesised qualifiers are removed before splitting and each name is
    stripped of surrounding whitespace. The result is a sorted list rather
    than a set so that the later ``explode`` yields rows in a reproducible
    order.
    """
    names = {part.strip() for part in _PARENTHETICAL.sub("", cell).split(";")}
    return sorted(names)


# NOTE(review): the original script called these helpers through a name
# `prep_data` that is never imported here. They are assumed to live in
# src.process.add_metadata (imported as `meta`), next to
# observations_with_metadata_df -- confirm against that module.

# raw database-search results
bib_sample_raw_db = meta.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = meta.bib_library_from_dir(WORKING_DATA)

# load relevant studies
# (`load` is the alias bound by the import above; `load_data` itself is not
# a name in this module.)
raw_observations = load.from_yml(PROCESSED_DATA)
study_metadata = meta.bib_metadata_df(bib_sample)
country_groups = meta.country_groups_df(
    SUPPLEMENTARY_DATA / "wb-country-groupings.xlsx"
)

# each observation in a single dataframe
# NOTE(review): the three inputs were previously bound as stray 1-tuples
# (trailing commas) and never passed to the call. The keyword names below are
# assumed to match the function's parameters -- confirm against its signature.
df = meta.observations_with_metadata_df(
    raw_observations=raw_observations,
    study_metadata=study_metadata,
    country_groups=country_groups,
)

# all observations but split per individual intervention
df_by_intervention = (
    df
    # Missing values become "" so they can be used as group keys and joined
    # as strings below.
    .fillna("")
    .groupby(_STUDY_KEYS)
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_split_interventions),
    )
    # One row per (study, intervention) pair.
    .explode("intervention")
)

# Calc study validities (internal & external separated)
validities = validity.calculate(df_by_intervention)
# Short label of the form "<author text before the first comma> (<year>)".
validities["identifier"] = (
    validities["author"].str.replace(r',.*$', '', regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)
# Keep only (quasi-)experimental designs. The explicit copy makes the column
# assignments below operate on an independent frame instead of a slice.
validities = validities.loc[
    validities["design"].isin(["quasi-experimental", "experimental"])
].copy()
# NOTE(review): the categorical conversion of external_validity was already
# disabled in the original; kept disabled here.
#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype('category')
# Human-readable duplicates of the two validity columns.
validities["External Validity"] = validities["external_validity"]
validities["Internal Validity"] = validities["internal_validity"]