72 lines
2.7 KiB
Python
72 lines
2.7 KiB
Python
from pathlib import Path
|
|
import os
|
|
import re
|
|
## standard imports
|
|
from IPython.display import display, Markdown, HTML
|
|
import numpy as np
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
from tabulate import tabulate
|
|
import bibtexparser
|
|
|
|
# Use seaborn's "whitegrid" theme for every plot rendered after this point.
sns.set_style(style="whitegrid")
|
|
|
|
# Project root: taken from QUARTO_PROJECT_DIR when that variable is set,
# otherwise the current working directory.
PROJECT_DIR = Path(os.getenv("QUARTO_PROJECT_DIR", "."))

# All data folders live below <project>/02-data.
DATA_DIR = PROJECT_DIR / "02-data"
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"
|
|
## Creates 3 important data structures:
# df: the main dataframe containing all studies of the final sample
# df_by_intervention: the same data, split up into one row per individual intervention of each study
# validities: the studies with their validities, restricted to quasi-experimental and experimental designs
|
|
|
|
from src.process import add_metadata as meta

# NOTE(review): the original called these helpers on `prep_data`, a name that
# is never imported in this file (NameError at run time). They are assumed to
# live in `src.process.add_metadata`, the only `src.process` module in scope
# here -- confirm against the package.

# raw database-search results
bib_sample_raw_db = meta.bib_library_from_dir(RAW_DATA)

# the complete library of sampled (and working) literature
bib_sample = meta.bib_library_from_dir(WORKING_DATA)
|
|
|
|
# load relevant studies
from src.extract import load_data as load

# Each observation in a single dataframe.
#
# The three inputs are passed as keyword arguments. In the original they were
# standalone statements ending in a comma, which bound each name to a 1-tuple
# and left `observations_with_metadata_df()` called with no arguments at all.
# The module imported above is aliased `load`, so `load_data.from_yml` was an
# undefined name as well.
# NOTE(review): `bib_metadata_df` / `country_groups_df` were called on the
# never-imported `prep_data`; they are assumed to live in `add_metadata`
# (`meta`) -- confirm against the package.
df = meta.observations_with_metadata_df(
    raw_observations=load.from_yml(PROCESSED_DATA),
    study_metadata=meta.bib_metadata_df(bib_sample),
    country_groups=meta.country_groups_df(
        SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx")
    ),
)
|
|
|
|
# all observations but split per individual intervention
def _split_interventions(cell):
    """Return the distinct intervention names contained in *cell*.

    *cell* is a ";"-separated string of interventions, each optionally
    followed by a parenthesised note. Notes are removed, names are stripped
    of surrounding whitespace, and duplicates are dropped while keeping the
    order of first appearance.
    """
    # Non-greedy match: a greedy `\(.*\)` would swallow everything between
    # the first "(" and the last ")" of the whole string, deleting every
    # intervention listed in between.
    without_notes = re.sub(r"\(.*?\)", "", cell)
    names = (part.strip() for part in without_notes.split(";"))
    # dict.fromkeys de-duplicates like a set but keeps a stable order, so the
    # rows produced by explode() below come out identically on every run.
    return list(dict.fromkeys(names))


df_by_intervention = (
    df
    # missing values become "" so every grouping key and joined cell is a string
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
    # collapse all intervention cells of one study into a single ";"-separated string
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_split_interventions),
    )
    # one row per (study, intervention)
    .explode("intervention")
)
|
|
|
|
# Calc study validities (internal & external separated)
from src.model import validity

validities = validity.calculate(df_by_intervention)

# Short label: the author string up to (not including) its first comma,
# followed by the year in parentheses.
validities["identifier"] = (
    validities["author"].str.replace(r',.*$', '', regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)

# Keep only quasi-experimental and experimental studies. The explicit copy
# makes the filtered frame independent, so the column assignments below write
# to it unambiguously instead of raising SettingWithCopyWarning.
validities = validities.loc[
    validities["design"].isin(["quasi-experimental", "experimental"])
].copy()

# NOTE(review): only internal_validity is cast to categorical; the matching
# cast for external_validity was commented out in the original -- confirm
# that this asymmetry is intended.
validities["internal_validity"] = validities["internal_validity"].astype('category')

# Duplicate columns under title-cased names.
validities["External Validity"] = validities["external_validity"]
validities["Internal Validity"] = validities["internal_validity"]
|