## load data

Boilerplate imports and data loading:

```{python}
#| label: load-data
#| echo: false
from pathlib import Path
import re

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations = load_data.from_yml(PROCESSED_DATA),
    study_metadata = prep_data.bib_metadata_df(bib_sample),
    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```

## prep data

Map a 0-5 external validity score onto each row based on its 'representativeness':

```{python}
df = bib_df
vd = df[(df['design'] == 'quasi-experimental') | (df['design'] == 'experimental')]
vd = vd.assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

mask_subnational = vd['representativeness'].str.contains("subnational")
mask_national = vd['representativeness'].str.contains("national")
mask_regional = vd['representativeness'].str.contains("regional")
mask_local = vd['representativeness'].str.contains("local")

# masks are applied in order: later, more specific matches override broader ones
# (e.g. "subnational" also matches the "national" substring); rows matching nothing keep 0
vd.loc[mask_regional, 'valid_ext'] = 5
vd.loc[mask_national, 'valid_ext'] = 4
vd.loc[mask_subnational, 'valid_ext'] = 3
vd.loc[mask_local, 'valid_ext'] = 2

vd[['representativeness', 'valid_ext']]
```

Map an internal validity score based on study design/method:

```{python}
vd = vd.assign(valid_int=0)
vd["method"] = vd["method"].fillna("")

vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
vd.loc[vd['method'].str.contains("|".join(["RD", "regression.discontinuity"])), 'valid_int'] = 4.5
vd.loc[vd['method'].str.contains("|".join(["IV", "instrumental.variable"])), 'valid_int'] = 4.0
vd.loc[vd['method'].str.contains("|".join(["PSM", "propensity.score.matching"])), 'valid_int'] = 3.5
vd.loc[vd['method'].str.contains("|".join(["DM", "discontinuity.matching"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["DID", "difference.in.difference"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["OLS", "ordinary.least.square"])), 'valid_int'] = 2.0

vd[['method', 'valid_int']]
```
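As a quick sanity check of the two hand-assigned scores (an assumed inspection step, not part of the pipeline above), cross-tabulating `valid_int` against `valid_ext` shows how many studies land in each score combination and which rows no rule matched:

```{python}
# assumed sanity check: count studies per score combination;
# a 0 on either axis marks rows that no mapping rule above matched
pd.crosstab(vd["valid_int"], vd["valid_ext"], margins=True)
```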
## visualize data

Prep the by_intervention dataframe:

```{python}
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention
by_intervention = (
    bib_df
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

Then visualize the validities as a displot with external validity as the categorical x axis and internal validity as the hue facet. This nicely shows that studies with lower internal validity generally have higher external validity, and that the external scores form two humps at 3 and 5 (subnational and census-based):

```{python}
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"

# as displot to show hue-facetted density
sns.displot(
    data=validities,
    x="external_validity", hue="internal_validity",
    kind="kde",
    multiple="fill", clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
```

As a point plot, which shows the x-y correlation and (roughly) the spread per external validity:

```{python}
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"

sns.pointplot(
    data=validities,
    x="internal_validity", y="external_validity"
)
```

As a relation chart, which shows the internal-external relation and the deviation of the individual points from it:

```{python}
#| label: fig-validity-relation
#| fig-cap: "Relation between internal and external validity"
#| fig-height: 5
#| code-fold: true
from src.model import validity

validities = validity.calculate(by_intervention)
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]

#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype('category')

sns.pointplot(
    data=validities,
    x="internal_validity", y="external_validity",
)
```

```{python}
#| label: fig-validity-distribution
#| fig-cap: "Distribution of internal validities"
#| fig-height: 5
#| code-fold: true
fig, ax = plt.subplots()
# density version kept for reference, currently disabled:
#sns.displot(
#    data=validities,
#    x="external_validity", hue="internal_validity",
#    kind="kde",
#    multiple="fill", clip=(0, None),
#    palette="ch:rot=-0.5,hue=1.5,light=0.9",
#    bw_adjust=.65, cut=0,
#    warn_singular = False
#)
```

The following plots need at least one axis, preferably the external one, to be set to categorical.

As a heatmap plot for categorical data on the x and y axes:

```{python}
#| label: fig-validity-heatmap
sns.displot(
    data=validities, x="internal_validity", y="external_validity",
    hue="design",
    palette="ch:rot=-0.75,hue=1.5,light=0.9",
)
```

As a violin plot showing the distribution of external validity along each internal-validity category:

```{python}
sns.violinplot(
    data=validities,
    x="internal_validity", y="external_validity", hue="design",
    cut=0, bw_method="scott",
    orient="x"
)
# optional swarm plot showing the actual number of data points for each rank
sns.swarmplot(
    data=validities,
    x="internal_validity", y="external_validity",
    color="red", s=6
)
```
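To treat the external scores as discrete categories as well, one minimal sketch (assuming `validities`, filtered to the experimental and quasi-experimental designs above, is still in scope) is to cross-tabulate the two scores and draw the counts as an annotated heatmap, so both axes are effectively categorical:

```{python}
# sketch: count studies per (external, internal) score pair and draw the
# counts as an annotated heatmap; assumes `validities` from the chunks above
score_counts = pd.crosstab(
    validities["external_validity"], validities["internal_validity"]
)
ax = sns.heatmap(score_counts, annot=True, fmt="d", cmap="Blues")
ax.invert_yaxis()  # put low external validity at the bottom
plt.show()
```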