2024-02-13 15:12:08 +00:00
|
|
|
load data, boilerplate:
|
|
|
|
|
|
|
|
```{python}
|
2024-02-20 16:58:35 +00:00
|
|
|
#| label: load-data
#| echo: false
from pathlib import Path
import re

# Third-party analysis / plotting stack used throughout the document.
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

# Data directory layout, all relative to the project root.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"

from src import prep_data

# Library built from the raw database-search results.
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# Complete library of sampled (and working) literature.
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# Load the relevant studies and combine them with bibliographic metadata
# and the country-groupings spreadsheet.
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(
        Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")
    ),
)

# NOTE(review): these names are only reset to None here; presumably leftovers
# from an earlier version that held intermediate frames -- confirm before removing.
raw_observations = None
zot_df = None
df_country_groups = None
|
|
|
|
```
|
|
|
|
|
2024-02-20 16:58:35 +00:00
|
|
|
prep data:
|
2024-02-13 15:12:08 +00:00
|
|
|
|
|
|
|
Map a 0-5 external validity score based on 'representativeness' to rows:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
df = bib_df

# Keep only rows whose design is (quasi-)experimental; `assign` returns a new
# frame, so the column writes below do not touch `bib_df`.
vd = df[df["design"].isin(["quasi-experimental", "experimental"])].assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

# Substring masks on the representativeness description.
mask_subnational = vd["representativeness"].str.contains("subnational")
mask_national = vd["representativeness"].str.contains("national")
mask_regional = vd["representativeness"].str.contains("regional")
mask_local = vd["representativeness"].str.contains("local")

# Applied in this order on purpose: later assignments overwrite earlier ones.
# "national" also matches "subnational", so the subnational score has to be
# written after the national one.
for _mask, _score in (
    (mask_regional, 5),
    (mask_national, 4),
    (mask_subnational, 3),
    (mask_local, 2),
):
    vd.loc[_mask, "valid_ext"] = _score

vd[["representativeness", "valid_ext"]]
|
|
|
|
```
|
|
|
|
|
|
|
|
Map an internal validity score based on study design/method:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Start from a float column: the scores below include fractional values
# (4.5, 3.5), which must not be written into an integer column.
vd = vd.assign(valid_int=0.0)
vd["method"] = vd["method"].fillna("")

# (regex alternatives, score) pairs. Patterns are regular expressions, so "."
# matches any single separator character (space, hyphen, ...).
# Rules are applied top to bottom and later rules overwrite earlier ones, so a
# method string matching several rules ends up with the last matching score.
_method_scores = [
    (["RCT"], 5.0),
    (["RD", "regression.discontinuity"], 4.5),
    (["IV", "instrumental.variable"], 4.0),
    (["PSM", "propensity.score.matching"], 3.5),
    (["DM", "discontinuity.matching"], 3.0),
    (["DID", "difference.in.difference"], 3.0),
    (["OLS", "ordinary.least.square"], 2.0),
]
for _patterns, _score in _method_scores:
    vd.loc[vd["method"].str.contains("|".join(_patterns)), "valid_int"] = _score

vd[["method", "valid_int"]]
|
|
|
|
```
|
|
|
|
|
2024-02-20 16:58:35 +00:00
|
|
|
## visualize data:
|
|
|
|
|
|
|
|
Prep the by_intervention dataframe:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention

# One row per (study, intervention):
#   1. join all intervention strings of a study into one "; "-separated string,
#   2. strip parenthesised notes and split the string back into unique names,
#   3. explode so every intervention gets its own row.
by_intervention = (
    bib_df
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Each parenthesised note is removed on its own ("[^)]*"): a greedy
            # ".*" would swallow everything between the first "(" and the last
            # ")" of the joined string, including whole interventions.
            # Sorting the de-duplicated names keeps the exploded row order
            # stable between runs.
            lambda _cell: sorted({x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")})
        ),
    )
    .explode("intervention")
)
# Most frequent intervention first.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(data=by_intervention, x="intervention", order=sort_order)
# Rotate the tick labels around their right end so long names stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
|
|
```
|
|
|
|
|
|
|
|
then visualize:
|
|
|
|
|
|
|
|
Validities as a displot, with external validity as the categorical x and internal validity as the hue facet.
|
|
|
|
This nicely shows that studies with lower internal validity generally have higher external validity, and that there are two humps in external validity at 3 and 5
|
|
|
|
(subnational and census-based)
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
# Short label per row: the `author` text with everything from a comma to the
# end removed, followed by the year in parentheses.
validities["identifier"] = (
    validities["author"].str.replace(r',.*$', '', regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)

# As displot (KDE) to show the hue-faceted density: external validity on x,
# one filled band per internal-validity value.
sns.displot(
    data=validities,
    kind="kde",
    x="external_validity",
    hue="internal_validity",
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
|
|
|
|
```
|
|
|
|
As a point plot, which shows the correlation between internal (x) and external (y) validity and (roughly) the spread of external validity at each level of internal validity:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
# Short label per row: the `author` text with everything from a comma to the
# end removed, followed by the year in parentheses.
validities["identifier"] = (
    validities["author"].str.replace(r',.*$', '', regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)

# Point plot of external validity (y) for each internal-validity value (x).
sns.pointplot(
    data=validities,
    x="internal_validity",
    y="external_validity",
)
|
|
|
|
```
|