2024-02-13 15:12:08 +00:00
|
|
|
load data, boilerplate:
|
|
|
|
|
|
|
|
```{python}
|
2024-02-20 16:58:35 +00:00
|
|
|
#| label: load-data
|
2024-02-13 15:12:08 +00:00
|
|
|
#| echo: false
|
|
|
|
from pathlib import Path
|
|
|
|
import re
|
|
|
|
## standard imports
|
|
|
|
from IPython.core.display import Markdown as md
|
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
|
|
|
from matplotlib import pyplot as plt
|
|
|
|
import seaborn as sns
|
|
|
|
from tabulate import tabulate
|
|
|
|
import bibtexparser
|
|
|
|
|
|
|
|
sns.set_style("whitegrid")

# Directory layout: every data stage lives in its own folder below ./data.
DATA_DIR = Path("./data")
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

# Combine the observations read from PROCESSED_DATA with the bibliographic
# metadata of the sample and the country-grouping spreadsheet.
bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    # Join with pathlib instead of formatting the directory into a string
    # and re-parsing it; the resulting path is the same.
    country_groups=prep_data.country_groups_df(
        SUPPLEMENTARY_DATA / "wb-country-groupings.xlsx"
    ),
)
# NOTE(review): these names are bound to None here and are not assigned
# anywhere else in the visible part of the document -- presumably leftovers
# from an earlier version of this cell; confirm before removing.
raw_observations = None
zot_df = None
df_country_groups = None
|
|
|
|
```
|
|
|
|
|
2024-02-20 16:58:35 +00:00
|
|
|
prep data:
|
2024-02-13 15:12:08 +00:00
|
|
|
|
|
|
|
Map a 0-5 external validity score based on 'representativeness' to rows:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
df = bib_df
# Restrict to (quasi-)experimental designs and start every row at score 0.
vd = df[df["design"].isin(["quasi-experimental", "experimental"])].assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

mask_subnational = vd["representativeness"].str.contains("subnational")
mask_national = vd["representativeness"].str.contains("national")
mask_regional = vd["representativeness"].str.contains("regional")
mask_local = vd["representativeness"].str.contains("local")

# Scores are applied in this order and a later assignment overwrites an
# earlier one. The order matters: "national" is a substring of
# "subnational", so subnational rows first receive 4 and are then corrected
# to 3 by the following step. Rows matching no keyword keep the score 0.
for _mask, _score in (
    (mask_regional, 5),
    (mask_national, 4),
    (mask_subnational, 3),
    (mask_local, 2),
):
    vd.loc[_mask, "valid_ext"] = _score

vd[["representativeness", "valid_ext"]]
|
|
|
|
```
|
|
|
|
|
|
|
|
Map an internal validity score based on study design/method:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Start from a float column: the scores below include half steps (4.5, 3.5),
# and writing those into an integer column relies on silent upcasting, which
# recent pandas versions deprecate.
vd = vd.assign(valid_int=0.0)
vd["method"] = vd["method"].fillna("")

# (regex, score) pairs, applied top to bottom; when a method string matches
# several patterns the last matching entry wins. Each pattern accepts either
# the abbreviation or the spelled-out name, where "." stands for any single
# separator character.
_method_scores = [
    ("|".join(["RCT"]), 5.0),
    # Fixed: the pattern previously read "regression.vdiscontinuity", whose
    # stray "v" kept the spelled-out name from ever matching.
    ("|".join(["RD", "regression.discontinuity"]), 4.5),
    ("|".join(["IV", "instrumental.variable"]), 4.0),
    ("|".join(["PSM", "propensity.score.matching"]), 3.5),
    ("|".join(["DM", "discontinuity.matching"]), 3.0),
    ("|".join(["DID", "difference.in.difference"]), 3.0),
    ("|".join(["OLS", "ordinary.least.square"]), 2.0),
]
for _pattern, _score in _method_scores:
    vd.loc[vd["method"].str.contains(_pattern), "valid_int"] = _score

vd[["method", "valid_int"]]
|
|
|
|
```
|
|
|
|
|
2024-02-20 16:58:35 +00:00
|
|
|
## visualize data:
|
|
|
|
|
|
|
|
Prep the by_intervention dataframe:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention

# Build one row per (study, intervention type):
#   1. group the observations by the study-level columns and join all of a
#      study's intervention strings with "; ",
#   2. strip parenthesised remarks, split on ";" and de-duplicate via a set,
#   3. explode the set so each intervention type gets its own row.
by_intervention = (
    bib_df
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Each "(...)" group is removed on its own. The previous greedy
            # pattern r"\(.*\)" matched from the first "(" to the last ")"
            # of the joined string and thereby dropped every intervention
            # listed between two parenthesised remarks.
            lambda _cell: {x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")}
        ),
    )
    .explode("intervention")
)
# Most frequent intervention first; reused as the bar order below.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
|
|
```
|
|
|
|
|
|
|
|
then visualize:
|
|
|
|
|
|
|
|
Validities as a displot, with external validity on the (categorical) x-axis and internal validity as the hue facet.
|
|
|
|
It nicely shows that studies with lower internal validity generally have higher external validity, and that there are two humps in external validity at 3 and 5
|
|
|
|
(subnational and census-based)
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
# Short label per study: the author string up to its first comma, followed
# by the year in parentheses.
_author_head = validities["author"].str.replace(r",.*$", "", regex=True)
_year_text = validities["year"].astype(str)
validities["identifier"] = _author_head + " (" + _year_text + ")"

# As displot to show hue-faceted density: filled KDE of external validity,
# one layer per internal-validity value.
sns.displot(
    data=validities,
    kind="kde",
    x="external_validity",
    hue="internal_validity",
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
|
|
|
|
```
|
|
|
|
As a point-plot which shows the x and y correlation and the spread (roughly) per external validity
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)
# Short label per study: the author string up to its first comma, followed
# by the year in parentheses.
_author_head = validities["author"].str.replace(r",.*$", "", regex=True)
_year_text = validities["year"].astype(str)
validities["identifier"] = _author_head + " (" + _year_text + ")"

# Point plot of external validity (y) against internal validity (x).
sns.pointplot(
    data=validities,
    x="internal_validity",
    y="external_validity",
)
|
|
|
|
```
|
2024-02-21 10:28:51 +00:00
|
|
|
|
|
|
|
As a relation-chart which shows the internal-external relation and the deviation from individual points.
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-relation
#| fig-cap: "Relation between internal and external validity"
#| fig-height: 5
#| code-fold: true

from src.model import validity

validities = validity.calculate(by_intervention)
# Short label per study: the author string up to its first comma, followed
# by the year in parentheses.
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
# Keep only (quasi-)experimental designs. The explicit .copy() turns the
# subset into an independent frame, so the column assignment below does not
# write into a slice of the unfiltered frame (pandas SettingWithCopyWarning).
validities = validities.loc[
    validities["design"].isin(["quasi-experimental", "experimental"])
].copy()
#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype('category')

sns.pointplot(
    data=validities,
    x="internal_validity", y="external_validity",
)
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-distribution
|
|
|
|
#| fig-cap: "Distribution of internal validities"
|
|
|
|
#| fig-height: 5
|
|
|
|
#| code-fold: true
|
|
|
|
|
|
|
|
# Create a figure/axes pair. This is the only live statement in the cell:
# the displot call below is commented out, so nothing is plotted here.
fig, ax = plt.subplots()

# Disabled draft of the filled-KDE displot (external validity on x, hue by
# internal validity).
#sns.displot(
#    data=validities,
#    x="external_validity", hue="internal_validity",
#    kind="kde",
#    multiple="fill", clip=(0, None),
#    palette="ch:rot=-0.5,hue=1.5,light=0.9",
#    bw_adjust=.65, cut=0,
#    warn_singular = False
#)
|
|
|
|
```
|
|
|
|
The following plots need at least one axis, preferably the external one, to be set to categorical.
|
|
|
|
|
|
|
|
As a heatmap plot for categorical data between x-y:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-validity-heatmap
# The label was renamed from "fig-validity-distribution", which is already
# used by an earlier cell in this document; labels must be unique.

# Bivariate displot: internal validity on x, external validity on y,
# coloured by study design.
sns.displot(
    data=validities,
    x="internal_validity", y="external_validity", hue="design",
    palette="ch:rot=-0.75,hue=1.5,light=0.9",
)
|
|
|
|
```
|
|
|
|
|
|
|
|
As a violin plot showing distribution of external along internal category:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Violin per internal-validity value showing how external validity is
# distributed, split by study design.
sns.violinplot(
    data=validities,
    x="internal_validity",
    y="external_validity",
    hue="design",
    orient="x",
    bw_method="scott",
    cut=0,
)
# optional swarmplot showing the actual amount of data points for each rank
sns.swarmplot(
    data=validities,
    x="internal_validity",
    y="external_validity",
    s=6,
    color="red",
)
|
|
|
|
```
|
|
|
|
|