wow-inequalities/00-notebooks/rank_validities.qmd

162 lines
4.9 KiB
Text

load data, boilerplate:
```{python}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")

# Directory layout: every data stage lives in its own folder below ./02-data.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"

from src import prep_data

# Library built from the raw database-search results.
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# Library built from the complete sampled (and working) literature.
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# Relevant studies: extracted observations combined with the study metadata
# and the country-grouping spreadsheet.
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(
        SUPPLEMENTARY_DATA / "wb-country-groupings.xlsx"
    ),
)

# Reset these names to None (none of them is assigned anywhere else in this chunk).
raw_observations = zot_df = df_country_groups = None
```
prep data:
Map a 0-5 external validity score based on 'representativeness' to rows:
```{python}
df = bib_df

# Keep only (quasi-)experimental studies and start every row at a score of 0.
vd = df[df["design"].isin(["quasi-experimental", "experimental"])].assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

# Keyword -> score, applied in this exact order; a later match overwrites an
# earlier one. "national" is a substring of "subnational", so the subnational
# score has to be written after the national one.
for _keyword, _score in (
    ("regional", 5),
    ("national", 4),
    ("subnational", 3),
    ("local", 2),
):
    vd.loc[vd["representativeness"].str.contains(_keyword), "valid_ext"] = _score

vd[["representativeness", "valid_ext"]]
```
Map an internal validity score based on study design/method:
```{python}
# Internal validity score derived from the method(s) named in the `method` column.
# Start from a float zero: the scale uses half points (4.5, 3.5), and writing
# those into an integer column relies on a deprecated silent upcast in pandas.
vd = vd.assign(valid_int=0.0)
vd["method"] = vd["method"].fillna("")

# (regex alternatives, score) pairs, applied top to bottom. A later match
# overwrites an earlier one, so a row naming several methods ends up with the
# score of the last matching entry. The `.` in the long forms is a regex
# wildcard, so it matches a space, a hyphen or any other single separator.
_method_scores = [
    (["RCT"], 5.0),
    (["RD", "regression.discontinuity"], 4.5),
    (["IV", "instrumental.variable"], 4.0),
    (["PSM", "propensity.score.matching"], 3.5),
    (["DM", "discontinuity.matching"], 3.0),
    (["DID", "difference.in.difference"], 3.0),
    (["OLS", "ordinary.least.square"], 2.0),
]
for _patterns, _score in _method_scores:
    vd.loc[vd["method"].str.contains("|".join(_patterns)), "valid_int"] = _score

vd[["method", "valid_int"]]
```
## visualize data:
Prep the by_intervention dataframe:
```{python}
#| label: fig-intervention-types
#| fig-cap: Available studies by primary type of intervention
# One row per (study, intervention): collapse all observations of a study into
# a single "; "-joined intervention string, strip parenthesised details, split
# the string back into unique intervention names and explode them into rows.
by_intervention = (
    bib_df
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "region", "representativeness", "citation"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # `[^)]*` keeps each removal inside one pair of parentheses; a greedy
            # `.*` would delete everything between the first "(" and the last ")"
            # of the joined string, dropping the interventions in between.
            # Sorting the de-duplicated names makes the exploded row order
            # deterministic (exploding a set is not).
            lambda _cell: sorted(
                {x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")}
            )
        ),
    )
    .explode("intervention")
)

# Most frequent intervention first on the x-axis.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure(figsize=(6, 3))
ax = sns.countplot(data=by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```
Then visualize the validities as a `displot` (KDE), with external validity as the (categorical) x-axis and internal validity as the hue facet.
This nicely shows that studies with lower internal validity generally have higher external validity, and that there are two humps in external validity at 3 and 5
(subnational and census-based).
```{python}
#| label: fig-validity-density
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)

# Short label per study: the author field up to its first comma, followed by
# the year in parentheses.
_lead_author = validities["author"].str.replace(r',.*$', '', regex=True)
_year_text = validities["year"].astype(str)
validities["identifier"] = _lead_author + " (" + _year_text + ")"

# Filled KDE of external validity, split by internal validity via hue.
_kde_options = dict(
    data=validities,
    x="external_validity",
    hue="internal_validity",
    kind="kde",
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-0.5,hue=1.5,light=0.9",
)
sns.displot(**_kde_options)
```
The same data as a point plot, which shows the relationship between internal (x) and external (y) validity and, roughly, the spread of external validity at each level of internal validity:
```{python}
#| label: fig-validity-points
from src.model import validity
import seaborn.objects as so

validities = validity.calculate(by_intervention)

# Study label: the author field cut at its first comma, plus "(year)".
_author_head = validities["author"].str.replace(r',.*$', '', regex=True)
validities["identifier"] = _author_head + " (" + validities["year"].astype(str) + ")"

# Point estimate of external validity for each internal validity value.
sns.pointplot(data=validities, x="internal_validity", y="external_validity")
```