129 lines
4.7 KiB
Text
129 lines
4.7 KiB
Text
|
Load the data and set up boilerplate:
|
||
|
|
||
|
```{python}
|
||
|
#| echo: false
|
||
|
from pathlib import Path
|
||
|
import re
|
||
|
## standard imports
|
||
|
from IPython.core.display import Markdown as md
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from matplotlib import pyplot as plt
|
||
|
import seaborn as sns
|
||
|
from tabulate import tabulate
|
||
|
import bibtexparser
|
||
|
|
||
|
# Select seaborn's "whitegrid" style for the figures in this document.
sns.set_style("whitegrid")
|
||
|
|
||
|
# Data directory layout: every sub-folder is derived from DATA_DIR.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"
|
||
|
|
||
|
def _read_bib_files(directory):
    """Return the text of every ``.bib`` file below *directory* as one string.

    Files are found recursively and read in sorted path order, so the order
    of the combined entries is the same on every run. Each file is read as
    UTF-8 and the files are separated by a newline, which keeps the last
    line of one file from running into the first line of the next when a
    file lacks a trailing newline.

    Args:
        directory: ``pathlib.Path`` of the folder to search.

    Returns:
        The concatenated file contents; an empty string if no file matches.
    """
    return "\n".join(
        bib_file.read_text(encoding="utf-8")
        for bib_file in sorted(directory.glob("**/*.bib"))
    )


# Bibliography parsed from the .bib files in the raw data directory.
bib_string = _read_bib_files(RAW_DATA)
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# Bibliography parsed from the .bib files in the intermediate data directory.
bib_string = _read_bib_files(WORKING_DATA)
bib_sample = bibtexparser.parse_string(bib_string)
|
||
|
```
|
||
|
|
||
|
```{python}
|
||
|
# load relevant studies
|
||
|
from src import load_data
|
||
|
|
||
|
# Load Zotero-based metadata (citation count, usage, keywords), indexed by DOI.
# Fields missing from a bibliography entry become None.
zot_df = (
    pd.DataFrame(
        [
            [
                entry[key] if key in entry.fields_dict else None
                for key in ("doi", "times-cited", "usage", "keywords")
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    # Entries without a DOI cannot be joined on DOI. Dropping them keeps a
    # null label out of the index, so a study without a DOI can never pick up
    # the metadata of an unrelated DOI-less entry.
    .dropna(subset=["doi"])
    .drop_duplicates("doi")
    .set_index("doi")
)
|
||
|
|
||
|
# World Bank country grouping definitions (income group, world region),
# read from the supplementary data folder and indexed by economy name.
WB_COUNTRY_GROUPS_FILE = SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
|
||
|
|
||
|
# Build the study table step by step; the steps are order-dependent because
# later columns are derived from earlier ones.
bib_df = load_data.from_yml(f"{PROCESSED_DATA}")

# The DOI is whatever follows the (dx.)doi.org host in the study URI.
bib_df = bib_df.assign(
    doi=bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
)

# Attach the Zotero metadata by looking each DOI up in zot_df.
bib_df = bib_df.assign(
    **{
        f"zot_{column}": bib_df["doi"].map(zot_df[column])
        for column in ("cited", "usage", "keywords")
    }
)

# Parse the year into a timestamp, then replace `year` with the year
# component of that timestamp.
bib_df = bib_df.assign(date=pd.to_datetime(bib_df["year"], format="%Y"))
bib_df = bib_df.assign(
    year=bib_df["date"].dt.year,
    region=bib_df["country"].map(df_country_groups["Region"]),
    income_group=bib_df["country"].map(df_country_groups["Income group"]),
)

# Keep studies from 2000 onwards only.
bib_df = bib_df[bib_df["year"] >= 2000]

# The lookup tables are no longer needed; drop the references to them.
zot_df = df_country_groups = None
|
||
|
```
|
||
|
|
||
|
```{python}
|
||
|
# Re-read the World Bank groupings and append a pseudo-economy "global" whose
# region is the ";"-joined list of regions. The previous list named
# "Europe & Central Asia" twice and left out "Middle East & North Africa".
# NOTE(review): confirm that the "Middle East & North Africa" label is spelled
# exactly as in the Region column of the spreadsheet.
_global_regions = [
    "Europe & Central Asia",
    "South Asia",
    "North America",
    "East Asia & Pacific",
    "Sub-Saharan Africa",
    "Middle East & North Africa",
    "Latin America & Caribbean",
]
_global_row = pd.DataFrame(
    data={
        "Economy": ["global"],
        "Code": ["WLD"],
        "Region": [";".join(_global_regions)],
        "Income group": [""],
        "Lending category": [""],
    }
)
df_country_groups = pd.concat(
    [pd.read_excel(WB_COUNTRY_GROUPS_FILE), _global_row]
).set_index("Economy")
|
||
|
|
||
|
def countries_to_regions(countries: str, groups: "pd.DataFrame | None" = None) -> str:
    """Translate a ";"-separated list of countries into their regions.

    Each country name is stripped of surrounding whitespace and looked up in
    the index of *groups*; names that are not in the index are ignored. The
    distinct regions found are returned in sorted order, so the same set of
    regions always yields the same string regardless of the order in which
    the countries were listed.

    Args:
        countries: Country names separated by ";". Any non-string value
            (for example a missing value) is treated as "no country".
        groups: Table indexed by country name with a "Region" column.
            Defaults to the module-level ``df_country_groups``.

    Returns:
        The distinct regions joined by ";", or an empty string if nothing
        could be resolved.
    """
    if groups is None:
        groups = df_country_groups
    if not isinstance(countries, str):
        return ""

    regions = set()
    for name in countries.split(";"):
        name = name.strip()
        if name not in groups.index:
            continue
        region = groups.at[name, "Region"]
        # Skip rows whose region is missing (not a string); they cannot be joined.
        if isinstance(region, str):
            regions.add(region)
    return ";".join(sorted(regions))
|
||
|
|
||
|
# countries_to_regions("India; Nicaragua")
|
||
|
# Overwrite the region column with the regions resolved from the
# (possibly multi-country) country field, then chart how often each occurs.
bib_df["region"] = bib_df["country"].apply(countries_to_regions)
bib_df["region"].value_counts().plot(kind="bar")
|
||
|
```
|
||
|
|
||
|
Prepare the data:
|
||
|
|
||
|
Map a 0-5 external validity score based on 'representativeness' to rows:
|
||
|
|
||
|
```{python}
|
||
|
df = bib_df

# Restrict to experimental and quasi-experimental designs. `assign` returns
# a new frame, so the writes below do not touch bib_df.
vd = df[df["design"].isin(["quasi-experimental", "experimental"])].assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

representativeness = vd["representativeness"]
mask_local = representativeness.str.contains("local")
mask_regional = representativeness.str.contains("regional")
mask_national = representativeness.str.contains("national")
mask_subnational = representativeness.str.contains("subnational")

# Scores are written in this order and a later write overrides an earlier
# one. "subnational" must follow "national" because every "subnational"
# value also matches the "national" mask.
for level_mask, score in (
    (mask_regional, 5),
    (mask_national, 4),
    (mask_subnational, 3),
    (mask_local, 2),
):
    vd.loc[level_mask, "valid_ext"] = score

vd[["representativeness", "valid_ext"]]
|
||
|
```
|
||
|
|
||
|
Map an internal validity score based on study design/method:
|
||
|
|
||
|
```{python}
|
||
|
# Start from a float column: the scores include halves (4.5, 3.5), and
# writing those into an integer column would force an implicit dtype change.
vd = vd.assign(valid_int=0.0)
vd["method"] = vd["method"].fillna("")

# (regex alternatives, score) pairs. They are applied top to bottom, so when
# a method string matches several patterns the last matching pair wins.
# "." in a pattern matches any single character (space, hyphen, ...).
_method_scores = [
    (["RCT"], 5.0),
    # Fixed typo: was "regression.vdiscontinuity", which could never match.
    (["RD", "regression.discontinuity"], 4.5),
    (["IV", "instrumental.variable"], 4.0),
    (["PSM", "propensity.score.matching"], 3.5),
    (["DM", "discontinuity.matching"], 3.0),
    (["DID", "difference.in.difference"], 3.0),
    (["OLS", "ordinary.least.square"], 2.0),
]
for _patterns, _score in _method_scores:
    vd.loc[vd["method"].str.contains("|".join(_patterns)), "valid_int"] = _score

vd[["method", "valid_int"]]
|
||
|
```
|
||
|
|