load data, boilerplate:

```{python}
#| echo: false
from pathlib import Path
import re

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

# Project data layout: raw -> intermediate -> processed, plus supplementary lookups.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

# Concatenate all partial .bib files into one string and parse once.
# FIX: the previous `"\n".join(f.readlines())` doubled every newline (readlines
# keeps the trailing "\n") and, worse, appended the next file directly onto the
# last line of the previous one with no separator, which could corrupt the last
# entry of each file. `f.read() + "\n"` keeps entries intact and separated.
bib_string = ""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

bib_string = ""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import load_data

# load zotero-based metadata: citations and uses.
# One row per unique DOI; fields missing from an entry become None.
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# Main study dataframe: processed YAML records enriched with zotero metadata
# (joined on DOI extracted from the study URI) and WB groupings (joined on
# country name). Studies before 2000 are dropped.
bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
# Release the lookup tables; df_country_groups is rebuilt (with a 'global' row) below.
zot_df = None
df_country_groups = None
```

```{python}
# Rebuild the country-groupings lookup with an extra synthetic 'global' economy
# whose Region lists all seven WB regions.
# FIX: the original string listed "Europe & Central Asia" twice and omitted
# "Middle East & North Africa".
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Middle East & North Africa;Latin America & Caribbean'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")

def countries_to_regions(countries: str) -> str:
    """Map a semicolon-separated country string to the semicolon-joined set of
    WB regions those countries belong to. Countries not found in the lookup are
    silently skipped; duplicate regions collapse via the set."""
    res = set()
    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if c in df_country_groups.index:
            region = df_country_groups.at[c, 'Region']
            res.add(region)
    return ";".join(res)

# countries_to_regions("India; Nicaragua")
# na_action="ignore" keeps NaN countries as NaN instead of raising
# AttributeError inside countries_to_regions.
bib_df['region'] = bib_df['country'].map(countries_to_regions, na_action="ignore")
bib_df['region'].value_counts().plot.bar()
```

prep data:

Map a 0-5 external validity score based on 'representativeness' to rows:

```{python}
df = bib_df
# Restrict to causal designs only.
vd = df[(df['design'] == 'quasi-experimental') | (df['design'] == 'experimental')]
vd = vd.assign(valid_ext=0)
vd["representativeness"] = vd["representativeness"].fillna("")

# NOTE(review): "subnational" contains the substring "national", so
# mask_national also matches subnational rows; the later subnational
# assignment overrides them back to 3, so the net result is correct —
# but the assignment ORDER below is load-bearing.
mask_subnational = vd['representativeness'].str.contains("subnational")
mask_national = vd['representativeness'].str.contains("national")
mask_regional = vd['representativeness'].str.contains("regional")
mask_local = vd['representativeness'].str.contains("local")

# NOTE(review): ranking puts regional (5) above national (4) — confirm this
# is the intended ordering of external validity.
vd.loc[mask_regional, 'valid_ext'] = 5
vd.loc[mask_national, 'valid_ext'] = 4
vd.loc[mask_subnational, 'valid_ext'] = 3
vd.loc[mask_local, 'valid_ext'] = 2
vd[['representativeness', 'valid_ext']]
```

Map an internal validity score based on study design/method:

```{python}
vd = vd.assign(valid_int=0)
vd["method"] = vd["method"].fillna("")
vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
# Patterns use "." as a regex wildcard so "regression discontinuity",
# "regression-discontinuity" etc. all match.
# FIX: "regression.vdiscontinuity" was a paste typo (stray "v") for
# "regression.discontinuity" — spelled-out RD methods never matched it.
# NOTE(review): later assignments override earlier ones, so a method string
# matching several patterns gets the LAST (lowest) score, not the highest —
# confirm this is intended.
vd.loc[vd['method'].str.contains("|".join(["RD", "regression.discontinuity"])), 'valid_int'] = 4.5
vd.loc[vd['method'].str.contains("|".join(["IV", "instrumental.variable"])), 'valid_int'] = 4.0
vd.loc[vd['method'].str.contains("|".join(["PSM", "propensity.score.matching"])), 'valid_int'] = 3.5
vd.loc[vd['method'].str.contains("|".join(["DM", "discontinuity.matching"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["DID", "difference.in.difference"])), 'valid_int'] = 3.0
vd.loc[vd['method'].str.contains("|".join(["OLS", "ordinary.least.square"])), 'valid_int'] = 2.0
vd[['method', 'valid_int']]
```