From d364cd51179ea8f296ef2ee62b96c6520b45738f Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 13 Feb 2024 16:12:08 +0100
Subject: [PATCH] feat(code): Add notebook to rank study validities

---
 00-notebooks/rank_validities.qmd | 128 +++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 00-notebooks/rank_validities.qmd

diff --git a/00-notebooks/rank_validities.qmd b/00-notebooks/rank_validities.qmd
new file mode 100644
index 0000000..38e1e5d
--- /dev/null
+++ b/00-notebooks/rank_validities.qmd
@@ -0,0 +1,128 @@
+load data, boilerplate:
+
+```{python}
+#| echo: false
+from pathlib import Path
+import re
+## standard imports
+from IPython.core.display import Markdown as md
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from tabulate import tabulate
+import bibtexparser
+
+sns.set_style("whitegrid")
+
+DATA_DIR=Path("./02-data")
+RAW_DATA=DATA_DIR.joinpath("raw")
+WORKING_DATA=DATA_DIR.joinpath("intermediate")
+PROCESSED_DATA=DATA_DIR.joinpath("processed")
+SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
+
+# concatenate all raw .bib files; f.read() keeps each file's own newlines
+bib_string=""
+for partial_bib in RAW_DATA.glob("**/*.bib"):
+    with open(partial_bib) as f:
+        bib_string += f.read() + "\n"
+bib_sample_raw_db = bibtexparser.parse_string(bib_string)
+
+bib_string=""
+for partial_bib in WORKING_DATA.glob("**/*.bib"):
+    with open(partial_bib) as f:
+        bib_string += f.read() + "\n"
+bib_sample = bibtexparser.parse_string(bib_string)
+```
+
+```{python}
+# load relevant studies
+from src import load_data
+
+# load zotero-based metadata: citations and uses
+zot_df = pd.DataFrame([
+    [
+        entry["doi"] if "doi" in entry.fields_dict else None,
+        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
+        entry["usage"] if "usage" in entry.fields_dict else None,
+        entry["keywords"] if "keywords" in entry.fields_dict else None,
+    ]
+    for entry in bib_sample.entries
+], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+
+# Add WB country grouping definitions (income group, world region)
+WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
+df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
+
+bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
+    .assign(
+        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
+        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
+        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
+        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
+        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year = lambda _df: _df["date"].dt.year,
+        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
+        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
+    )
+    .query("year >= 2000")
+)
+zot_df = None
+df_country_groups = None
+```
+
+```{python}
+df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+
+def countries_to_regions(countries:str) -> str:
+    res = set()
+    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
+        if c in df_country_groups.index:
+            region = df_country_groups.at[c,'Region']
+            res.add(region)
+    return ";".join(sorted(res))  # sorted: set iteration order is not stable across runs
+
+# countries_to_regions("India; Nicaragua")
+bib_df['region'] = bib_df['country'].map(countries_to_regions)
+bib_df['region'].value_counts().plot.bar()
+```
+
+prep data:
+
+Map a 0-5 external validity score based on 'representativeness' to rows:
+
+```{python}
+df=bib_df
+vd=df[(df['design'] == 'quasi-experimental') | (df['design'] == 'experimental')]
+
+vd = vd.assign(valid_ext=0)
+vd["representativeness"] = vd["representativeness"].fillna("")
+
+mask_subnational = vd['representativeness'].str.contains("subnational")
+mask_national = vd['representativeness'].str.contains("national")
+mask_regional = vd['representativeness'].str.contains("regional")
+mask_local = vd['representativeness'].str.contains("local")
+
+vd.loc[mask_regional, 'valid_ext'] = 5
+vd.loc[mask_national, 'valid_ext'] = 4  # "national" also matches "subnational"; overridden just below
+vd.loc[mask_subnational, 'valid_ext'] = 3
+vd.loc[mask_local, 'valid_ext'] = 2
+
+vd[['representativeness', 'valid_ext']]
+```
+
+Map an internal validity score based on study design/method:
+
+```{python}
+vd = vd.assign(valid_int=0)
+vd["method"] = vd["method"].fillna("")
+
+vd.loc[vd['method'].str.contains("RCT"), 'valid_int'] = 5.0
+vd.loc[vd['method'].str.contains("|".join(["RD","regression.discontinuity"])), 'valid_int'] = 4.5
+vd.loc[vd['method'].str.contains("|".join(["IV","instrumental.variable"])), 'valid_int'] = 4.0
+vd.loc[vd['method'].str.contains("|".join(["PSM","propensity.score.matching"])), 'valid_int'] = 3.5
+vd.loc[vd['method'].str.contains("|".join(["DM","discontinuity.matching"])), 'valid_int'] = 3.0
+vd.loc[vd['method'].str.contains("|".join(["DID","difference.in.difference"])), 'valid_int'] = 3.0
+vd.loc[vd['method'].str.contains("|".join(["OLS","ordinary.least.square"])), 'valid_int'] = 2.0
+vd[['method', 'valid_int']]
+```