From 50c4f5e8c07b653bde74e04f78d624f47c46b742 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Tue, 20 Feb 2024 17:58:55 +0100
Subject: [PATCH] feat(code): Add experiments for summary tables

---
 00-notebooks/main-findings.qmd | 149 +++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 00-notebooks/main-findings.qmd

diff --git a/00-notebooks/main-findings.qmd b/00-notebooks/main-findings.qmd
new file mode 100644
index 0000000..663c2b2
--- /dev/null
+++ b/00-notebooks/main-findings.qmd
@@ -0,0 +1,149 @@
load data, boilerplate:

```{python}
#| echo: false
from pathlib import Path
import re

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

# concatenate all raw bibtex files into a single sample database
bib_string = ""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# same for the intermediate (deduplicated/screened) bibtex files
bib_string = ""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import load_data

# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (
    load_data.from_yml(f"{PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
# free the intermediate lookup frames
zot_df = None
df_country_groups = None
```

```{python}
# re-load the country groups, adding a pseudo-economy 'global' that maps
# onto all seven WB regions
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        "Economy": ["global"],
        "Code": ["WLD"],
        "Region": ["East Asia & Pacific;Europe & Central Asia;Latin America & Caribbean;Middle East & North Africa;North America;South Asia;Sub-Saharan Africa"],
        "Income group": [""],
        "Lending category": [""],
    }),
]).set_index("Economy")

def countries_to_regions(countries: str) -> str:
    """Map a ';'-separated list of countries onto their set of WB regions."""
    res = set()
    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if c in df_country_groups.index:
            res.add(df_country_groups.at[c, "Region"])
    return ";".join(res)

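# Spot-check the mapping before applying it to the whole column. This is an
# illustrative example only; assuming the WB groupings file lists India under
# "South Asia" and Nicaragua under "Latin America & Caribbean", it should
# print those two regions (set-based, so the order may vary):
print(countries_to_regions("India; Nicaragua"))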
bib_df["region"] = bib_df["country"].map(countries_to_regions)
bib_df["region"].value_counts().plot.bar()
```

```{python}
#| label: fig-intervention-types
#| fig-cap: Predominant type of intervention

# one row per study and intervention: join the grouped intervention strings,
# strip parenthesized qualifiers, split on ';' and explode into single rows
by_intervention = (
    bib_df.groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")}
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
```

datavis:

```{python}
from src.model import validity
import math

# hand-coded institutional findings and the studies supporting them
findings_institutional = pd.read_csv("02-data/supplementary/findings-institutional.csv")

# per-study validity scores; missing scores default to 1.0
validities = validity.calculate(by_intervention)
valid_subset = (
    validities[["internal_validity", "external_validity", "citation"]]
    .fillna(1.0)
    .drop_duplicates(subset=["citation"])
    .sort_values("internal_validity")
)

def combined_validities(studies: str, column: str = "internal_validity"):
    """Sum the validity scores of all ';'-separated supporting studies."""
    if not isinstance(studies, str):
        return None
    combined = 0.0
    for study in studies.split(";"):
        new = valid_subset.loc[valid_subset["citation"] == study, column]
        if len(new) > 0 and not math.isnan(new.iat[0]):
            combined += new.iat[0]
    return combined

def combined_external(studies: str, column: str = "external_validity"):
    return combined_validities(studies, column)

findings_institutional["internal_validity"] = findings_institutional["studies"].apply(combined_validities)
findings_institutional["external_validity"] = findings_institutional["studies"].apply(combined_external)
findings_institutional[["area of policy", "internal_validity", "external_validity", "findings", "channels"]]
```
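The combined scores can also be used to rank the findings. A quick sketch (using only the columns created above) that sorts the table so findings backed by the most internally valid evidence come first:

```{python}
# rank findings by the combined internal validity of their evidence base
(
    findings_institutional[["area of policy", "internal_validity", "external_validity"]]
    .sort_values("internal_validity", ascending=False)
)
```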