From 4c763fac0f1437cd5981a165ba7ecf4866adb1e3 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Thu, 21 Dec 2023 17:01:51 +0100
Subject: [PATCH] chore(code): Update explore experiments

---
 00-notebooks/explore.qmd | 150 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 141 insertions(+), 9 deletions(-)

diff --git a/00-notebooks/explore.qmd b/00-notebooks/explore.qmd
index b2c9aa6..f536cbb 100644
--- a/00-notebooks/explore.qmd
+++ b/00-notebooks/explore.qmd
@@ -158,14 +158,146 @@ pd.crosstab(df_income["Intervention"], df_income["Inequality"])
 #| label: tbl-income-crosstab
 #| tbl-cap: Interventions targeting income inequality
 
-def inequality_crosstab(df, inequality:str):
-    temp_df = df.copy()
-    temp_df['Inequality'] = temp_df['inequality'].str.split(";").explode(ignore_index=True).str.strip()
-    temp_df = temp_df.loc[temp_df['Inequality'] == inequality].copy()
-    temp_df['Intervention'] = temp_df['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()
-    tab = pd.crosstab(temp_df["Intervention"], temp_df["Inequality"])
-    temp_df=None
-    return tab
+# Build one row per (intervention, inequality) pair of every study. The frame
+# itself is exploded so each token stays attached to its source row; assigning
+# an exploded Series back to a column aligns on the index, which truncates the
+# longer Series and shifts values onto the wrong rows.
+temp_df = (
+    df[["intervention", "inequality"]]
+    .assign(
+        Inequality=lambda _df: _df["inequality"].str.split(";"),
+        Intervention=lambda _df: _df["intervention"].str.split(";"),
+    )
+    .explode("Inequality", ignore_index=True)
+    .explode("Intervention", ignore_index=True)
+    .assign(
+        Inequality=lambda _df: _df["Inequality"].str.strip(),
+        # strip parenthesised qualifiers from the intervention label
+        Intervention=lambda _df: _df["Intervention"]
+        .str.replace(r"\(.+\)", "", regex=True)
+        .str.strip(),
+    )
+)
 
-inequality_crosstab(df, "income")
+gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
+income_df = temp_df.loc[temp_df["Inequality"] == "income"]
+```
+
+Prepare the full data set:
+
+```{python}
+#| echo: false
+from pathlib import Path
+import re
+## standard imports
+from IPython.core.display import Markdown as md
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from tabulate import tabulate
+import bibtexparser
+
+sns.set_style("whitegrid")
+
+DATA_DIR = Path("./02-data")
+RAW_DATA = DATA_DIR.joinpath("raw")
+WORKING_DATA = DATA_DIR.joinpath("intermediate")
+PROCESSED_DATA = DATA_DIR.joinpath("processed")
+SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")
+
+
+def _load_bib_library(bib_dir: Path):
+    """Parse every ``*.bib`` file below *bib_dir* into a single library.
+
+    Whole files are joined with a newline so entries from different files
+    never run together. Joining ``readlines()`` output with a newline would
+    double every line break, since each line already ends with one. Files are
+    read in sorted order so the result does not depend on directory listing order.
+    """
+    bib_string = "\n".join(
+        partial_bib.read_text() for partial_bib in sorted(bib_dir.glob("**/*.bib"))
+    )
+    return bibtexparser.parse_string(bib_string)
+
+
+bib_sample_raw_db = _load_bib_library(RAW_DATA)
+bib_sample = _load_bib_library(WORKING_DATA)
+```
+
+```{python}
+# load relevant studies
+from src import data
+
+# load zotero-based metadata: citations and uses
+zot_df = pd.DataFrame([
+    [
+        entry["doi"] if "doi" in entry.fields_dict else None,
+        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
+        entry["usage"] if "usage" in entry.fields_dict else None,
+        entry["keywords"] if "keywords" in entry.fields_dict else None,
+    ]
+    for entry in bib_sample.entries
+], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+
+# Add WB country grouping definitions (income group, world region)
+WB_COUNTRY_GROUPS_FILE = SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx").resolve()
+df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
+
+bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
+    .assign(
+        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
+        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
+        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
+        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
+        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year = lambda _df: _df["date"].dt.year,
+        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
+        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
+    )
+    .query("year >= 2000")
+)
+zot_df = None
+df_country_groups = None
+```
+
+```{python}
+df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+
+def countries_to_regions(countries: str) -> str:
+    """Map a ``;``-separated list of countries to their World Bank regions.
+
+    Names are looked up in the index of the module-level ``df_country_groups``;
+    unknown names are skipped. Missing (non-string) input yields ``""``. The
+    result is a ``;``-joined, de-duplicated and sorted list of regions, e.g.
+    ``countries_to_regions("India; Nicaragua")``.
+    """
+    if not isinstance(countries, str):
+        # NaN cells would otherwise raise AttributeError on ``.split``
+        return ""
+    regions = {
+        df_country_groups.at[country, "Region"]
+        for country in (token.strip() for token in countries.split(";"))
+        if country in df_country_groups.index
+    }
+    # sort so the joined string does not depend on set iteration order
+    return ";".join(sorted(regions))
+
+bib_df['region'] = bib_df['country'].map(countries_to_regions)
+```
+
+```{python}
+bib_df = (bib_df
+    .assign(
+        # create de-duplicated joins for all observations
+        region=lambda _df: _df["region"].apply(
+            lambda _cell: sorted({x.strip() for x in _cell.split(";")})
+        ),
+    )
+    .explode("region")
+)
+ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
 ```