diff --git a/00-notebooks/explore.qmd b/00-notebooks/explore.qmd index f536cbb..85f2615 100644 --- a/00-notebooks/explore.qmd +++ b/00-notebooks/explore.qmd @@ -3,6 +3,8 @@ bibliography: 02-data/supplementary/lib.bib title: Grab yml --- +## Separate data acquisition + ```{python} import pandas as pd from src import data @@ -166,6 +168,8 @@ gender_df = temp_df.loc[temp_df["Inequality"] == "gender"] income_df = temp_df.loc[temp_df["Inequality"] == "income"] ``` +## Complete data replication from scoping + prep full data set: ```{python} @@ -221,7 +225,7 @@ zot_df = pd.DataFrame([ WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve() df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy") -bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant") +bib_df = (data.from_yml(f"{PROCESSED_DATA}") .assign( doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False), zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]), @@ -239,7 +243,7 @@ df_country_groups = None ``` ```{python} -df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy") +df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy") def countries_to_regions(countries:str): res = set() @@ -251,6 +255,7 @@ def countries_to_regions(countries:str): # countries_to_regions("India; Nicaragua") bib_df['region'] = bib_df['country'].map(countries_to_regions) +bib_df['region'].value_counts().plot.bar() ``` ```{python} @@ -269,3 +274,49 @@ plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") plt.show() ``` + +```{python} +df_inequality = (bib_df[["region", "intervention", "inequality"]] + .assign( + Intervention = lambda _df: (_df["intervention"] + .str.replace(r"\(.+\)", "", regex=True) + .str.replace(r" ?; ?", ";", regex=True) + .str.strip() + .str.split(";") + ), + Inequality = lambda _df: (_df["inequality"] + .str.replace(r"\(.+\)", "", regex=True) + .str.replace(r" ?; ?", ";", regex=True) + .str.strip() + .str.split(";") + ) + ) + .explode("Intervention") + .explode("Inequality") + .reset_index(drop=True) +) +``` + +```{python} +def crosstab_inequality(df, inequality:str, **kwargs): + df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")] + tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs) + return tab.drop(tab[tab[inequality] == 0].index) +``` + +## Gender inequality + +```{python} +#| label: tbl-gender-crosstab +#| tbl-cap: Interventions targeting gender inequality + +crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False) +``` + +```{python} +def region_vis_inequality(df, inequality:str): + df_temp = df.loc[(df["Inequality"] == inequality)] + return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index) +region_vis_inequality(df_inequality, "spatial") +``` +