chore(notebooks): Update exploration nb

2024-01-06 09:18:19 +01:00 · 2024-01-06 09:18:19 +01:00 · 3bb3602f90
commit 3bb3602f90
parent 3deb30b5b3
1 changed files with 53 additions and 2 deletions
--- a/00-notebooks/explore.qmd
+++ b/00-notebooks/explore.qmd
@ -3,6 +3,8 @@ bibliography: 02-data/supplementary/lib.bib
 title: Grab yml
 ---

+## Separate data acquisition
+
 ```{python}
 import pandas as pd
 from src import data
@ -166,6 +168,8 @@ gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
 income_df = temp_df.loc[temp_df["Inequality"] == "income"]
 ```

+## Complete data replication from scoping
+
 prep full data set:

 ```{python}
@ -221,7 +225,7 @@ zot_df = pd.DataFrame([
 WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
 df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

-bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
+bib_df = (data.from_yml(f"{PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
@ -239,7 +243,7 @@ df_country_groups = None
 ```

 ```{python}
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")

 def countries_to_regions(countries:str):
    res = set()
@ -251,6 +255,7 @@ def countries_to_regions(countries:str):

 # countries_to_regions("India; Nicaragua")
 bib_df['region'] = bib_df['country'].map(countries_to_regions)
+bib_df["region"].value_counts().plot(kind="bar")  # bar chart of how often each region value occurs
 ```

 ```{python}
@ -269,3 +274,49 @@ plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
 plt.show()
 ```
+
+```{python}
+df_inequality = (bib_df[["region", "intervention", "inequality"]]
+    .assign(
+        Intervention = lambda _df: (_df["intervention"]
+            .str.replace(r"\(.+\)", "", regex=True)
+            .str.replace(r" ?; ?", ";", regex=True)
+            .str.strip()
+            .str.split(";")
+        ),
+        Inequality = lambda _df: (_df["inequality"]
+            .str.replace(r"\(.+\)", "", regex=True)
+            .str.replace(r" ?; ?", ";", regex=True)
+            .str.strip()
+            .str.split(";")
+        )
+    )
+    .explode("Intervention")
+    .explode("Inequality")
+    .reset_index(drop=True)
+)
+```
+
+```{python}
+def crosstab_inequality(df, inequality:str, **kwargs):
+    df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")]
+    tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs)
+    return tab.drop(tab[tab[inequality] == 0].index)
+```
+
+## Gender inequality
+
+```{python}
+#| label: tbl-gender-crosstab
+#| tbl-cap: Interventions targeting gender inequality
+
+crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)  # un-normalised table, sorted by the "gender" column in descending order
+```
+
+```{python}
+def region_vis_inequality(df, inequality:str):
+    df_temp = df.loc[(df["Inequality"] == inequality)]
+    return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index)
+region_vis_inequality(df_inequality, "spatial")
+```
+