chore(notebooks): Update exploration nb

This commit is contained in:
Marty Oehme 2024-01-06 09:18:19 +01:00
parent 3deb30b5b3
commit 3bb3602f90
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -3,6 +3,8 @@ bibliography: 02-data/supplementary/lib.bib
title: Grab yml
---
## Separate data acquisition
```{python}
import pandas as pd
from src import data
@ -166,6 +168,8 @@ gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```
## Complete data replication from scoping
Prepare the full data set:
```{python}
@ -221,7 +225,7 @@ zot_df = pd.DataFrame([
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
bib_df = (data.from_yml(f"{PROCESSED_DATA}")
.assign(
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
@ -239,7 +243,7 @@ df_country_groups = None
```
```{python}
df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['World'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
# Append a synthetic "global" economy row (code WLD) to the World Bank country
# groupings and index the result by economy name.
# Its Region cell lists every World Bank region separated by ';' -- presumably
# split downstream by countries_to_regions; confirm against that helper.
# Fix: the list previously repeated "Europe & Central Asia" and left out
# "Middle East & North Africa", so "global" covered only six of seven regions.
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Middle East & North Africa;Latin America & Caribbean'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")
def countries_to_regions(countries:str):
res = set()
@ -251,6 +255,7 @@ def countries_to_regions(countries:str):
# countries_to_regions("India; Nicaragua")
# Derive a 'region' column by passing each 'country' value through
# countries_to_regions, then draw a bar chart of how often each region value occurs.
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```
```{python}
@ -269,3 +274,49 @@ plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
```
```{python}
def _split_terms(column):
    """Turn a ';'-separated string column into a column of term lists.

    Parenthesised annotations are removed from each value, whitespace around
    the ';' separators is dropped, and the cleaned string is split on ';'.
    """
    return (
        column
        # Non-greedy on purpose: a greedy ".+" spans from the first "(" to the
        # last ")" and would swallow every term between two annotations,
        # e.g. "a (x); b (y)" would collapse to just "a".
        .str.replace(r"\(.+?\)", "", regex=True)
        # Removing an annotation can leave several blanks before the
        # separator, so trim any amount of whitespace around it.
        .str.replace(r"\s*;\s*", ";", regex=True)
        .str.strip()
        .str.split(";")
    )


# Long-format table with one row per (study, intervention, inequality)
# combination; the original lower-case source columns are kept alongside the
# cleaned, exploded 'Intervention' and 'Inequality' columns.
df_inequality = (
    bib_df[["region", "intervention", "inequality"]]
    .assign(
        Intervention=lambda _df: _split_terms(_df["intervention"]),
        Inequality=lambda _df: _split_terms(_df["inequality"]),
    )
    .explode("Intervention")
    .explode("Inequality")
    .reset_index(drop=True)
)
```
```{python}
def crosstab_inequality(df, inequality: str, **kwargs):
    """Cross-tabulate interventions against one inequality and "income".

    Args:
        df: Frame with an "Intervention" and an "Inequality" column.
        inequality: Inequality value to tabulate next to "income".
        **kwargs: Passed through unchanged to ``pd.crosstab``.

    Returns:
        The crosstab restricted to interventions whose count (or share) for
        ``inequality`` is non-zero. If no row carries ``inequality`` at all,
        an empty table that still has an ``inequality`` column is returned
        instead of raising ``KeyError``.
    """
    selected = df.loc[df["Inequality"].isin([inequality, "income"])]
    tab = pd.crosstab(selected["Intervention"], selected["Inequality"], **kwargs)
    if inequality not in tab.columns:
        # No observation of this inequality: equivalent to an all-zero column,
        # which means every intervention row is filtered out below.
        tab[inequality] = 0
    return tab.loc[tab[inequality] != 0]
```
## Gender inequality
```{python}
#| label: tbl-gender-crosstab
#| tbl-cap: Interventions targeting gender inequality
# Raw counts, ordered so the interventions with the most "gender" entries come first.
(
    crosstab_inequality(df_inequality, "gender", normalize=False)
    .sort_values(by="gender", ascending=False)
)
```
```{python}
def region_vis_inequality(df, inequality: str):
    """Draw a count plot of regions for the rows matching one inequality.

    Rows whose "Inequality" equals ``inequality`` are kept, and the bars are
    ordered by how often each "region" value occurs among them. Returns
    whatever ``sns.countplot`` returns.
    """
    matching = df[df["Inequality"] == inequality]
    by_frequency = matching["region"].value_counts().index
    return sns.countplot(matching, x="region", order=by_frequency)


region_vis_inequality(df_inequality, "spatial")
```