chore(code): Update explore experiments

This commit is contained in:
Marty Oehme 2023-12-21 17:01:51 +01:00
parent 259d95693f
commit 4c763fac0f
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -158,14 +158,114 @@ pd.crosstab(df_income["Intervention"], df_income["Inequality"])
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
def inequality_crosstab(df, inequality: str):
    """Cross-tabulate the interventions of studies that address *inequality*.

    Both the ``inequality`` and the ``intervention`` column of *df* hold
    ``;``-separated lists. Each study is expanded into one row per
    (inequality, intervention) pair, so every intervention stays attached
    to the study it came from.

    Args:
        df: Frame with string columns ``intervention`` and ``inequality``.
        inequality: The inequality label to keep, e.g. ``"income"``.

    Returns:
        A ``pd.crosstab`` of the cleaned ``Intervention`` values against the
        (single) selected ``Inequality`` value.
    """
    # Explode on the frame (not on a detached Series): assigning an exploded
    # Series back to the shorter frame would re-align by index label and
    # silently attach values to the wrong studies.
    per_inequality = (
        df[["intervention", "inequality"]]
        .assign(Inequality=lambda _df: _df["inequality"].str.split(";"))
        .explode("Inequality", ignore_index=True)
        .assign(Inequality=lambda _df: _df["Inequality"].str.strip())
    )
    selected = per_inequality.loc[per_inequality["Inequality"] == inequality]
    per_intervention = (
        selected
        .assign(Intervention=lambda _df: _df["intervention"].str.split(";"))
        .explode("Intervention", ignore_index=True)
        .assign(
            # drop parenthesised remarks such as "cash transfer (conditional)"
            Intervention=lambda _df: _df["Intervention"]
            .str.replace(r"\(.+\)", "", regex=True)
            .str.strip()
        )
    )
    return pd.crosstab(per_intervention["Intervention"], per_intervention["Inequality"])
# Long-format helper frame: one row per (study, inequality, intervention)
# combination. The list columns are exploded on the frame itself so that each
# split value stays on the row of the study it belongs to; exploding a detached
# Series and assigning it back would truncate and shift the values.
temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
temp_df["Inequality"] = temp_df["inequality"].str.split(";")
temp_df = temp_df.explode("Inequality", ignore_index=True)
temp_df["Inequality"] = temp_df["Inequality"].str.strip()
temp_df["Intervention"] = temp_df["intervention"].str.split(";")
temp_df = temp_df.explode("Intervention", ignore_index=True)
# remove parenthesised remarks from the intervention labels
temp_df["Intervention"] = (
    temp_df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
)
# NOTE(review): the return value of this call is not used here - confirm it is still needed
inequality_crosstab(df, "income")
# Per-inequality subsets of the long-format frame
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```
Prepare the full data set:
```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
# Use seaborn's "whitegrid" style for plots
sns.set_style("whitegrid")

# Data directory layout (paths are relative to the working directory)
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR / "raw"
WORKING_DATA = DATA_DIR / "intermediate"
PROCESSED_DATA = DATA_DIR / "processed"
SUPPLEMENTARY_DATA = DATA_DIR / "supplementary"
def _read_bib_files(directory):
    """Return the concatenated text of every ``.bib`` file below *directory*.

    Files are read in sorted path order so the result is reproducible, and
    are separated by a newline so an entry never runs into the next file.
    """
    return "\n".join(
        bib_file.read_text(encoding="utf-8")
        for bib_file in sorted(directory.glob("**/*.bib"))
    )


# Raw sample: all bibliography files found under the raw data directory
bib_string = _read_bib_files(RAW_DATA)
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# Working sample: all bibliography files found under the intermediate data directory
bib_string = _read_bib_files(WORKING_DATA)
bib_sample = bibtexparser.parse_string(bib_string)
```
```{python}
# load relevant studies
from src import data
# Zotero-based metadata (citation count, usage, keywords), indexed by DOI.
# A field that is missing from an entry is recorded as None; only the first
# entry per DOI is kept.
zot_df = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in ("doi", "times-cited", "usage", "keywords")
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .set_index("doi")
)
# World Bank country groupings (income group, world region), indexed by economy name
WB_COUNTRY_GROUPS_FILE = Path(SUPPLEMENTARY_DATA, "wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
df_country_groups = df_country_groups.set_index("Economy")
# Assemble the study table from the processed YAML files and enrich it with
# Zotero metadata (matched on DOI) and World Bank groupings (matched on country).
bib_df = (
    data.from_yml(f"{PROCESSED_DATA}/relevant")
    # the DOI is the part of the study URI that follows the doi.org host
    .assign(
        doi=lambda studies: studies["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
    )
    .assign(
        zot_cited=lambda studies: studies["doi"].map(zot_df["cited"]),
        zot_usage=lambda studies: studies["doi"].map(zot_df["usage"]),
        zot_keywords=lambda studies: studies["doi"].map(zot_df["keywords"]),
    )
    # parse the year into a timestamp, then store the year extracted from it
    .assign(date=lambda studies: pd.to_datetime(studies["year"], format="%Y"))
    .assign(year=lambda studies: studies["date"].dt.year)
    .assign(
        region=lambda studies: studies["country"].map(df_country_groups["Region"]),
        income_group=lambda studies: studies["country"].map(
            df_country_groups["Income group"]
        ),
    )
    .query("year >= 2000")
)
# the lookup tables are not needed beyond this point
zot_df = df_country_groups = None
```
```{python}
# World Bank groupings plus one synthetic "global" economy that maps to the
# region "World"; the combined table is indexed by economy name.
df_country_groups = pd.concat(
    [
        pd.read_excel(WB_COUNTRY_GROUPS_FILE),
        pd.DataFrame(
            {
                "Economy": ["global"],
                "Code": ["WLD"],
                "Region": ["World"],
                "Income group": [""],
                "Lending category": [""],
            }
        ),
    ]
).set_index("Economy")
def countries_to_regions(countries: str, country_groups=None) -> str:
    """Map a ``;``-separated list of countries to their distinct regions.

    Args:
        countries: Country names separated by ``;``; whitespace around each
            name is ignored. Non-string input (e.g. a missing value) yields
            an empty result.
        country_groups: Frame indexed by country name with a ``Region``
            column. Defaults to the module-level ``df_country_groups``.

    Returns:
        The distinct regions joined by ``;`` in order of first appearance.
        Countries that are not in the index are skipped; the result is ``""``
        if no country could be mapped.
    """
    if country_groups is None:
        country_groups = df_country_groups
    if not isinstance(countries, str):
        return ""
    # a dict keeps insertion order, so the output is reproducible (a set is not)
    regions = {}
    for country in countries.split(";"):
        country = country.strip()
        if country in country_groups.index:
            region = country_groups.at[country, "Region"]
            # skip economies whose region is missing so the join cannot fail
            if isinstance(region, str):
                regions[region] = None
    return ";".join(regions)
# Derive the region(s) of every study from its country list,
# e.g. countries_to_regions("India; Nicaragua").
# assign() rebinds bib_df to a new frame rather than writing a column into the
# filtered result of the earlier .query(), which can raise SettingWithCopyWarning.
bib_df = bib_df.assign(region=bib_df["country"].map(countries_to_regions))
```
```{python}
# One row per (study, region): split the ";"-separated region cell, drop
# duplicate regions within a study, then explode.
bib_df = (
    bib_df
    .assign(
        # dict.fromkeys de-duplicates while keeping the order of the cell;
        # exploding a set would produce rows in an arbitrary order
        region=lambda _df: _df["region"].apply(
            lambda _cell: list(dict.fromkeys(x.strip() for x in _cell.split(";")))
        ),
    )
    .explode("region")
)
# Number of observations per region, bars ordered by frequency (value_counts order)
region_order = bib_df["region"].value_counts().index
ax = sns.countplot(bib_df, x="region", order=region_order)
# rotate the x tick labels by 45 degrees around their right-hand anchor
plt.setp(
    ax.get_xticklabels(),
    rotation=45,
    ha="right",
    rotation_mode="anchor",
)
plt.show()
```