chore(code): Update explore experiments
This commit is contained in:
parent
259d95693f
commit
4c763fac0f
1 changed files with 109 additions and 9 deletions
|
@ -158,14 +158,114 @@ pd.crosstab(df_income["Intervention"], df_income["Inequality"])
|
|||
#| label: tbl-income-crosstab
|
||||
#| tbl-cap: Interventions targeting income inequality
|
||||
|
||||
def inequality_crosstab(df, inequality: str):
    """Cross-tabulate interventions against one inequality dimension.

    Splits the ';'-separated 'inequality' and 'intervention' columns by
    exploding the dataframe itself so exploded values stay aligned with
    their source rows. (The previous version assigned the exploded Series
    back onto the un-exploded frame; because explode lengthens the Series
    and alignment is by index, rows were silently mis-matched whenever a
    cell held more than one value.) Rows are filtered to the requested
    *inequality* and parenthesised qualifiers are stripped from
    intervention names before tabulating.
    """
    temp_df = df.copy()
    temp_df["Inequality"] = temp_df["inequality"].str.split(";")
    temp_df = temp_df.explode("Inequality", ignore_index=True)
    temp_df["Inequality"] = temp_df["Inequality"].str.strip()
    temp_df = temp_df.loc[temp_df["Inequality"] == inequality].copy()
    temp_df["Intervention"] = temp_df["intervention"].str.split(";")
    temp_df = temp_df.explode("Intervention", ignore_index=True)
    temp_df["Intervention"] = (temp_df["Intervention"]
        .str.replace(r"\(.+\)", "", regex=True)
        .str.strip())
    return pd.crosstab(temp_df["Intervention"], temp_df["Inequality"])
|
||||
# Long-format lookup: one row per (intervention, inequality) pair.
# Explode the *dataframe* (not just the column) so values stay aligned
# with their source rows -- assigning an exploded Series back onto the
# original frame silently mis-aligns rows whenever a cell holds more
# than one ';'-separated value.
temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
temp_df["Inequality"] = temp_df["inequality"].str.split(";")
temp_df = temp_df.explode("Inequality", ignore_index=True)
temp_df["Inequality"] = temp_df["Inequality"].str.strip()
temp_df["Intervention"] = temp_df["intervention"].str.split(";")
temp_df = temp_df.explode("Intervention", ignore_index=True)
temp_df["Intervention"] = (temp_df["Intervention"]
    .str.replace(r"\(.+\)", "", regex=True)
    .str.strip())

inequality_crosstab(df, "income")

# per-dimension subsets of the exploded pairs
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
|
||||
```
|
||||
|
||||
Prepare the full data set:
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
## standard imports
from pathlib import Path
import re

from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

# Project data layout: raw inputs, intermediate (screened) data,
# processed (final) data, and supplementary lookup tables.
DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")


def _read_bib_library(directory: Path):
    """Concatenate every .bib fragment under *directory* and parse the result.

    Uses read_text() so file contents are passed through unchanged; the
    previous newline-join over f.readlines() doubled every newline because
    readlines() keeps line terminators.
    """
    bib_string = ""
    for partial_bib in directory.glob("**/*.bib"):
        bib_string += partial_bib.read_text() + "\n"
    return bibtexparser.parse_string(bib_string)


# full raw sample and the screened (intermediate) sample
bib_sample_raw_db = _read_bib_library(RAW_DATA)
bib_sample = _read_bib_library(WORKING_DATA)
|
||||
```
|
||||
|
||||
```{python}
|
||||
# load relevant studies
from src import data

# load zotero-based metadata: citations and uses
# One row per unique DOI from the parsed bib sample; entries that lack a
# given field contribute None for that column.
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# Load processed study metadata and enrich it:
#  - doi: extracted from the study's uri (handles http/https and 'dx.' host)
#  - zot_*: citation/usage/keyword metadata joined from zot_df via doi
#  - date/year: parsed from the year column; assign() evaluates keyword
#    arguments in order, so 'year' may rely on the 'date' created just above
#  - region / income_group: WB lookups keyed on the study's country
#    NOTE(review): .map assumes single-country values in 'country' --
#    multi-country strings ("A; B") will not match the lookup; confirm.
# Studies published before 2000 are dropped.
bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year = lambda _df: _df["date"].dt.year,
        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
# release the lookup frames; they are not needed past this cell
zot_df = None
df_country_groups = None
|
||||
```
|
||||
|
||||
```{python}
|
||||
# country lookup extended with a synthetic 'global' economy (region 'World')
# so world-spanning studies can still be mapped to a region
df_country_groups = pd.concat([
    pd.read_excel(WB_COUNTRY_GROUPS_FILE),
    pd.DataFrame(data={
        'Economy': ['global'],
        'Code': ['WLD'],
        'Region': ['World'],
        'Income group': [''],
        'Lending category': [''],
    }),
]).set_index("Economy")
|
||||
|
||||
def countries_to_regions(countries: str):
    """Map a ';'-separated country string to its distinct WB regions.

    Looks each country up in the module-level df_country_groups index;
    names not present there are skipped silently. Returns a ';'-joined
    string of the unique regions, sorted so the result is deterministic
    (joining a plain set produced arbitrary, run-to-run ordering).
    """
    regions = set()
    # normalise "; " / " ;" separators before splitting
    for country in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if country in df_country_groups.index:
            regions.add(df_country_groups.at[country, 'Region'])
    return ";".join(sorted(regions))
|
||||
|
||||
# e.g. countries_to_regions("India; Nicaragua") -> two WB regions joined by ';'
bib_df["region"] = bib_df["country"].map(countries_to_regions)
|
||||
```
|
||||
|
||||
```{python}
|
||||
# De-duplicate each study's multi-region string, then explode so every
# (study, region) pair becomes one row for counting.
bib_df = (bib_df
    .assign(
        region=lambda _df: _df["region"].apply(
            lambda _cell: {part.strip() for part in _cell.split(";")}
        ),
    )
    .explode("region")
)

# bar chart of study counts per region, most frequent region first
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
||||
```
|
||||
|
|
Loading…
Reference in a new issue