wow-inequalities/notebooks/explore.qmd
Marty Oehme 4f9acd0816
chore(repo): Move references to reference data dir
Begin restructuring data dir by separating out references into their own
data sub-dir containing only references and bibtex files.
2024-07-16 15:59:41 +02:00

317 lines
9.6 KiB
Text

---
bibliography: data/intermediate/zotero-library.bib
title: Grab yml
---
## Separate data acquisition
```{python}
import pandas as pd
from src import load_data
# Load the extracted study data (from_yml is called with its defaults);
# all exploratory cells in this first section start from this frame.
df = load_data.from_yml()
```
Get interventions:
```{python}
# Split the ";"-separated intervention cells into one entry per row, trim
# surrounding whitespace and count how often each label occurs.
df['intervention'].str.split(";").explode().str.strip().value_counts()
```
Get inequalities:
```{python}
# Split the ";"-separated inequality cells into one entry per row, trim
# surrounding whitespace and count how often each label occurs.
df['inequality'].str.split(";").explode().str.strip().value_counts()
```
```{python}
# Take the first observation of every (author, year, title) group and try to
# attach the intervention column of the full frame again.
# NOTE(review): the grouped frame still carries its own 'intervention' column
# and is indexed by study rather than by row number, so this join looks like
# it will raise on the overlapping column name (or match nothing) — confirm
# the intent before relying on this cell.
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```
Unique values in chain method:
```{python}
# Count in how many studies each inequality label occurs.
# The observations of one study are joined into a single ";"-separated string
# and then split into a set of unique labels *before* exploding: exploding the
# joined string itself is a no-op and would count whole "a; b" strings.
# No drop_duplicates() here: the study keys live in the index, which
# drop_duplicates() ignores, so it would merge distinct studies that merely
# share the same labels.
(
    df.groupby(["author", "year", "title"])
    .agg({"inequality": lambda _col: "; ".join(_col)})
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
    )
    .explode("inequality")["inequality"]
    .value_counts()
)
```
Merge the dataset so that it is organised by *STUDY* rather than by *OBSERVATION*.
Any further required columns can be computed in the same way as in the `agg` call here.
```{python}
# Collapse the observation-level frame into one row per study.
# The text columns are concatenated with "; " across the rows of a study;
# intervention and inequality are afterwards reduced to sets of unique labels.
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            # "region" and "income_group" could be joined the same way, but
            # they are only accessible after merging with the WB data.
            column: (lambda _col: "; ".join(_col))
            for column in ("intervention", "inequality", "date", "findings")
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        **{
            column: (
                lambda _df, _name=column: _df[_name].apply(
                    lambda _cell: {part.strip() for part in _cell.split(";")}
                )
            )
            for column in ("intervention", "inequality")
        }
    )
)
```
```{python}
# Alternative per-study frame: keep only the first observation of every
# (author, year, title) group, then turn its intervention and inequality
# cells into sets of unique, whitespace-trimmed labels.
by_study = (
    df.groupby(["author", "year", "title"]).first().reset_index().drop_duplicates()
)
by_study = by_study.assign(
    intervention=by_study["intervention"].apply(
        lambda _cell: {label.strip() for label in _cell.split(";")}
    ),
    inequality=by_study["inequality"].apply(
        lambda _cell: {label.strip() for label in _cell.split(";")}
    ),
)
```
```{python}
import re
from matplotlib import pyplot as plt
import seaborn as sns
# Count, per study, which interventions were analysed and plot the frequencies.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Remove every parenthesised qualifier on its own. A greedy
            # r"\(.*\)" on the joined cell would swallow everything between
            # the first "(" and the last ")", i.e. whole interventions.
            lambda _cell: {
                x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")
            }
        ),
    )
    .explode("intervention")
)
# value_counts() sorts descending, so bars run from most to least frequent
sort_order = by_intervention["intervention"].value_counts().index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
by_intervention = None  # drop the reference to the helper frame
```
```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year
# Reduce to one row per study so every publication is counted exactly once.
df_study_years = df.groupby(["author", "year", "title"]).first()
df_study_years = df_study_years.reset_index().drop_duplicates()

# plot by year
# TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(data=df_study_years, x="year", native_scale=True)
ax.tick_params(axis="x", rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None  # drop the reference to the helper frame
```
```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
# Cross-tabulate the interventions of all observations targeting "income".
# Both ";"-separated columns are exploded on the frame itself so that every
# intervention stays paired with the inequalities of its own observation.
# Assigning an exploded Series with a fresh 0..n index back as a column would
# align on that index and pair values from unrelated rows.
df_income = (
    df.assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality")
    .explode("Intervention")
    .reset_index(drop=True)
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised qualifiers, e.g. "label (detail)" -> "label"
        Intervention=lambda _df: (
            _df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
        ),
    )
    .loc[lambda _df: _df["Inequality"] == "income"]
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```
```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
# NOTE(review): this cell reuses the `tbl-income-crosstab` label of the cell
# above and does not render a table itself — consider renaming or removing
# the label.
# Long-format helper with one row per (intervention, inequality) pair of an
# observation. The columns are exploded on the frame itself; assigning an
# exploded Series with a fresh index back as a column would mis-align rows.
temp_df = (
    df[["intervention", "inequality"]]
    .assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality")
    .explode("Intervention")
    .reset_index(drop=True)
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised qualifiers, e.g. "label (detail)" -> "label"
        Intervention=lambda _df: (
            _df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
        ),
    )
)
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```
## Complete data replication from scoping
Prepare the full data set:
```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
import src.globals as g
sns.set_style("whitegrid")

# Raw sample: every .bib file found below the reference data directory,
# concatenated into one string and parsed as a single library.
bib_string = "\n".join(
    partial_bib.read_text() for partial_bib in g.REFERENCE_DATA.glob("**/*.bib")
)
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# Working sample: only the Zotero library export. joinpath() returns a single
# path (not an iterable of paths), so the file is read directly.
bib_string = g.REFERENCE_DATA.joinpath("zotero-library.bib").read_text()
bib_sample = bibtexparser.parse_string(bib_string)
```
```{python}
# load relevant studies
from src import load_data
# load zotero-based metadata: citations and uses
zot_df = (
    pd.DataFrame(
        [
            [
                entry["doi"] if "doi" in entry.fields_dict else None,
                entry["times-cited"] if "times-cited" in entry.fields_dict else None,
                entry["usage"] if "usage" in entry.fields_dict else None,
                entry["keywords"] if "keywords" in entry.fields_dict else None,
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    # Entries without a DOI can never be looked up by DOI; dropping them keeps
    # a null key out of the index so it cannot be paired with studies that
    # have no DOI either.
    .dropna(subset=["doi"])
    .drop_duplicates("doi")
    .set_index("doi")
)

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(
    f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx"
).resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# Enrich the extracted studies; assign() evaluates its keywords in order, so
# later entries may use columns created by earlier ones (doi -> zot_*,
# date -> year).
bib_df = (
    load_data.from_yml(f"{g.PROCESSED_DATA}")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(
            df_country_groups["Income group"]
        ),
    )
    .query("year >= 2000")
)

# drop the references to the lookup tables that are no longer needed
zot_df = None
df_country_groups = None
```
```{python}
# WB country groupings extended by a synthetic "global" economy that belongs
# to every WB region.
df_country_groups = pd.concat(
    [
        pd.read_excel(WB_COUNTRY_GROUPS_FILE),
        pd.DataFrame(
            data={
                "Economy": ["global"],
                "Code": ["WLD"],
                # each of the seven WB regions exactly once; the spelling has
                # to match the "Region" column of the WB groupings file
                "Region": [
                    "East Asia & Pacific;Europe & Central Asia;"
                    "Latin America & Caribbean;Middle East & North Africa;"
                    "North America;South Asia;Sub-Saharan Africa"
                ],
                "Income group": [""],
                "Lending category": [""],
            }
        ),
    ]
).set_index("Economy")
def countries_to_regions(countries: str, country_groups=None) -> str:
    """Translate a ";"-separated list of countries into their regions.

    Args:
        countries: Country names separated by ";"; whitespace around the
            names is ignored. Non-string input (e.g. a missing value) yields
            an empty string.
        country_groups: Frame indexed by country name with a "Region" column.
            Defaults to the notebook-level ``df_country_groups``.

    Returns:
        The unique regions of all recognised countries, sorted and joined
        with ";" so the same set of regions always gives the same string.
        Unknown countries and countries without a region are skipped.
    """
    if country_groups is None:
        country_groups = df_country_groups
    if not isinstance(countries, str):
        return ""

    regions = set()
    for country in countries.split(";"):
        country = country.strip()
        if country not in country_groups.index:
            continue
        region = country_groups.at[country, "Region"]
        # rows without a region hold NaN, which cannot be joined as text
        if isinstance(region, str):
            # a cell may itself list several regions separated by ";"
            regions.update(part.strip() for part in region.split(";") if part.strip())
    return ";".join(sorted(regions))
# Example call: countries_to_regions("India; Nicaragua")
# Replace the region column with the ";"-joined regions of all countries
# listed in each row's country cell.
bib_df['region'] = bib_df['country'].map(countries_to_regions)
# Quick bar chart of how often each region string occurs.
bib_df['region'].value_counts().plot.bar()
```
```{python}
# Turn each ";"-separated region cell into a set of unique region names and
# give every row one copy per region, then plot the region frequencies with
# the most frequent region first.
bib_df = bib_df.assign(
    region=bib_df["region"].apply(
        lambda _cell: {part.strip() for part in _cell.split(";")}
    )
).explode("region")

ax = sns.countplot(
    data=bib_df, x="region", order=bib_df["region"].value_counts().index
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```
```{python}
# Long-format frame with one row per (region, intervention, inequality)
# combination of each row in bib_df.
# NOTE(review): if bib_df was already exploded by region, every pair is
# repeated once per region, which weights the crosstabs built from this
# frame — confirm that this is intended.
df_inequality = (
    bib_df[["region", "intervention", "inequality"]]
    .assign(
        Intervention=lambda _df: (
            _df["intervention"]
            # Remove each parenthesised qualifier on its own; a greedy
            # r"\(.+\)" on the unsplit cell would delete everything between
            # the first "(" and the last ")", including other labels.
            .str.replace(r"\([^)]*\)", "", regex=True)
            # normalise any amount of whitespace around the separators
            .str.replace(r"\s*;\s*", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
        Inequality=lambda _df: (
            _df["inequality"]
            .str.replace(r"\([^)]*\)", "", regex=True)
            .str.replace(r"\s*;\s*", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
    )
    .explode("Intervention")
    .explode("Inequality")
    .reset_index(drop=True)
)
```
```{python}
def crosstab_inequality(df, inequality: str, **kwargs):
    """Cross-tabulate interventions for one inequality next to "income".

    Rows of ``df`` whose ``Inequality`` equals ``inequality`` or ``"income"``
    are tabulated by ``Intervention``. Interventions with a zero entry in the
    ``inequality`` column are removed from the result. Additional keyword
    arguments are forwarded to ``pd.crosstab``.
    """
    wanted = df["Inequality"].isin([inequality, "income"])
    subset = df.loc[wanted]
    table = pd.crosstab(subset["Intervention"], subset["Inequality"], **kwargs)
    empty_rows = table.index[table[inequality] == 0]
    return table.drop(empty_rows)
```
## Gender inequality
```{python}
#| label: tbl-gender-crosstab
#| tbl-cap: Interventions targeting gender inequality
# Absolute counts (normalize=False) of interventions for "gender" alongside
# "income", sorted so the most frequent gender interventions come first.
crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
```
```{python}
def region_vis_inequality(df, inequality: str):
    """Draw a count plot of regions for the rows matching ``inequality``.

    Only rows whose ``Inequality`` equals ``inequality`` are plotted; regions
    are ordered by their frequency in that subset. Returns whatever
    ``sns.countplot`` returns.
    """
    matching = df[df["Inequality"] == inequality]
    region_order = matching["region"].value_counts().index
    return sns.countplot(matching, x="region", order=region_order)
# Regional distribution of the rows coded with the "spatial" inequality.
# NOTE(review): this sits under the "Gender inequality" heading but plots
# "spatial" — confirm which inequality is meant here.
region_vis_inequality(df_inequality, "spatial")
```