Marty Oehme
4f9acd0816
Begin restructuring data dir by separating out references into their own data sub-dir containing only references and bibtex files.
317 lines
9.6 KiB
Text
317 lines
9.6 KiB
Text
---
|
|
bibliography: data/intermediate/zotero-library.bib
|
|
title: Grab yml
|
|
---
|
|
|
|
## Separate data acquisition
|
|
|
|
```{python}
|
|
import pandas as pd
|
|
from src import load_data
|
|
|
|
df = load_data.from_yml()
|
|
```
|
|
|
|
Get interventions:
|
|
|
|
```{python}
|
|
df['intervention'].str.split(";").explode().str.strip().value_counts()
|
|
```
|
|
|
|
Get inequalities:
|
|
|
|
```{python}
|
|
df['inequality'].str.split(";").explode().str.strip().value_counts()
|
|
```
|
|
|
|
```{python}
|
|
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
|
|
```
|
|
|
|
Unique values in chain method:
|
|
|
|
```{python}
|
|
# Count in how many studies each inequality type occurs (a study counts once
# per type, however many of its observations mention it).
(
    df.groupby(["author", "year", "title"])
    .agg({"inequality": lambda _col: "; ".join(_col)})
    # explode() only expands list-likes; on the joined string it is a no-op,
    # so turn each cell into a de-duplicated set of values first
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        )
    )
    .explode("inequality")["inequality"]
    .value_counts()
)
|
|
```
|
|
|
|
Merge the dataset so that it is grouped by *STUDY* rather than by *OBSERVATION*.
|
|
Any additional columns that are required can be computed in the same way as in the `agg` call below.
|
|
|
|
```{python}
|
|
def _join_observations(_col):
    """Concatenate the values of all observations of one study."""
    return "; ".join(_col)


def _unique_values(_cell):
    """Split a ';'-separated cell into a set of whitespace-trimmed values."""
    return {part.strip() for part in _cell.split(";")}


# One row per study; text columns are the '; '-joined values of its observations.
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": _join_observations,
            "inequality": _join_observations,
            "date": _join_observations,
            "findings": _join_observations,
            # "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
            # "income_group": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(_unique_values),
        inequality=lambda _df: _df["inequality"].apply(_unique_values),
    )
)
|
|
```
|
|
|
|
```{python}
|
|
# One row per study, taken from the first observation of each
# (author, year, title) group.
first_per_study = (
    df.groupby(["author", "year", "title"]).first().reset_index().drop_duplicates()
)

# create de-duplicated joins for all observations
by_study = first_per_study.assign(
    intervention=first_per_study["intervention"].map(
        lambda _cell: {part.strip() for part in _cell.split(";")}
    ),
    inequality=first_per_study["inequality"].map(
        lambda _cell: {part.strip() for part in _cell.split(";")}
    ),
)
|
|
```
|
|
|
|
```{python}
|
|
import re
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
# One row per (study, intervention): join the interventions of all observations
# of a study, strip parenthesised qualifiers and de-duplicate within the study.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # `[^)]*` keeps each match inside one pair of parentheses; a greedy
            # `.*` would run from the first "(" to the last ")" of the joined
            # cell and delete every intervention in between
            lambda _cell: {
                x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")
            }
        ),
    )
    .explode("intervention")
)
# most frequent intervention first
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
by_intervention = None
|
|
```
|
|
|
|
```{python}
|
|
#| label: fig-publications-per-year
|
|
#| fig-cap: Publications per year
|
|
|
|
# One row per study: the first observation of each (author, year, title) group.
study_groups = df.groupby(["author", "year", "title"])
df_study_years = study_groups.first().reset_index().drop_duplicates()

# plot by year
# TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis="x", rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
|
|
```
|
|
|
|
```{python}
|
|
#| label: tbl-income-crosstab
|
|
#| tbl-cap: Interventions targeting income inequality
|
|
|
|
# Expand every observation into one row per (inequality, intervention) pair and
# keep the pairs that target income inequality.
#
# The split lists are exploded on the frame itself so each value stays attached
# to its source row. Assigning `Series.explode(ignore_index=True)` back to a
# column re-aligns the values on a fresh 0..n index and pairs them with
# unrelated rows.
df_income = (
    df.assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality")
    .explode("Intervention")
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised qualifiers from each single intervention
        Intervention=lambda _df: (
            _df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
        ),
    )
    .loc[lambda _df: _df["Inequality"] == "income"]
    .copy()
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
|
|
```
|
|
|
|
```{python}
|
|
#| label: inequality-observation-frames

# This cell renders no table, so it carries a unique plain label instead of
# repeating the `tbl-income-crosstab` label and caption of the crosstab cell.

# One row per (inequality, intervention) pair of every observation. Exploding
# on the frame keeps each value attached to its source row; assigning
# `Series.explode(ignore_index=True)` back to a column would misalign them.
temp_df = (
    df[["intervention", "inequality"]]
    .reset_index(drop=True)
    .assign(
        Inequality=lambda _df: _df["inequality"].str.split(";"),
        Intervention=lambda _df: _df["intervention"].str.split(";"),
    )
    .explode("Inequality")
    .explode("Intervention")
    .assign(
        Inequality=lambda _df: _df["Inequality"].str.strip(),
        # drop parenthesised qualifiers from each single intervention
        Intervention=lambda _df: (
            _df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
        ),
    )
    .reset_index(drop=True)
)

gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
|
|
```
|
|
|
|
## Complete data replication from scoping
|
|
|
|
Prepare the full data set:
|
|
|
|
```{python}
|
|
#| echo: false
|
|
from pathlib import Path
|
|
import re
|
|
## standard imports
|
|
from IPython.core.display import Markdown as md
|
|
import numpy as np
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
from tabulate import tabulate
|
|
import bibtexparser
|
|
import src.globals as g
|
|
|
|
sns.set_style("whitegrid")
|
|
|
|
# Raw sample: every .bib file found below the reference data directory,
# concatenated in a stable (sorted) order. The file text is used as-is;
# joining the result of `readlines()` with "\n" would double every line break.
bib_string = "\n".join(
    partial_bib.read_text(encoding="utf-8")
    for partial_bib in sorted(g.REFERENCE_DATA.glob("**/*.bib"))
)
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# Working sample: the single Zotero export. `joinpath` returns one path, not
# an iterable of paths, so it is read directly instead of being looped over.
bib_string = g.REFERENCE_DATA.joinpath("zotero-library.bib").read_text(encoding="utf-8")
bib_sample = bibtexparser.parse_string(bib_string)
|
|
```
|
|
|
|
```{python}
|
|
# load relevant studies
|
|
from src import load_data
|
|
|
|
# load zotero-based metadata: citations and uses
|
|
# Citation / usage metadata of the Zotero library, indexed by DOI.
# Maps the data frame column name to the BibTeX field it is read from.
zot_fields = {
    "doi": "doi",
    "cited": "times-cited",
    "usage": "usage",
    "keywords": "keywords",
}
zot_df = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in zot_fields.values()
            ]
            for entry in bib_sample.entries
        ],
        columns=list(zot_fields),
    )
    # entries without a DOI cannot be matched to a study by DOI; without this
    # they would all collapse into one row indexed by None
    .dropna(subset=["doi"])
    .drop_duplicates("doi")
    .set_index("doi")
)
|
|
|
|
# Add WB country grouping definitions (income group, world region)
|
|
WB_COUNTRY_GROUPS_FILE = Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
|
|
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
|
|
|
|
# Studies loaded from the processed data, enriched step by step. The steps are
# kept in this order because each one reads columns created by an earlier one.
bib_df = (
    load_data.from_yml(f"{g.PROCESSED_DATA}")
    # DOI: everything after the doi.org host in the study URI
    .assign(
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        )
    )
    # Zotero metadata looked up by DOI
    .assign(
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
    )
    # parse the year into a timestamp, then write the year back from it
    .assign(date=lambda _df: pd.to_datetime(_df["year"], format="%Y"))
    .assign(year=lambda _df: _df["date"].dt.year)
    # World Bank groupings looked up by country name
    .assign(
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
|
|
zot_df = None
|
|
df_country_groups = None
|
|
```
|
|
|
|
```{python}
|
|
# World Bank country groupings plus a synthetic "global" economy that belongs to
# every region. The region list is derived from the data itself: the previous
# hand-written list named "Europe & Central Asia" twice and left out
# "Middle East & North Africa".
wb_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
df_country_groups = pd.concat(
    [
        wb_country_groups,
        pd.DataFrame(
            data={
                "Economy": ["global"],
                "Code": ["WLD"],
                "Region": [
                    ";".join(sorted(wb_country_groups["Region"].dropna().unique()))
                ],
                "Income group": [""],
                "Lending category": [""],
            }
        ),
    ]
).set_index("Economy")
|
|
|
|
def countries_to_regions(countries: str, groups=None) -> str:
    """Translate a ';'-separated list of countries into their regions.

    Args:
        countries: Country names separated by ';'; whitespace around each
            name is ignored. Anything that is not a string (for example a
            missing value) yields an empty result.
        groups: Data frame indexed by country name with a "Region" column.
            Defaults to the module-level ``df_country_groups``.

    Returns:
        The distinct regions of all recognised countries, sorted and joined
        with ';'. Unknown countries and countries without a region are
        skipped, so the result may be an empty string.
    """
    if groups is None:
        groups = df_country_groups
    if not isinstance(countries, str):
        return ""

    regions = set()
    for name in countries.split(";"):
        name = name.strip()
        if name not in groups.index:
            continue
        region = groups.at[name, "Region"]
        # skip missing (NaN) or empty regions, which cannot be joined
        if isinstance(region, str) and region:
            regions.add(region)
    # sorted so the same set of regions always produces the same string
    return ";".join(sorted(regions))
|
|
|
|
# countries_to_regions("India; Nicaragua")
|
|
bib_df['region'] = bib_df['country'].map(countries_to_regions)
|
|
bib_df['region'].value_counts().plot.bar()
|
|
```
|
|
|
|
```{python}
|
|
# Turn the ';'-joined region string of every row into a set of distinct
# regions, then expand to one row per (observation, region).
# create de-duplicated joins for all observations
region_sets = bib_df["region"].map(
    lambda _cell: {part.strip() for part in _cell.split(";")}
)
bib_df = bib_df.assign(region=region_sets).explode("region")

# bars ordered from most to least frequent region
region_order = bib_df["region"].value_counts().index
ax = sns.countplot(bib_df, x="region", order=region_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
```
|
|
|
|
```{python}
|
|
# One row per (region, intervention, inequality) combination of every
# observation, with cleaned values in the capitalised columns.
df_inequality = (
    bib_df[["region", "intervention", "inequality"]]
    .assign(
        Intervention=lambda _df: (
            _df["intervention"]
            # remove parenthesised qualifiers; `[^)]+` keeps each match inside
            # one pair of parentheses, whereas a greedy `.+` spans from the
            # first "(" to the last ")" and swallows the items in between
            .str.replace(r"\([^)]+\)", "", regex=True)
            # drop all whitespace around the separators
            .str.replace(r"\s*;\s*", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
        Inequality=lambda _df: (
            _df["inequality"]
            .str.replace(r"\([^)]+\)", "", regex=True)
            .str.replace(r"\s*;\s*", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
    )
    .explode("Intervention")
    .explode("Inequality")
    .reset_index(drop=True)
)
|
|
```
|
|
|
|
```{python}
|
|
def crosstab_inequality(df, inequality: str, **kwargs):
    """Cross-tabulate interventions against one inequality type and income.

    Args:
        df: Data frame with one "Intervention" and one "Inequality" value per
            row.
        inequality: Inequality type to tabulate next to "income".
        **kwargs: Passed through to ``pandas.crosstab``.

    Returns:
        The crosstab restricted to interventions that occur at least once
        with ``inequality``. If no row has that inequality the table is empty
        but still has an ``inequality`` column.
    """
    subset = df.loc[df["Inequality"].isin([inequality, "income"])]
    tab = pd.crosstab(subset["Intervention"], subset["Inequality"], **kwargs)
    if inequality not in tab.columns:
        # crosstab only creates columns for values that occur; add the missing
        # one so the filter below and callers sorting by it do not raise
        tab[inequality] = 0
    return tab.loc[tab[inequality] != 0]
|
|
```
|
|
|
|
## Gender inequality
|
|
|
|
```{python}
|
|
#| label: tbl-gender-crosstab
|
|
#| tbl-cap: Interventions targeting gender inequality
|
|
|
|
crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
|
|
```
|
|
|
|
```{python}
|
|
def region_vis_inequality(df, inequality: str):
    """Plot how many rows of one inequality type fall into each region.

    Args:
        df: Data frame with "Inequality" and "region" columns.
        inequality: Value of "Inequality" to keep.

    Returns:
        The result of ``sns.countplot``, with bars ordered from the most to
        the least frequent region.
    """
    matching = df[df["Inequality"] == inequality]
    region_order = matching["region"].value_counts().index
    return sns.countplot(matching, x="region", order=region_order)
|
|
region_vis_inequality(df_inequality, "spatial")
|
|
```
|
|
|