Marty Oehme
95ad5ed641
To not double-commit every library change, we simply export ALL of the zotero library into a single file in the 'intermediate' data directory. Technically this still works just as well since it still reflects our 'intermediate' stage of tagging, screening, keywording the library contents. It just contains the non-sampled contents as well now.
322 lines
9.7 KiB
Text
322 lines
9.7 KiB
Text
---
|
|
bibliography: 02-data/intermediate/zotero-library.bib
|
|
title: Grab yml
|
|
---
|
|
|
|
## Separate data acquisition
|
|
|
|
```{python}
|
|
import pandas as pd
|
|
from src import data
|
|
|
|
df = data.from_yml()
|
|
```
|
|
|
|
Get interventions:
|
|
|
|
```{python}
|
|
df['intervention'].str.split(";").explode().str.strip().value_counts()
|
|
```
|
|
|
|
Get inequalities:
|
|
|
|
```{python}
|
|
df['inequality'].str.split(";").explode().str.strip().value_counts()
|
|
```
|
|
|
|
```{python}
|
|
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
|
|
```
|
|
|
|
Unique values in chain method:
|
|
|
|
```{python}
|
|
(
|
|
df.groupby(["author", "year", "title"])
|
|
.agg(
|
|
{
|
|
"intervention": lambda _col:"; ".join(_col),
|
|
"inequality": lambda _col:"; ".join(_col),
|
|
}
|
|
)
|
|
.drop_duplicates()
|
|
.explode("inequality")
|
|
["inequality"].str.strip()
|
|
.value_counts()
|
|
)
|
|
```
|
|
|
|
Merge dataset so it is collected by *STUDY* not by *OBSERVATION*.
|
|
Any required columns can be calculated similarly to the `agg` call used here.
|
|
|
|
```{python}
|
|
by_study = (
|
|
df.groupby(["author", "year", "title"])
|
|
.agg(
|
|
{
|
|
"intervention": lambda _col: "; ".join(_col),
|
|
"inequality": lambda _col: "; ".join(_col),
|
|
"date": lambda _col: "; ".join(_col),
|
|
"findings": lambda _col: "; ".join(_col),
|
|
# "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
|
|
# "income_group": lambda _col: "; ".join(_col),
|
|
}
|
|
)
|
|
.reset_index()
|
|
.drop_duplicates()
|
|
.assign(
|
|
# create de-duplicated joins for all observations
|
|
intervention=lambda _df: _df["intervention"].apply(
|
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
|
),
|
|
inequality=lambda _df: _df["inequality"].apply(
|
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
|
),
|
|
)
|
|
)
|
|
```
|
|
|
|
```{python}
|
|
by_study = (
|
|
df.groupby(["author", "year", "title"])
|
|
.first()
|
|
.reset_index()
|
|
.drop_duplicates()
|
|
.assign(
|
|
# create de-duplicated joins for all observations
|
|
intervention=lambda _df: _df["intervention"].apply(
|
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
|
),
|
|
inequality=lambda _df: _df["inequality"].apply(
|
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
|
),
|
|
)
|
|
)
|
|
```
|
|
|
|
```{python}
|
|
import re
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
by_intervention = (
|
|
df.groupby(["author", "year", "title"])
|
|
.agg(
|
|
{
|
|
"intervention": lambda _col: "; ".join(_col),
|
|
}
|
|
)
|
|
.reset_index()
|
|
.drop_duplicates()
|
|
.assign(
|
|
intervention=lambda _df: _df["intervention"].apply(
|
|
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
|
),
|
|
)
|
|
.explode("intervention")
|
|
)
|
|
sort_order = by_intervention["intervention"].value_counts().index
|
|
|
|
fig = plt.figure()
|
|
fig.set_size_inches(6, 3)
|
|
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
|
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
|
rotation_mode="anchor")
|
|
plt.show()
|
|
by_intervention = None
|
|
```
|
|
|
|
```{python}
|
|
#| label: fig-publications-per-year
|
|
#| fig-cap: Publications per year
|
|
|
|
df_study_years = (
|
|
df.groupby(["author", "year", "title"])
|
|
.first()
|
|
.reset_index()
|
|
.drop_duplicates()
|
|
)
|
|
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
|
|
# FIXME should be timeseries plot so no years are missing
|
|
ax = sns.countplot(df_study_years, x="year", native_scale=True)
|
|
ax.tick_params(axis='x', rotation=45)
|
|
ax.set_xlabel("")
|
|
plt.tight_layout()
|
|
plt.show()
|
|
df_study_years = None
|
|
```
|
|
|
|
```{python}
|
|
#| label: tbl-income-crosstab
|
|
#| tbl-cap: Interventions targeting income inequality
|
|
|
|
df_income = df.copy()
|
|
df_income['Inequality'] = df_income['inequality'].str.split(";").explode(ignore_index=True).str.strip()
|
|
df_income = df_income.loc[df_income['Inequality'] == "income"].copy()
|
|
df_income['Intervention'] = df_income['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()
|
|
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
|
|
```
|
|
|
|
```{python}
|
|
#| label: tbl-income-crosstab
|
|
#| tbl-cap: Interventions targeting income inequality
|
|
|
|
temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
|
|
temp_df['Inequality'] = temp_df['inequality'].str.split(";").explode(ignore_index=True).str.strip()
|
|
temp_df['Intervention'] = temp_df['intervention'].str.split(";").explode(ignore_index=True).str.replace(r"\(.+\)", "", regex=True).str.strip()
|
|
|
|
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
|
|
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
|
|
```
|
|
|
|
## Complete data replication from scoping
|
|
|
|
prep full data set:
|
|
|
|
```{python}
|
|
#| echo: false
|
|
from pathlib import Path
|
|
import re
|
|
## standard imports
|
|
from IPython.core.display import Markdown as md
|
|
import numpy as np
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
from tabulate import tabulate
|
|
import bibtexparser
|
|
|
|
sns.set_style("whitegrid")
|
|
|
|
DATA_DIR=Path("./02-data")
|
|
RAW_DATA=DATA_DIR.joinpath("raw")
|
|
WORKING_DATA=DATA_DIR.joinpath("intermediate")
|
|
PROCESSED_DATA=DATA_DIR.joinpath("processed")
|
|
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
|
|
|
|
bib_string=""
|
|
for partial_bib in RAW_DATA.glob("**/*.bib"):
|
|
with open(partial_bib) as f:
|
|
bib_string+="\n".join(f.readlines())
|
|
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
|
|
|
|
bib_string=""
|
|
for partial_bib in WORKING_DATA.glob("**/*.bib"):
|
|
with open(partial_bib) as f:
|
|
bib_string+="\n".join(f.readlines())
|
|
bib_sample = bibtexparser.parse_string(bib_string)
|
|
```
|
|
|
|
```{python}
|
|
# load relevant studies
|
|
from src import data
|
|
|
|
# load zotero-based metadata: citations and uses
|
|
zot_df = pd.DataFrame([
|
|
[
|
|
entry["doi"] if "doi" in entry.fields_dict else None,
|
|
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
|
|
entry["usage"] if "usage" in entry.fields_dict else None,
|
|
entry["keywords"] if "keywords" in entry.fields_dict else None,
|
|
]
|
|
for entry in bib_sample.entries
|
|
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
|
|
|
|
# Add WB country grouping definitions (income group, world region)
|
|
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
|
|
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
|
|
|
|
bib_df = (data.from_yml(f"{PROCESSED_DATA}")
|
|
.assign(
|
|
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
|
|
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
|
|
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
|
|
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
|
|
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
|
|
year = lambda _df: _df["date"].dt.year,
|
|
region = lambda _df: _df["country"].map(df_country_groups["Region"]),
|
|
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
|
|
)
|
|
.query("year >= 2000")
|
|
)
|
|
zot_df = None
|
|
df_country_groups = None
|
|
```
|
|
|
|
```{python}
|
|
df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
|
|
|
|
def countries_to_regions(countries:str):
|
|
res = set()
|
|
for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
|
|
if c in df_country_groups.index:
|
|
region = df_country_groups.at[c,'Region']
|
|
res.add(region)
|
|
return ";".join(res)
|
|
|
|
# countries_to_regions("India; Nicaragua")
|
|
bib_df['region'] = bib_df['country'].map(countries_to_regions)
|
|
bib_df['region'].value_counts().plot.bar()
|
|
```
|
|
|
|
```{python}
|
|
bib_df = (bib_df
|
|
.assign(
|
|
# create de-duplicated joins for all observations
|
|
region=lambda _df: _df["region"].apply(
|
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
|
),
|
|
)
|
|
.explode("region")
|
|
)
|
|
# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()
|
|
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
|
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
|
rotation_mode="anchor")
|
|
plt.show()
|
|
```
|
|
|
|
```{python}
|
|
df_inequality = (bib_df[["region", "intervention", "inequality"]]
|
|
.assign(
|
|
Intervention = lambda _df: (_df["intervention"]
|
|
.str.replace(r"\(.+\)", "", regex=True)
|
|
.str.replace(r" ?; ?", ";", regex=True)
|
|
.str.strip()
|
|
.str.split(";")
|
|
),
|
|
Inequality = lambda _df: (_df["inequality"]
|
|
.str.replace(r"\(.+\)", "", regex=True)
|
|
.str.replace(r" ?; ?", ";", regex=True)
|
|
.str.strip()
|
|
.str.split(";")
|
|
)
|
|
)
|
|
.explode("Intervention")
|
|
.explode("Inequality")
|
|
.reset_index(drop=True)
|
|
)
|
|
```
|
|
|
|
```{python}
|
|
def crosstab_inequality(df, inequality:str, **kwargs):
|
|
df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")]
|
|
tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs)
|
|
return tab.drop(tab[tab[inequality] == 0].index)
|
|
```
|
|
|
|
## Gender inequality
|
|
|
|
```{python}
|
|
#| label: tbl-gender-crosstab
|
|
#| tbl-cap: Interventions targeting gender inequality
|
|
|
|
crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
|
|
```
|
|
|
|
```{python}
|
|
def region_vis_inequality(df, inequality:str):
|
|
df_temp = df.loc[(df["Inequality"] == inequality)]
|
|
return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index)
|
|
region_vis_inequality(df_inequality, "spatial")
|
|
```
|
|
|