|
|
|
---
|
|
|
|
bibliography: 02-data/supplementary/lib.bib
|
|
|
|
title: Grab yml
|
|
|
|
---
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
import pandas as pd

from src import data

# Load the study library (YAML) into a single observations dataframe.
df = data.from_yml()
|
|
|
|
```
|
|
|
|
|
|
|
|
Get interventions:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Frequency of each observation-level intervention tag.
(
    df["intervention"]
    .str.split(";")   # one list of tags per observation
    .explode()        # one tag per row
    .str.strip()
    .value_counts()
)
|
|
|
|
```
|
|
|
|
|
|
|
|
Get inequalities:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Frequency of each observation-level inequality tag.
(
    df["inequality"]
    .str.split(";")   # one list of tags per observation
    .explode()        # one tag per row
    .str.strip()
    .value_counts()
)
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Collapse to one representative row per study, then re-attach the raw
# per-observation intervention column.
# NOTE(review): `.first()` already keeps an "intervention" column and
# `df['intervention']` carries the original row index, not the
# (author, year, title) MultiIndex — this join likely raises on the
# overlapping column name or misaligns. Confirm intent before relying on it.
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
|
|
|
|
```
|
|
|
|
|
|
|
|
Unique values in chain method:
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Unique inequality values, one count per study, in a single method chain:
# join all observations of a study into one string, then split it back into
# a de-duplicated set before exploding.
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .drop_duplicates()
    # FIX: the joined string must be split (and de-duplicated) before
    # exploding — `.explode()` on a plain string column is a no-op, so
    # "a; b" was previously counted as a single value instead of two.
    .assign(
        inequality=lambda _df: _df["inequality"].map(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        )
    )
    .explode("inequality")
    ["inequality"]
    .value_counts()
)
|
|
|
|
```
|
|
|
|
|
|
|
|
Merge dataset so it is collected by *STUDY* not by *OBSERVATION*.
|
|
|
|
Any required columns can be calculated similar to the agg function here.
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
def _join_obs(col):
    # Concatenate all observation values for one study into a single string.
    return "; ".join(col)


def _dedup_tags(cell):
    # Split a joined cell back into a de-duplicated set of trimmed tags.
    return {part.strip() for part in cell.split(";")}


# One row per study (not per observation): string columns are joined across
# observations, then intervention/inequality are reduced to tag sets.
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": _join_obs,
            "inequality": _join_obs,
            "date": _join_obs,
            "findings": _join_obs,
            # "region": _join_obs,  # only accessible when merging with WB data
            # "income_group": _join_obs,
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda frame: frame["intervention"].map(_dedup_tags),
        inequality=lambda frame: frame["inequality"].map(_dedup_tags),
    )
)
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
# Alternative per-study view: keep only the first observation's values, then
# replace the ";"-joined tag strings with de-duplicated sets of trimmed tags.
by_study = (
    df.groupby(["author", "year", "title"])
    .first()             # representative (first) observation per study
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda frame: frame["intervention"].map(
            lambda cell: {part.strip() for part in cell.split(";")}
        ),
        inequality=lambda frame: frame["inequality"].map(
            lambda cell: {part.strip() for part in cell.split(";")}
        ),
    )
)
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
import re

from matplotlib import pyplot as plt
import seaborn as sns

# One row per (study, intervention): join all observations of a study,
# strip parenthesised qualifiers, split back into a de-duplicated set,
# then explode so each intervention can be counted once per study.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")}
        ),
    )
    .explode("intervention")
)

# Bars ordered by descending frequency.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
# FIX: use the hoisted `sort_order` instead of recomputing
# value_counts().index inline (it was previously computed and never used).
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()

by_intervention = None  # release the exploratory frame
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: fig-publications-per-year
#| fig-cap: Publications per year

# One representative row per study so each publication is counted once.
df_study_years = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
)

# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()

df_study_years = None  # release the temporary frame
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality

# FIX: the previous version assigned an exploded (longer, re-indexed) Series
# back onto the un-exploded frame; pandas aligns on index, so values were
# silently misassigned whenever a cell held more than one ";"-separated
# entry. Explode the frame itself so every column stays on its own row.
df_income = df.copy()
df_income['Inequality'] = df_income['inequality'].str.split(";")
df_income = df_income.explode('Inequality')
df_income['Inequality'] = df_income['Inequality'].str.strip()
df_income = df_income.loc[df_income['Inequality'] == "income"].copy()
df_income['Intervention'] = df_income['intervention'].str.split(";")
df_income = df_income.explode('Intervention')
# drop parenthesised qualifiers, e.g. "tax (direct)" -> "tax"
df_income['Intervention'] = (
    df_income['Intervention'].str.replace(r"\(.+\)", "", regex=True).str.strip()
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
|
|
|
|
```
|
|
|
|
|
|
|
|
```{python}
|
|
|
|
#| label: tbl-income-crosstab-fn
|
|
|
|
#| tbl-cap: Interventions targeting income inequality
|
|
|
|
|
|
|
|
def inequality_crosstab(df, inequality: str):
    """Cross-tabulate interventions against one inequality type.

    Parameters
    ----------
    df : pandas.DataFrame
        Observation-level frame with ";"-separated string columns
        ``inequality`` and ``intervention``.
    inequality : str
        Inequality type to filter on, e.g. ``"income"``.

    Returns
    -------
    pandas.DataFrame
        Crosstab of intervention (rows) vs. the selected inequality (column).
    """
    temp_df = df.copy()
    # FIX: explode the frame, not a detached re-indexed Series — the previous
    # `explode(ignore_index=True)` assignment aligned on index and silently
    # misassigned values whenever a cell held multiple ";"-separated entries.
    temp_df['Inequality'] = temp_df['inequality'].str.split(";")
    temp_df = temp_df.explode('Inequality')
    temp_df['Inequality'] = temp_df['Inequality'].str.strip()
    temp_df = temp_df.loc[temp_df['Inequality'] == inequality].copy()
    temp_df['Intervention'] = temp_df['intervention'].str.split(";")
    temp_df = temp_df.explode('Intervention')
    # drop parenthesised qualifiers, e.g. "tax (direct)" -> "tax"
    temp_df['Intervention'] = (
        temp_df['Intervention'].str.replace(r"\(.+\)", "", regex=True).str.strip()
    )
    return pd.crosstab(temp_df["Intervention"], temp_df["Inequality"])
|
|
|
|
|
|
|
|
# Render the income crosstab for the loaded dataset (displayed by Quarto).
inequality_crosstab(df, "income")
|
|
|
|
```
|