feat(code): Add examples of list handling notebook

Extracts interventions/inequalities and explodes them for value counts.
This commit is contained in:
Marty Oehme 2023-12-11 17:14:50 +01:00
parent 85497854c1
commit b5e467e016
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -21,3 +21,102 @@ Get inequalities:
```{python}
# Count how often each inequality occurs: split the ";"-separated cells,
# put every entry on its own row, trim whitespace, then tally.
df['inequality'].str.split(";").explode().str.strip().value_counts()
```
```{python}
# Collapse the observations to one row per study (first entry of each
# column), then join the raw intervention column onto that result.
# NOTE(review): the grouped frame is indexed by (author, year, title) and
# presumably still carries its own "intervention" column, so this join may
# raise on the overlapping column name -- confirm the intent.
(
    df.groupby(["author", "year", "title"])
    .first()
    .join(df["intervention"])
)
```
Unique values in chain method:
```{python}
# Tally every distinct inequality after collapsing observations into studies.
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            # concatenate the entries of all observations of one study
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .drop_duplicates()
    # explode() only expands list-likes and leaves plain strings untouched,
    # so the joined string has to be split back into a list first
    .assign(inequality=lambda _df: _df["inequality"].str.split(";"))
    .explode("inequality")
    ["inequality"].str.strip()
    .value_counts()
)
```
Merge the dataset so that it is grouped by *STUDY* rather than by *OBSERVATION*.
Any additional required columns can be aggregated in the same way as in the `agg` call here.
```{python}
# Build one row per study: the text columns of all observations sharing the
# same (author, year, title) are concatenated with "; ".
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            _name: (lambda _values: "; ".join(_values))
            for _name in (
                "intervention",
                "inequality",
                "date",
                "findings",
                # "region",  # only accessible when merging with WB data
                # "income_group",
            )
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # turn the joined strings into sets so that repeated entries collapse
        intervention=lambda _frame: _frame["intervention"].apply(
            lambda _text: {_part.strip() for _part in _text.split(";")}
        ),
        inequality=lambda _frame: _frame["inequality"].apply(
            lambda _text: {_part.strip() for _part in _text.split(";")}
        ),
    )
)
```
```{python}
# Alternative per-study table: keep only the first entry of every column for
# each (author, year, title) group instead of joining all observations.
by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # split the ";"-separated cell into a set of trimmed, distinct entries
        intervention=lambda _frame: _frame["intervention"].apply(
            lambda _text: {_part.strip() for _part in _text.split(";")}
        ),
        inequality=lambda _frame: _frame["inequality"].apply(
            lambda _text: {_part.strip() for _part in _text.split(";")}
        ),
    )
)
```{python}
import re

# Count how many studies use each intervention and draw the result as a bar
# chart ordered from most to least frequent.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Drop parenthesised remarks one bracket pair at a time; a greedy
            # r"\(.*\)" would run from the first "(" to the last ")" of the
            # joined string and swallow every intervention in between.
            lambda _cell: {
                x.strip() for x in re.sub(r"\([^()]*\)", "", _cell).split(";")
            }
        ),
    )
    .explode("intervention")
)
# Most frequent intervention first; reused as the bar order below.
sort_order = by_intervention["intervention"].value_counts().index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
by_intervention = None  # release the exploded frame once the figure is drawn
```