From b5e467e016b0b7e533f2fd519a575d4e9de85eb0 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Mon, 11 Dec 2023 17:14:50 +0100
Subject: [PATCH] feat(code): Add list handling examples to notebook

Extracts interventions/inequalities and explodes them for value counts.
---
 00-notebooks/yml-grab.qmd | 99 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/00-notebooks/yml-grab.qmd b/00-notebooks/yml-grab.qmd
index 9d6feb4..eebf5eb 100644
--- a/00-notebooks/yml-grab.qmd
+++ b/00-notebooks/yml-grab.qmd
@@ -21,3 +21,102 @@ Get inequalities:
 ```{python}
 df['inequality'].str.split(";").explode().str.strip().value_counts()
 ```
+
+```{python}
+df.groupby(["author", "year", "title"]).first().join(df.groupby(["author", "year", "title"])["intervention"].agg("; ".join), rsuffix="_all")
+```
+
+Unique values in a single method chain:
+
+```{python}
+(
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+            "inequality": lambda _col: "; ".join(_col),
+        }
+    )
+    .drop_duplicates()
+    .assign(inequality=lambda _df: _df["inequality"].str.split(";"))
+    .explode("inequality")["inequality"].str.strip()
+    .value_counts()
+)
+```
+
+Merge the dataset so it is collected by *STUDY*, not by *OBSERVATION*.
+Any required columns can be calculated similarly to the agg function here.
+
+```{python}
+by_study = (
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+            "inequality": lambda _col: "; ".join(_col),
+            "date": lambda _col: "; ".join(_col),
+            "findings": lambda _col: "; ".join(_col),
+            # "region": lambda _col: "; ".join(_col),  # only accessible when merging with WB data
+            # "income_group": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        # create de-duplicated joins for all observations
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+        inequality=lambda _df: _df["inequality"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+    )
+)
+```
+
+```{python}
+by_study = (
+    df.groupby(["author", "year", "title"])
+    .first()
+    .reset_index()
+    .drop_duplicates()
+    .assign(
+        # create de-duplicated joins for all observations
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+        inequality=lambda _df: _df["inequality"].apply(
+            lambda _cell: set([x.strip() for x in _cell.split(";")])
+        ),
+    )
+)
+```
+
+```{python}
+import re
+by_intervention = (
+    df.groupby(["author", "year", "title"])
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(  # drop parenthesized qualifiers before splitting into sets
+        intervention=lambda _df: _df["intervention"].apply(
+            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+        ),
+    )
+    .explode("intervention")
+)
+sort_order = by_intervention["intervention"].value_counts().index
+
+fig = plt.figure()
+fig.set_size_inches(6, 3)
+ax = sns.countplot(data=by_intervention, x="intervention", order=sort_order)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
+         rotation_mode="anchor")
+plt.show()
+by_intervention = None  # release the helper frame
+```
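
For reviewing the pattern without the project's dataset, here is a minimal, self-contained sketch of the group/join/split/explode/value_counts chain the patch relies on. The toy rows below are invented purely for illustration; only the column names (author, year, title, inequality) mirror the notebook.

```python
import pandas as pd

# toy stand-in: one row per OBSERVATION, with ";"-separated multi-value cells
toy = pd.DataFrame(
    {
        "author": ["A", "A", "B"],
        "year": [2020, 2020, 2021],
        "title": ["t1", "t1", "t2"],
        "inequality": ["income; gender", "income", "spatial"],
    }
)

# collapse to one row per STUDY, then split the joined string back into
# individual values, strip whitespace, and count occurrences
counts = (
    toy.groupby(["author", "year", "title"])
    .agg({"inequality": "; ".join})
    .assign(inequality=lambda _df: _df["inequality"].str.split(";"))
    .explode("inequality")["inequality"]
    .str.strip()
    .value_counts()
)
print(counts)  # income appears twice (both rows of study A), gender and spatial once each
```

The same idea, extended with a `set(...)` per cell, is what the `by_study` cells in the patch use to de-duplicate values within a study before exploding.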