feat(code): Add examples of list handling notebook

Extracts interventions/inequalities and explodes them for value counts.
2023-12-11 17:14:50 +01:00 · 2023-12-11 17:14:50 +01:00 · b5e467e016
commit b5e467e016
parent 85497854c1
1 changed files with 99 additions and 0 deletions
--- a/00-notebooks/yml-grab.qmd
+++ b/00-notebooks/yml-grab.qmd
@ -21,3 +21,102 @@ Get inequalities:
 ```{python}
 # Tally how often each inequality occurs: the cells hold ";"-separated
 # entries, so split them, give every entry its own row and trim whitespace.
 (
    df["inequality"]
    .str.split(";")
    .explode()
    .str.strip()
    .value_counts()
 )
 ```
 ```{python}
 # Take the first entry of every column per (author, year, title) group and
 # join the `intervention` column of `df` onto the result by index.
 # NOTE(review): the grouped frame is indexed by (author, year, title) while
 # df["intervention"] keeps df's own index, and the grouped frame presumably
 # already holds an `intervention` column - confirm this join behaves as intended.
 (
    df.groupby(["author", "year", "title"])
    .first()
    .join(df["intervention"])
 )
 ```
 Count the unique values using a single method chain:
 ```{python}
 # Count inequalities after collapsing the observations of each study
 # (author, year, title) into "; "-joined strings.
 (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    # NOTE(review): the study keys live in the index here, so only the two
    # aggregated columns are compared - studies with identical values are
    # collapsed into a single row. Confirm that this is intended.
    .drop_duplicates()
    ["inequality"]
    # explode() only expands list-likes and leaves plain strings untouched,
    # so the joined strings must be split into lists before exploding.
    .str.split(";")
    .explode()
    .str.strip()
    .value_counts()
 )
 ```
 Merge the dataset so that it is organised by *STUDY* rather than by *OBSERVATION*.
 Any further columns that are required can be aggregated in the same way as in the `agg` call below.
 ```{python}
 # One row per study (author, year, title): the listed columns of all its
 # observations are concatenated into a single "; "-separated string each.
 by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            _name: (lambda _values: "; ".join(_values))
            for _name in ("intervention", "inequality", "date", "findings")
            # "region" and "income_group" are only accessible when merging with WB data
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # replace the joined strings with sets of trimmed entries, which
        # de-duplicates values repeated across a study's observations
        intervention=lambda _frame: _frame["intervention"].apply(
            lambda _joined: {_part.strip() for _part in _joined.split(";")}
        ),
        inequality=lambda _frame: _frame["inequality"].apply(
            lambda _joined: {_part.strip() for _part in _joined.split(";")}
        ),
    )
 )
 ```
 ```{python}
 # Variant that keeps every column: one row per study (author, year, title),
 # holding the first entry of each column within the group.
 by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # turn the ";"-separated cells into sets of trimmed entries; because of
        # .first() above, only the group's first entry feeds into each set
        intervention=lambda _frame: _frame["intervention"].apply(
            lambda _entry: {_part.strip() for _part in _entry.split(";")}
        ),
        inequality=lambda _frame: _frame["inequality"].apply(
            lambda _entry: {_part.strip() for _part in _entry.split(";")}
        ),
    )
 )
 ```
 ```{python}
 import re
 # One row per (study, intervention): join the intervention cells of each
 # study, strip parenthesised details, split on ";" and de-duplicate.
 by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Non-greedy match: remove each "(...)" group on its own. A greedy
            # ".*" would delete everything between the first "(" and the last
            # ")" of the joined string, swallowing whole interventions.
            lambda _cell: {x.strip() for x in re.sub(r"\(.*?\)", "", _cell).split(";")}
        ),
    )
    .explode("intervention")
 )
 # Bar order for the plot: most frequent intervention first.
 sort_order = by_intervention["intervention"].value_counts().index
 fig = plt.figure()
 fig.set_size_inches(6, 3)
 ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
 # Slant the tick labels, anchored at their right end.
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
 plt.show()
 # Drop the reference to the exploded frame once it has been plotted.
 by_intervention = None
 ```