---
bibliography: 02-data/supplementary/lib.bib
title: Grab yml
---

```{python}
import pandas as pd

from src import data

# Load the review dataset (one row per OBSERVATION) from the YAML sources.
df = data.from_yml()
```

Get interventions:

```{python}
df['intervention'].str.split(";").explode().str.strip().value_counts()
```

Get inequalities:

```{python}
df['inequality'].str.split(";").explode().str.strip().value_counts()
```

```{python}
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```

Unique values in chain method:

```{python}
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .drop_duplicates()
    .explode("inequality")["inequality"]
    .str.strip()
    .value_counts()
)
```

Merge dataset so it is collected by *STUDY* not by *OBSERVATION*.
Any required columns can be calculated similarly to the `agg` function here.

```{python}
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
            "date": lambda _col: "; ".join(_col),
            "findings": lambda _col: "; ".join(_col),
            # "region": lambda _col: "; ".join(_col),  # only accessible when merging with WB data
            # "income_group": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
    )
)
```

```{python}
# NOTE(review): this chunk OVERWRITES the `by_study` built above, keeping only
# the first observation per study (`.first()`) instead of the ";"-joined
# aggregation — only this version survives for downstream chunks. Confirm this
# is intentional; otherwise drop one of the two definitions.
by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
    )
)
```

```{python}
import re

# FIX: `plt` and `sns` were used below but never imported anywhere in the
# document, so this chunk could not run as written.
import matplotlib.pyplot as plt
import seaborn as sns

# One row per (study, intervention) pair: join all interventions per study,
# drop parenthesised qualifiers, de-duplicate, then explode back out.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {
                x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")
            }
        ),
    )
    .explode("intervention")
)

# Most-frequent interventions first on the x-axis.
# FIX: `sort_order` was computed but never used — the countplot recomputed
# value_counts() inline; reuse the variable instead.
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()

by_intervention = None  # release the intermediate frame
```