---
bibliography: 02-data/supplementary/lib.bib
title: Grab yml
---

```{python}
import pandas as pd

from src import data

df = data.from_yml()
```

Get intervention counts:

```{python}
df["intervention"].str.split(";").explode().str.strip().value_counts()
```

Get inequality counts:

```{python}
df["inequality"].str.split(";").explode().str.strip().value_counts()
```

One row per study, joined back to every intervention recorded for that study:

```{python}
df.groupby(["author", "year", "title"]).first().join(
    df.set_index(["author", "year", "title"])["intervention"], rsuffix="_all"
)
```

Inequality counts after collapsing to one row per study, as a single chained expression:

```{python}
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .drop_duplicates()
    .assign(inequality=lambda _df: _df["inequality"].str.split(";"))
    .explode("inequality")["inequality"]
    .str.strip()
    .value_counts()
)
```

Merge the dataset so that it is collected by *STUDY* rather than by *OBSERVATION*. Any additional columns that are required can be calculated in the same way as in the `agg` call here.

```{python}
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
            "date": lambda _col: "; ".join(_col),
            "findings": lambda _col: "; ".join(_col),
            # only accessible when merging with WB data (see the sketch at the end of this document):
            # "region": lambda _col: "; ".join(_col),
            # "income_group": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
    )
)
```

A simpler variant keeps the first observation of each study instead of aggregating every column:

```{python}
by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: {x.strip() for x in _cell.split(";")}
        ),
    )
)
```

```{python}
import re

from matplotlib import pyplot as plt
import seaborn as sns

# one row per (study, intervention), with parenthesised qualifiers stripped from the labels
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: {x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")}
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
by_intervention = None
```

```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year
df_study_years = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
)
# plot by year; TODO decide whether to distinguish by literature type/region/etc. as hue
# FIXME should be a time-series plot so that years without publications are not dropped
#       (see the reindexing sketch at the end of this document)
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis="x", rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
```

```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
# explode the ";"-separated cells so every row holds a single value before tabulating
df_income = df.copy()
df_income["Inequality"] = df_income["inequality"].str.split(";")
df_income = df_income.explode("Inequality", ignore_index=True)
df_income["Inequality"] = df_income["Inequality"].str.strip()
df_income = df_income.loc[df_income["Inequality"] == "income"].copy()
df_income["Intervention"] = df_income["intervention"].str.split(";")
df_income = df_income.explode("Intervention", ignore_index=True)
df_income["Intervention"] = (
    df_income["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```

The same tabulation as a reusable function:

```{python}
#| label: tbl-income-crosstab-fn
#| tbl-cap: Interventions targeting income inequality
def inequality_crosstab(df: pd.DataFrame, inequality: str) -> pd.DataFrame:
    """Cross-tabulate interventions against a single inequality category."""
    temp_df = df.copy()
    temp_df["Inequality"] = temp_df["inequality"].str.split(";")
    temp_df = temp_df.explode("Inequality", ignore_index=True)
    temp_df["Inequality"] = temp_df["Inequality"].str.strip()
    temp_df = temp_df.loc[temp_df["Inequality"] == inequality].copy()
    temp_df["Intervention"] = temp_df["intervention"].str.split(";")
    temp_df = temp_df.explode("Intervention", ignore_index=True)
    temp_df["Intervention"] = (
        temp_df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
    )
    return pd.crosstab(temp_df["Intervention"], temp_df["Inequality"])


inequality_crosstab(df, "income")
```
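
The crosstab helper is not specific to `income`; it can be applied to every inequality category present in the data. A minimal sketch, reusing only the `inequality` column and the `inequality_crosstab` function defined above:

```{python}
# distinct inequality categories found in the data
categories = df["inequality"].str.split(";").explode().str.strip().dropna().unique()

# one crosstab per category, keyed by category name; empty tables are dropped
tables = {category: inequality_crosstab(df, category) for category in categories}
tables = {name: tab for name, tab in tables.items() if not tab.empty}
list(tables)
```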
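
The FIXME in @fig-publications-per-year points out that `countplot` drops years in which nothing was published. A minimal sketch of the reindexing idea, assuming the `year` column can be cast to integers (not verified here):

```{python}
# count studies per year, then reindex over the full range so missing years appear as zero
year_counts = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()["year"]
    .astype(int)
    .value_counts()
    .sort_index()
)
year_counts = year_counts.reindex(
    range(year_counts.index.min(), year_counts.index.max() + 1), fill_value=0
)

fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(year_counts.index, year_counts.values)
ax.set_xlabel("")
ax.set_ylabel("publications")
plt.tight_layout()
plt.show()
```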
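
The commented-out `region` and `income_group` aggregations in the `by_study` chunk depend on World Bank metadata that is not part of `df` itself. The sketch below shows what such a merge could look like; the file name `wb_countries.csv`, its columns, and a shared `country` key are all assumptions rather than something this document confirms, so the chunk is not evaluated:

```{python}
#| eval: false
# hypothetical World Bank lookup table: one row per country with region and income group
wb = pd.read_csv("02-data/supplementary/wb_countries.csv")  # assumed file and location

# assumed shared key: a `country` column present in both tables
df_wb = df.merge(
    wb[["country", "region", "income_group"]],
    on="country",
    how="left",
    validate="many_to_one",
)

# once merged, the commented-out region/income_group aggregations in `by_study` can be enabled
df_wb[["region", "income_group"]].value_counts()
```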