---
bibliography: data/intermediate/zotero-library.bib
title: Grab yml
---

## Separate data acquisition

```{python}
# Load the observation-level dataset from the intermediate YAML files.
import pandas as pd
from src import load_data

df = load_data.from_yml()
```

Get interventions:

```{python}
# Frequency of each individual intervention tag (cells are ";"-separated).
df['intervention'].str.split(";").explode().str.strip().value_counts()
```

Get inequalities:

```{python}
# Frequency of each individual inequality tag.
df['inequality'].str.split(";").explode().str.strip().value_counts()
```

```{python}
# NOTE(review): .first() keeps an 'intervention' column and is indexed by
# (author, year, title), while df['intervention'] carries df's original
# index — this join likely fails on the overlapping column name or aligns
# nothing. Confirm intent before relying on its output.
df.groupby(["author", "year", "title"]).first().join(df['intervention'])
```

Unique values in chain method:

```{python}
# Collapse to one row per study, then count unique inequality tags.
# NOTE(review): after .agg, 'inequality' holds ";"-joined scalar strings, so
# .explode() without a prior .str.split(";") is a no-op — verify this counts
# what is intended.
(
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col:"; ".join(_col),
            "inequality": lambda _col:"; ".join(_col),
        }
    )
    .drop_duplicates()
    .explode("inequality")
    ["inequality"].str.strip()
    .value_counts()
)
```

Merge dataset so it is collected by *STUDY* not by *OBSERVATION*. Any required columns can be calculated similar to the agg function here.

```{python}
# One row per study, ";"-joining the observation-level columns, then turning
# the tag columns into de-duplicated sets.
by_study = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
            "inequality": lambda _col: "; ".join(_col),
            # NOTE(review): assumes 'date' and 'findings' cells are str —
            # TODO confirm dtypes, join raises TypeError otherwise
            "date": lambda _col: "; ".join(_col),
            "findings": lambda _col: "; ".join(_col),
            # "region": lambda _col: "; ".join(_col), # only accessible when merging with WB data
            # "income_group": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
)
```

```{python}
# NOTE(review): this overwrites the agg-based by_study built just above,
# keeping only each study's first observation instead of the joined columns.
by_study = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
    .assign(
        # create de-duplicated joins for all observations
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
)
```

```{python}
import re

from matplotlib import pyplot as plt
import seaborn as sns

# One row per (study, intervention) with parenthesised qualifiers stripped,
# so interventions can be counted per study rather than per observation.
by_intervention = (
    df.groupby(["author", "year", "title"])
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set(
                x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")
            )
        ),
    )
    .explode("intervention")
)
# most frequent intervention first
sort_order = by_intervention["intervention"].value_counts().index

fig = plt.figure()
fig.set_size_inches(6, 3)
# FIX(review): reuse sort_order instead of recomputing value_counts inline.
ax = sns.countplot(by_intervention, x="intervention", order=sort_order)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
by_intervention = None
```

```{python}
#| label: fig-publications-per-year
#| fig-cap: Publications per year
df_study_years = (
    df.groupby(["author", "year", "title"])
    .first()
    .reset_index()
    .drop_duplicates()
)
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year", native_scale=True)
ax.tick_params(axis='x', rotation=45)
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
```

```{python}
#| label: tbl-income-crosstab
#| tbl-cap: Interventions targeting income inequality
# FIX(review): the previous version assigned an exploded (longer) Series back
# onto the un-exploded frame with ignore_index=True, silently misaligning rows
# after the first multi-valued cell. Explode the frame itself instead.
df_income = (
    df.assign(Inequality=df["inequality"].str.split(";"))
    .explode("Inequality")
)
df_income["Inequality"] = df_income["Inequality"].str.strip()
df_income = df_income.loc[df_income["Inequality"] == "income"].copy()
df_income = (
    df_income.assign(Intervention=df_income["intervention"].str.split(";"))
    .explode("Intervention")
)
df_income["Intervention"] = (
    df_income["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
)
pd.crosstab(df_income["Intervention"], df_income["Inequality"])
```

```{python}
#| label: tbl-gender-income-prep
# FIX(review): renamed from a second "tbl-income-crosstab" — Quarto labels
# must be unique; the tbl-cap was dropped because this chunk renders no table.
# Same explode fix as above: explode the frame, never assign a longer Series.
temp_df = df[["intervention", "inequality"]].copy().reset_index(drop=True)
temp_df = (
    temp_df.assign(Inequality=temp_df["inequality"].str.split(";"))
    .explode("Inequality")
)
temp_df["Inequality"] = temp_df["Inequality"].str.strip()
temp_df = (
    temp_df.assign(Intervention=temp_df["intervention"].str.split(";"))
    .explode("Intervention")
)
temp_df["Intervention"] = (
    temp_df["Intervention"].str.replace(r"\(.+\)", "", regex=True).str.strip()
)
gender_df = temp_df.loc[temp_df["Inequality"] == "gender"]
income_df = temp_df.loc[temp_df["Inequality"] == "income"]
```

## Complete data replication from scoping

Prepare the full data set:

```{python}
#| echo: false
from pathlib import Path
import re

## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate

import bibtexparser

import src.globals as g

sns.set_style("whitegrid")

# Concatenate every partial .bib file into one raw sample database.
# FIX(review): f.read() keeps the original line breaks; the previous
# "\n".join(f.readlines()) doubled every newline.
bib_string = ""
for partial_bib in g.REFERENCE_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read()
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# FIX(review): the zotero library is a single file; iterating the Path object
# (as the previous loop did) raises TypeError — open it directly.
with open(g.REFERENCE_DATA.joinpath("zotero-library.bib")) as f:
    bib_string = f.read()
bib_sample = bibtexparser.parse_string(bib_string)
```

```{python}
# load relevant studies
from src import load_data

# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame(
    [
        [
            entry["doi"] if "doi" in entry.fields_dict else None,
            entry["times-cited"] if "times-cited" in entry.fields_dict else None,
            entry["usage"] if "usage" in entry.fields_dict else None,
            entry["keywords"] if "keywords" in entry.fields_dict else None,
        ]
        for entry in bib_sample.entries
    ],
    columns=["doi", "cited", "usage", "keywords"],
).drop_duplicates("doi").set_index("doi")

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (
    load_data.from_yml(f"{g.PROCESSED_DATA}")
    .assign(
        # pull the DOI out of the uri column
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        # NOTE(review): multi-country cells ("India; Nicaragua") do not match
        # the Economy index and map to NaN here; 'region' is rebuilt below
        # via countries_to_regions.
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)
zot_df = None
df_country_groups = None
```

```{python}
# Re-load the WB groupings with a synthetic "global" economy that maps to all
# seven WB regions, so global studies are counted in every region.
# FIX(review): the previous list repeated "Europe & Central Asia" and omitted
# "Middle East & North Africa" — the duplicate was almost certainly meant to
# be the MENA entry.
df_country_groups = pd.concat(
    [
        pd.read_excel(WB_COUNTRY_GROUPS_FILE),
        pd.DataFrame(
            data={
                "Economy": ["global"],
                "Code": ["WLD"],
                "Region": [
                    "Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Middle East & North Africa;Latin America & Caribbean"
                ],
                "Income group": [""],
                "Lending category": [""],
            }
        ),
    ]
).set_index("Economy")


def countries_to_regions(countries: str):
    """Map a ';'-separated country string to a ';'-joined set of WB regions.

    Countries not present in df_country_groups' index are silently skipped.
    """
    # FIX(review): country cells can be NaN; .replace on a float crashed.
    if not isinstance(countries, str):
        return ""
    res = set()
    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
        if c in df_country_groups.index:
            region = df_country_groups.at[c, 'Region']
            res.add(region)
    return ";".join(res)


# countries_to_regions("India; Nicaragua")
bib_df['region'] = bib_df['country'].map(countries_to_regions)
bib_df['region'].value_counts().plot.bar()
```

```{python}
# One row per (study, region); a study spanning several regions is counted in
# each of them.
bib_df = (
    bib_df.assign(
        # create de-duplicated joins for all observations
        region=lambda _df: _df["region"].apply(
            lambda _cell: set(x.strip() for x in _cell.split(";"))
        ),
    )
    .explode("region")
)
# bib_df["region"] = bib_df["region"].str.split(";").explode().str.strip()
ax = sns.countplot(bib_df, x="region", order=bib_df["region"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

```{python}
# One row per (region, Intervention, Inequality) combination, with
# parenthesised qualifiers removed and whitespace around ";" normalised.
df_inequality = (
    bib_df[["region", "intervention", "inequality"]]
    .assign(
        Intervention=lambda _df: (
            _df["intervention"]
            .str.replace(r"\(.+\)", "", regex=True)
            .str.replace(r" ?; ?", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
        Inequality=lambda _df: (
            _df["inequality"]
            .str.replace(r"\(.+\)", "", regex=True)
            .str.replace(r" ?; ?", ";", regex=True)
            .str.strip()
            .str.split(";")
        ),
    )
    .explode("Intervention")
    .explode("Inequality")
    .reset_index(drop=True)
)
```

```{python}
def crosstab_inequality(df, inequality: str, **kwargs):
    """Cross-tabulate interventions against `inequality` plus the income baseline.

    Interventions that never target `inequality` are dropped from the table.
    Extra keyword arguments are forwarded to pd.crosstab.
    """
    df_temp = df.loc[(df["Inequality"] == inequality) | (df["Inequality"] == "income")]
    tab = pd.crosstab(df_temp["Intervention"], df_temp["Inequality"], **kwargs)
    return tab.drop(tab[tab[inequality] == 0].index)
```

## Gender inequality

```{python}
#| label: tbl-gender-crosstab
#| tbl-cap: Interventions targeting gender inequality
crosstab_inequality(df_inequality, "gender", normalize=False).sort_values("gender", ascending=False)
```

```{python}
def region_vis_inequality(df, inequality: str):
    """Bar-plot the study count per region for one inequality type."""
    df_temp = df.loc[(df["Inequality"] == inequality)]
    return sns.countplot(df_temp, x="region", order=df_temp["region"].value_counts().index)


region_vis_inequality(df_inequality, "spatial")
```