From af2df5736c84cb2b6f51c1c74f4dcd01c2ece974 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sun, 10 Dec 2023 18:00:27 +0100 Subject: [PATCH] chore(script): Refactor pandas data ingestion Load data at top of file, then use chained methods for visualizations. --- scoping_review.qmd | 73 ++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/scoping_review.qmd b/scoping_review.qmd index 7e94f94..fa8b6a8 100644 --- a/scoping_review.qmd +++ b/scoping_review.qmd @@ -24,6 +24,7 @@ zotero: ```{python} #| echo: false from pathlib import Path +import re ## standard imports from IPython.core.display import Markdown as md import numpy as np @@ -32,7 +33,6 @@ from matplotlib import pyplot as plt import seaborn as sns from tabulate import tabulate import bibtexparser -from bibtexparser.model import Field sns.set_style("whitegrid") @@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance #| label: fig-publications-per-year #| fig-cap: Publications per year -# create dummy category for white or gray lit type (based on 'article' appearing in type) -bib_df["pubtype"].value_counts() -bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray") -bib_df["literature"] = bib_df["literature"].astype("category") - -# plot by year, distinguished by literature type -ax = sns.countplot(bib_df, x="year") +df_study_years = ( + bib_df.groupby(["author", "year", "title"]) + .first() + .reset_index() + .drop_duplicates() +) +# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue +# FIXME should be timeseries plot so no years are missing +ax = sns.countplot(df_study_years, x="year") ax.tick_params(axis='x', rotation=45) -# ax.set_xlabel("") +ax.set_xlabel("") plt.tight_layout() plt.show() +df_study_years = None ``` Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output. @@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter #| label: fig-intervention-types #| fig-cap: Predominant type of intervention -interv_type_df = ( - bib_df["zot_keywords"] - .str.replace(r"\_", " ") - .str.extractall(r"type::([\w ]+)") - .reset_index(drop=True) - .rename(columns = {0:"intervention type"}) +by_intervention = ( + bib_df.groupby(["author", "year", "title"]) + .agg( + { + "intervention": lambda _col: "; ".join(_col), + } + ) + .reset_index() + .drop_duplicates() + .assign( + intervention=lambda _df: _df["intervention"].apply( + lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")]) + ), + ) + .explode("intervention") ) +sort_order = by_intervention["intervention"].value_counts().index -sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index fig = plt.figure() fig.set_size_inches(6, 3) -ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order) +ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") plt.show() +by_intervention = None ``` {{++ TODO: describe intervention types with complete dataset ++}} @@ -566,21 +579,31 @@ plt.show() #| label: fig-inequality-types #| fig-cap: Types of inequality analyzed -inequ_type_df = ( - bib_df["zot_keywords"] - .str.replace(r"\_", " ") - .str.extractall(r"inequality::([\w ]+)") - .reset_index(drop=True) - .rename(columns = {0:"inequality type"}) +by_inequality = ( + bib_df.groupby(["author", "year", "title"]) + .agg( + { + "inequality": lambda _col: "; ".join(_col), + } + ) + .reset_index() + .drop_duplicates() + .assign( + inequality=lambda _df: _df["inequality"].apply( + lambda _cell: set([x.strip() for x in _cell.split(";")]) + ), + ) + .explode("inequality") ) +sort_order = by_inequality["inequality"].value_counts().index -sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index fig = plt.figure() fig.set_size_inches(6, 3) -ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order) +ax = sns.countplot(by_inequality, x="inequality", order=by_inequality["inequality"].value_counts().index) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") plt.show() +by_inequality = None ``` Income inequality is the primary type of inequality interrogated in most of the relevant studies.