From af2df5736c84cb2b6f51c1c74f4dcd01c2ece974 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Sun, 10 Dec 2023 18:00:27 +0100
Subject: [PATCH] chore(script): Refactor pandas data ingestion

Load data at top of file, then use chained methods for
visualizations.
---
 scoping_review.qmd | 73 ++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/scoping_review.qmd b/scoping_review.qmd
index 7e94f94..fa8b6a8 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -24,6 +24,7 @@ zotero:
 ```{python}
 #| echo: false
 from pathlib import Path
+import re
 ## standard imports
 from IPython.core.display import Markdown as md
 import numpy as np
@@ -32,7 +33,6 @@ from matplotlib import pyplot as plt
 import seaborn as sns
 from tabulate import tabulate
 import bibtexparser
-from bibtexparser.model import Field
 
 sns.set_style("whitegrid")
 
@@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance
 #| label: fig-publications-per-year
 #| fig-cap: Publications per year
 
-# create dummy category for white or gray lit type (based on 'article' appearing in type)
-bib_df["pubtype"].value_counts()
-bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
-bib_df["literature"] = bib_df["literature"].astype("category")
-
-# plot by year, distinguished by literature type
-ax = sns.countplot(bib_df, x="year")
+df_study_years = (
+    bib_df.groupby(["author", "year", "title"])  # one group per (author, year, title) combination
+    .first()  # collapse each group to a single row
+    .reset_index()  # group keys become regular columns again
+    .drop_duplicates()
+)
+# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
+# FIXME should be timeseries plot so no years are missing
+ax = sns.countplot(df_study_years, x="year")
 ax.tick_params(axis='x', rotation=45)
-# ax.set_xlabel("")
+ax.set_xlabel("")  # blank out the x-axis label
 plt.tight_layout()
 plt.show()
+df_study_years = None  # drop the reference to the intermediate frame once it is plotted
 ```
 
 Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
@@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of sepcific areas of inter
 #| label: fig-intervention-types
 #| fig-cap: Predominant type of intervention
 
-interv_type_df = (
-    bib_df["zot_keywords"]
-    .str.replace(r"\_", " ")
-    .str.extractall(r"type::([\w ]+)")
-    .reset_index(drop=True)
-    .rename(columns = {0:"intervention type"})
+by_intervention = (
+    bib_df.groupby(["author", "year", "title"])  # one group per study
+    .agg(
+        {
+            "intervention": lambda _col: "; ".join(_col),  # merge all rows of a study into one ";"-separated string
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(  # strip parenthesised notes, then split into a de-duplicated set per study
+        intervention=lambda _df: _df["intervention"].apply(  # "[^)]*" keeps each "(...)" match local; a greedy ".*" would also swallow every entry between the first "(" and the last ")"
+            lambda _cell: {x.strip() for x in re.sub(r"\([^)]*\)", "", _cell).split(";")}
+        ),
+    )
+    .explode("intervention")  # one row per (study, intervention) pair
+)
+sort_order = by_intervention["intervention"].value_counts().index  # categories ordered by frequency
 
-sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
+ax = sns.countplot(by_intervention, x="intervention", order=sort_order)  # reuse the frequency order computed right after the pipeline
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+by_intervention = None
 ```
 
 {{++ TODO: describe intervention types with complete dataset ++}}
@@ -566,21 +579,31 @@ plt.show()
 #| label: fig-inequality-types
 #| fig-cap: Types of inequality analyzed
 
-inequ_type_df = (
-    bib_df["zot_keywords"]
-    .str.replace(r"\_", " ")
-    .str.extractall(r"inequality::([\w ]+)")
-    .reset_index(drop=True)
-    .rename(columns = {0:"inequality type"})
+by_inequality = (
+    bib_df.groupby(["author", "year", "title"])  # one group per study
+    .agg(
+        {
+            "inequality": lambda _rows: "; ".join(_rows),  # merge all rows of a study into one ";"-separated string
+        }
+    )
+    .reset_index()
+    .drop_duplicates()
+    .assign(  # split the merged string back into a de-duplicated set per study
+        inequality=lambda _frame: _frame["inequality"].apply(
+            lambda _joined: set(map(str.strip, _joined.split(";")))
+        ),
+    )
+    .explode("inequality")  # one row per (study, inequality) pair
+)
+sort_order = by_inequality["inequality"].value_counts().index  # categories ordered by frequency
 
-sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
+ax = sns.countplot(by_inequality, x="inequality", order=sort_order)  # reuse the frequency order computed right after the pipeline
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")
 plt.show()
+by_inequality = None
 ```
 
 Income inequality is the primary type of inequality interrogated in most of the relevant studies.