chore(script): Refactor pandas data ingestion

Load data at top of file, then use chained methods for visualizations.
2023-12-10 18:00:27 +01:00 · 2023-12-10 18:00:27 +01:00 · ca7eab92d3
commit ca7eab92d3
parent 1ba2daeacd
1 changed files with 48 additions and 25 deletions
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@ -24,6 +24,7 @@ zotero:
 ```{python}
 #| echo: false
 from pathlib import Path
 import re
 ## standard imports
 from IPython.core.display import Markdown as md
 import numpy as np
@ -32,7 +33,6 @@ from matplotlib import pyplot as plt
 import seaborn as sns
 from tabulate import tabulate
 import bibtexparser
 from bibtexparser.model import Field
 sns.set_style("whitegrid")
@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance
 #| label: fig-publications-per-year
 #| fig-cap: Publications per year
-# create dummy category for white or gray lit type (based on 'article' appearing in type)
+df_study_years = (
-bib_df["pubtype"].value_counts()
+    bib_df.groupby(["author", "year", "title"])
-bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
+    .first()
-bib_df["literature"] = bib_df["literature"].astype("category")
+    .reset_index()
-
+    .drop_duplicates()
-# plot by year, distinguished by literature type
+)
-ax = sns.countplot(bib_df, x="year")
+# plot by year. TODO: decide if we want to distinguish by literature type/region/etc. as hue
 # FIXME should be timeseries plot so no years are missing
 ax = sns.countplot(df_study_years, x="year")
 ax.tick_params(axis='x', rotation=45)
-# ax.set_xlabel("")
+ax.set_xlabel("")
 plt.tight_layout()
 plt.show()
 df_study_years = None
 ```
 Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
 #| label: fig-intervention-types
 #| fig-cap: Predominant type of intervention
-interv_type_df = (
+by_intervention = (
-    bib_df["zot_keywords"]
+    bib_df.groupby(["author", "year", "title"])
-    .str.replace(r"\_", " ")
+    .agg(
-    .str.extractall(r"type::([\w ]+)")
+        {
-    .reset_index(drop=True)
+            "intervention": lambda _col: "; ".join(_col),
-    .rename(columns = {0:"intervention type"})
+        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
        ),
    )
    .explode("intervention")
 )
 sort_order = by_intervention["intervention"].value_counts().index
 sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
+ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
 plt.show()
 by_intervention = None
 ```
 {{++ TODO: describe intervention types with complete dataset ++}}
@ -566,21 +579,31 @@ plt.show()
 #| label: fig-inequality-types
 #| fig-cap: Types of inequality analyzed
-inequ_type_df = (
+by_inequality = (
-    bib_df["zot_keywords"]
+    bib_df.groupby(["author", "year", "title"])
-    .str.replace(r"\_", " ")
+    .agg(
-    .str.extractall(r"inequality::([\w ]+)")
+        {
-    .reset_index(drop=True)
+            "inequality": lambda _col: "; ".join(_col),
-    .rename(columns = {0:"inequality type"})
+        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set([x.strip() for x in _cell.split(";")])
        ),
    )
    .explode("inequality")
 )
 sort_order = by_inequality["inequality"].value_counts().index
 sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
 fig = plt.figure()
 fig.set_size_inches(6, 3)
-ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
+ax = sns.countplot(by_inequality, x="inequality", order=by_inequality["inequality"].value_counts().index)
 plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
 plt.show()
 by_inequality = None
 ```
 Income inequality is the primary type of inequality interrogated in most of the relevant studies.