chore(script): Refactor pandas data ingestion

Load data at top of file, then use chained methods for
visualizations.
This commit is contained in:
Marty Oehme 2023-12-10 18:00:27 +01:00
parent 1ba2daeacd
commit ca7eab92d3
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@@ -24,6 +24,7 @@ zotero:
```{python} ```{python}
#| echo: false #| echo: false
from pathlib import Path from pathlib import Path
import re
## standard imports ## standard imports
from IPython.core.display import Markdown as md from IPython.core.display import Markdown as md
import numpy as np import numpy as np
@@ -32,7 +33,6 @@ from matplotlib import pyplot as plt
import seaborn as sns import seaborn as sns
from tabulate import tabulate from tabulate import tabulate
import bibtexparser import bibtexparser
from bibtexparser.model import Field
sns.set_style("whitegrid") sns.set_style("whitegrid")
@@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance
#| label: fig-publications-per-year #| label: fig-publications-per-year
#| fig-cap: Publications per year #| fig-cap: Publications per year
# create dummy category for white or gray lit type (based on 'article' appearing in type) df_study_years = (
bib_df["pubtype"].value_counts() bib_df.groupby(["author", "year", "title"])
bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray") .first()
bib_df["literature"] = bib_df["literature"].astype("category") .reset_index()
.drop_duplicates()
# plot by year, distinguished by literature type )
ax = sns.countplot(bib_df, x="year") # plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year")
ax.tick_params(axis='x', rotation=45) ax.tick_params(axis='x', rotation=45)
# ax.set_xlabel("") ax.set_xlabel("")
plt.tight_layout() plt.tight_layout()
plt.show() plt.show()
df_study_years = None
``` ```
Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output. Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
@@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
#| label: fig-intervention-types #| label: fig-intervention-types
#| fig-cap: Predominant type of intervention #| fig-cap: Predominant type of intervention
interv_type_df = ( by_intervention = (
bib_df["zot_keywords"] bib_df.groupby(["author", "year", "title"])
.str.replace(r"\_", " ") .agg(
.str.extractall(r"type::([\w ]+)") {
.reset_index(drop=True) "intervention": lambda _col: "; ".join(_col),
.rename(columns = {0:"intervention type"}) }
) )
.reset_index()
.drop_duplicates()
.assign(
intervention=lambda _df: _df["intervention"].apply(
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
),
)
.explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index
sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
fig = plt.figure() fig = plt.figure()
fig.set_size_inches(6, 3) fig.set_size_inches(6, 3)
ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order) ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor") rotation_mode="anchor")
plt.show() plt.show()
by_intervention = None
``` ```
{{++ TODO: describe intervention types with complete dataset ++}} {{++ TODO: describe intervention types with complete dataset ++}}
@@ -566,21 +579,31 @@ plt.show()
#| label: fig-inequality-types #| label: fig-inequality-types
#| fig-cap: Types of inequality analyzed #| fig-cap: Types of inequality analyzed
inequ_type_df = ( by_inequality = (
bib_df["zot_keywords"] bib_df.groupby(["author", "year", "title"])
.str.replace(r"\_", " ") .agg(
.str.extractall(r"inequality::([\w ]+)") {
.reset_index(drop=True) "inequality": lambda _col: "; ".join(_col),
.rename(columns = {0:"inequality type"}) }
) )
.reset_index()
.drop_duplicates()
.assign(
inequality=lambda _df: _df["inequality"].apply(
lambda _cell: set([x.strip() for x in _cell.split(";")])
),
)
.explode("inequality")
)
sort_order = by_inequality["inequality"].value_counts().index
sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
fig = plt.figure() fig = plt.figure()
fig.set_size_inches(6, 3) fig.set_size_inches(6, 3)
ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order) ax = sns.countplot(by_inequality, x="inequality", order=by_inequality["inequality"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor") rotation_mode="anchor")
plt.show() plt.show()
by_inequality = None
``` ```
Income inequality is the primary type of inequality interrogated in most of the relevant studies. Income inequality is the primary type of inequality interrogated in most of the relevant studies.