chore(script): Refactor pandas data ingestion

Load data at top of file, then use chained methods for
visualizations.
This commit is contained in:
Marty Oehme 2023-12-10 18:00:27 +01:00
parent 0e29a3332c
commit af2df5736c
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -24,6 +24,7 @@ zotero:
```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
@ -32,7 +33,6 @@ from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
from bibtexparser.model import Field
sns.set_style("whitegrid")
@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance
#| label: fig-publications-per-year
#| fig-cap: Publications per year
# create dummy category for white or gray lit type (based on 'article' appearing in type)
bib_df["pubtype"].value_counts()
bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
bib_df["literature"] = bib_df["literature"].astype("category")
# plot by year, distinguished by literature type
ax = sns.countplot(bib_df, x="year")
# Reduce the bibliography to one row per (author, year, title) combination,
# keeping the first entry of every remaining column for that combination.
df_study_years = bib_df.groupby(["author", "year", "title"]).first()
# Turn the group keys back into ordinary columns so "year" can be plotted.
df_study_years = df_study_years.reset_index().drop_duplicates()
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
# FIXME should be timeseries plot so no years are missing
ax = sns.countplot(df_study_years, x="year")
ax.tick_params(axis='x', rotation=45)
# ax.set_xlabel("")
ax.set_xlabel("")
plt.tight_layout()
plt.show()
df_study_years = None
```
Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
#| label: fig-intervention-types
#| fig-cap: Predominant type of intervention
interv_type_df = (
bib_df["zot_keywords"]
.str.replace(r"\_", " ")
.str.extractall(r"type::([\w ]+)")
.reset_index(drop=True)
.rename(columns = {0:"intervention type"})
# Build one row per (study, intervention) pair: rows sharing author, year and
# title are merged, their ';'-separated intervention labels are pooled, and the
# pooled labels are exploded into separate rows for counting.
by_intervention = (
    bib_df.groupby(["author", "year", "title"])
    .agg(
        {
            # pool the labels of all rows that belong to the same study
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # Strip each parenthesised remark individually: `[^)]*` stops at the
            # first ')', whereas a greedy `.*` would span from the first '(' to
            # the last ')' of the joined string and silently delete every label
            # in between ("a (x); b (y)" would lose "b").
            # The set keeps each label at most once per study; blank fragments
            # (e.g. left by a trailing ';') are not treated as a label.
            lambda _cell: {
                _label.strip()
                for _label in re.sub(r"\([^)]*\)", "", _cell).split(";")
                if _label.strip()
            }
        ),
    )
    .explode("intervention")
)
sort_order = by_intervention["intervention"].value_counts().index
sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
by_intervention = None
```
{{++ TODO: describe intervention types with complete dataset ++}}
@ -566,21 +579,31 @@ plt.show()
#| label: fig-inequality-types
#| fig-cap: Types of inequality analyzed
inequ_type_df = (
bib_df["zot_keywords"]
.str.replace(r"\_", " ")
.str.extractall(r"inequality::([\w ]+)")
.reset_index(drop=True)
.rename(columns = {0:"inequality type"})
# Build one row per (study, inequality) pair: rows sharing author, year and
# title are merged, their ';'-separated inequality labels are pooled, and the
# pooled labels are exploded into separate rows for counting.
by_inequality = (
    bib_df.groupby(["author", "year", "title"])
    .agg(
        {
            # pool the labels of all rows that belong to the same study
            "inequality": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            # The set keeps each label at most once per study; blank fragments
            # (empty cell, trailing ';') are skipped so they do not show up as
            # an unnamed category in the plot.
            lambda _cell: {
                _label.strip() for _label in _cell.split(";") if _label.strip()
            }
        ),
    )
    .explode("inequality")
)
sort_order = by_inequality["inequality"].value_counts().index
sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
ax = sns.countplot(by_inequality, x="inequality", order=by_inequality["inequality"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
plt.show()
by_inequality = None
```
Income inequality is the primary type of inequality interrogated in most of the relevant studies.