chore(script): Refactor pandas data ingestion
Load data at top of file, then use chained methods for visualizations.
This commit is contained in:
parent
1ba2daeacd
commit
ca7eab92d3
1 changed files with 48 additions and 25 deletions
|
@ -24,6 +24,7 @@ zotero:
|
||||||
```{python}
|
```{python}
|
||||||
#| echo: false
|
#| echo: false
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import re
|
||||||
## standard imports
|
## standard imports
|
||||||
from IPython.core.display import Markdown as md
|
from IPython.core.display import Markdown as md
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -32,7 +33,6 @@ from matplotlib import pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
import bibtexparser
|
import bibtexparser
|
||||||
from bibtexparser.model import Field
|
|
||||||
|
|
||||||
sns.set_style("whitegrid")
|
sns.set_style("whitegrid")
|
||||||
|
|
||||||
|
@ -487,17 +487,20 @@ Keeping in mind that these results are not yet screened for their full relevance
|
||||||
#| label: fig-publications-per-year
|
#| label: fig-publications-per-year
|
||||||
#| fig-cap: Publications per year
|
#| fig-cap: Publications per year
|
||||||
|
|
||||||
# create dummy category for white or gray lit type (based on 'article' appearing in type)
|
df_study_years = (
|
||||||
bib_df["pubtype"].value_counts()
|
bib_df.groupby(["author", "year", "title"])
|
||||||
bib_df["literature"] = np.where(bib_df["pubtype"].str.contains("article", case=False, regex=False), "white", "gray")
|
.first()
|
||||||
bib_df["literature"] = bib_df["literature"].astype("category")
|
.reset_index()
|
||||||
|
.drop_duplicates()
|
||||||
# plot by year, distinguished by literature type
|
)
|
||||||
ax = sns.countplot(bib_df, x="year")
|
# plot by year TODO decide if we want to distinguish by literature type/region/etc as hue
|
||||||
|
# FIXME should be timeseries plot so no years are missing
|
||||||
|
ax = sns.countplot(df_study_years, x="year")
|
||||||
ax.tick_params(axis='x', rotation=45)
|
ax.tick_params(axis='x', rotation=45)
|
||||||
# ax.set_xlabel("")
|
ax.set_xlabel("")
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.show()
|
plt.show()
|
||||||
|
df_study_years = None
|
||||||
```
|
```
|
||||||
|
|
||||||
Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
|
Anomalies such as the relatively significant dips in output in 2016 and 2012 become especially interesting against the strong later increase of output.
|
||||||
|
@ -543,21 +546,31 @@ Should they point towards gaps (or over-optimization) of specific areas of inter
|
||||||
#| label: fig-intervention-types
|
#| label: fig-intervention-types
|
||||||
#| fig-cap: Predominant type of intervention
|
#| fig-cap: Predominant type of intervention
|
||||||
|
|
||||||
interv_type_df = (
|
by_intervention = (
|
||||||
bib_df["zot_keywords"]
|
bib_df.groupby(["author", "year", "title"])
|
||||||
.str.replace(r"\_", " ")
|
.agg(
|
||||||
.str.extractall(r"type::([\w ]+)")
|
{
|
||||||
.reset_index(drop=True)
|
"intervention": lambda _col: "; ".join(_col),
|
||||||
.rename(columns = {0:"intervention type"})
|
}
|
||||||
)
|
)
|
||||||
|
.reset_index()
|
||||||
|
.drop_duplicates()
|
||||||
|
.assign(
|
||||||
|
intervention=lambda _df: _df["intervention"].apply(
|
||||||
|
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.explode("intervention")
|
||||||
|
)
|
||||||
|
sort_order = by_intervention["intervention"].value_counts().index
|
||||||
|
|
||||||
sort_order = interv_type_df["intervention type"].value_counts(ascending=False).index
|
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
fig.set_size_inches(6, 3)
|
fig.set_size_inches(6, 3)
|
||||||
ax = sns.countplot(interv_type_df, x="intervention type", order=sort_order)
|
ax = sns.countplot(by_intervention, x="intervention", order=by_intervention["intervention"].value_counts().index)
|
||||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||||
rotation_mode="anchor")
|
rotation_mode="anchor")
|
||||||
plt.show()
|
plt.show()
|
||||||
|
by_intervention = None
|
||||||
```
|
```
|
||||||
|
|
||||||
{{++ TODO: describe intervention types with complete dataset ++}}
|
{{++ TODO: describe intervention types with complete dataset ++}}
|
||||||
|
@ -566,21 +579,31 @@ plt.show()
|
||||||
#| label: fig-inequality-types
|
#| label: fig-inequality-types
|
||||||
#| fig-cap: Types of inequality analyzed
|
#| fig-cap: Types of inequality analyzed
|
||||||
|
|
||||||
inequ_type_df = (
|
by_inequality = (
|
||||||
bib_df["zot_keywords"]
|
bib_df.groupby(["author", "year", "title"])
|
||||||
.str.replace(r"\_", " ")
|
.agg(
|
||||||
.str.extractall(r"inequality::([\w ]+)")
|
{
|
||||||
.reset_index(drop=True)
|
"inequality": lambda _col: "; ".join(_col),
|
||||||
.rename(columns = {0:"inequality type"})
|
}
|
||||||
)
|
)
|
||||||
|
.reset_index()
|
||||||
|
.drop_duplicates()
|
||||||
|
.assign(
|
||||||
|
inequality=lambda _df: _df["inequality"].apply(
|
||||||
|
lambda _cell: set([x.strip() for x in _cell.split(";")])
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.explode("inequality")
|
||||||
|
)
|
||||||
|
sort_order = by_inequality["inequality"].value_counts().index
|
||||||
|
|
||||||
sort_order = inequ_type_df["inequality type"].value_counts(ascending=False).index
|
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
fig.set_size_inches(6, 3)
|
fig.set_size_inches(6, 3)
|
||||||
ax = sns.countplot(inequ_type_df, x="inequality type", order=sort_order)
|
ax = sns.countplot(by_inequality, x="inequality", order=by_inequality["inequality"].value_counts().index)
|
||||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||||
rotation_mode="anchor")
|
rotation_mode="anchor")
|
||||||
plt.show()
|
plt.show()
|
||||||
|
by_inequality = None
|
||||||
```
|
```
|
||||||
|
|
||||||
Income inequality is the primary type of inequality interrogated in most of the relevant studies.
|
Income inequality is the primary type of inequality interrogated in most of the relevant studies.
|
||||||
|
|
Loading…
Reference in a new issue