Marty Oehme
4f9acd0816
Begin restructuring data dir by separating out references into their own data sub-dir containing only references and bibtex files.
355 lines
13 KiB
Text
355 lines
13 KiB
Text
---
|
|
bibliography: ../data/intermediate/zotero-library.bib
|
|
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
|
|
papersize: A4
|
|
linestretch: 1.5
|
|
fontfamily: lmodern
|
|
fontsize: "12"
|
|
geometry:
|
|
- left=2.2cm
|
|
- right=3.5cm
|
|
- top=2.5cm
|
|
- bottom=2.5cm
|
|
lang: en
|
|
title: "Scoping Review: Preliminary findings"
|
|
subtitle: Addressing inequalities in the World of Work
|
|
---
|
|
|
|
```{python}
|
|
#| echo: false
|
|
from pathlib import Path
|
|
import re
|
|
## standard imports
|
|
from IPython.core.display import Markdown as md
|
|
import numpy as np
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import seaborn as sns
|
|
from tabulate import tabulate
|
|
import bibtexparser
|
|
|
|
sns.set_style("whitegrid")
|
|
|
|
from src import globals as g
|
|
|
|
from src.process.generate_dataframes import bib_sample_raw_db, bib_sample
|
|
|
|
# load relevant studies
|
|
from src.extract import load_data
|
|
|
|
# load zotero-based metadata: citations and uses
|
|
# Zotero-based metadata (citations and uses) for every library entry, keyed by DOI.
# A field that is absent from an entry is stored as None.
zot_df = pd.DataFrame(
    [
        [
            entry[field] if field in entry.fields_dict else None
            for field in ("doi", "times-cited", "usage", "keywords")
        ]
        for entry in bib_sample.entries
    ],
    columns=["doi", "cited", "usage", "keywords"],
)
zot_df = zot_df.drop_duplicates("doi").set_index("doi")

# World Bank country grouping definitions (income group, world region), keyed by economy name.
WB_COUNTRY_GROUPS_FILE = (Path(g.SUPPLEMENTARY_DATA) / "wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# Relevant studies, enriched step by step. Later steps read columns created by
# earlier ones (doi -> zot_*, date -> year), so the order of the stages matters.
bib_df = load_data.from_yml(f"{g.PROCESSED_DATA}/relevant")
bib_df = bib_df.assign(
    doi=bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
)
bib_df = bib_df.assign(
    zot_cited=bib_df["doi"].map(zot_df["cited"]),
    zot_usage=bib_df["doi"].map(zot_df["usage"]),
    zot_keywords=bib_df["doi"].map(zot_df["keywords"]),
    date=pd.to_datetime(bib_df["year"], format="%Y"),
)
bib_df = bib_df.assign(
    year=bib_df["date"].dt.year,
    region=bib_df["country"].map(df_country_groups["Region"]),
    income_group=bib_df["country"].map(df_country_groups["Income group"]),
)
bib_df = bib_df.query("year >= 2000")

# Release the lookup tables; only bib_df is needed from here on.
zot_df = None
df_country_groups = None
|
|
```
|
|
|
|
# The data sample
|
|
|
|
```{python}
|
|
#| echo: false
|
|
#| output: asis
|
|
|
|
# Size of the full raw sample before anything was removed (hard-coded).
FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - len(bib_sample.entries)
nr_other_sources = (len(bib_sample.entries) + nr_out_duplicates) - nr_database_query_raw

# Keyword strings of all entries that have a keywords field.
all_keywords = [
    entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict
]


def _count_tagged(tag):
    """Return the number of keyword strings that contain ``tag``."""
    return sum(1 for keywords in all_keywords if tag in keywords)


# NOTE(review): the constant offsets (+400/+400/+300) are hard-coded here;
# presumably manual corrections until the FIXME below is resolved -- confirm.
nr_out_title = _count_tagged("out::title") + 400
nr_out_abstract = _count_tagged("out::abstract") + 400
nr_out_fulltext = _count_tagged("out::full-text") + 300
nr_out_language = _count_tagged("out::language")
nr_extraction_done = _count_tagged("done::extracted")

# Closing code fence for the printed mermaid block; built at runtime so it does
# not terminate the surrounding chunk.
t3 = "`" * 3
# FIXME use data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

starting_sample -- "Duplicate removal ({nr_out_duplicates} removed) "--> dedup["Records after duplicates removed (n={len(bib_sample.entries)})"];

dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={len(bib_sample.entries) - nr_out_title})"];

title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract})"];

abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language})"];

language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done}) STILL OUTSTANDING: {len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language - nr_extraction_done}"];
{t3}
""")
|
|
```
|
|
|
|
- strongest focus on income inequality (vertical), with many horizontal inequality studies including aspects of income inequality
|
|
- horizontal inequalities: strongest focus on income - gender inequalities (horizontal)
|
|
- interventions:
|
|
- strongest research base on labour rights protection interventions
|
|
- second on infrastructural interventions
|
|
- third on agency-strengthening ones: training, financial access, education programmes
|
|
|
|
- formalization & social protection research rarely goes into inequality outcomes beyond 'income' effects; most excluded for that reason
|
|
|
|
```{python}
|
|
#| echo: false
|
|
#| label: fig-inequality-types-whole-sample
|
|
#| fig-cap: Overall inequality types in sample
|
|
|
|
# Inequality types tagged in the Zotero library: one row per library entry whose
# keywords carry an `inequality::<type>` tag, marked as projected=1.
pi = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in ("doi", "times-cited", "usage", "keywords")
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Strip backslashes from the keyword string, then take the first
        # inequality tag. \w already includes the underscore.
        inequality=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"inequality::(\w+)", expand=False)
    )
    .dropna(subset=["inequality"])
    .assign(
        inequality=lambda _df: _df["inequality"].str.replace("_", " ", regex=False),
        projected=1,
    )
    .reset_index()
)

# Extracted studies (projected=0, one row per author/year/title with their
# inequality labels joined by "; ") stacked on top of the tagged library entries,
# then split into one row per distinct label.
inequality = (
    pd.concat(
        [
            bib_df.groupby(["author", "year", "title"])
            .agg({"inequality": lambda _col: "; ".join(_col)})
            .assign(projected=0)
            .reset_index()
            .drop_duplicates(),
            pi,
        ]
    )
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            # [^)]* stops at the first closing parenthesis, so labels standing
            # between two parenthesised remarks are kept. dict.fromkeys
            # de-duplicates while preserving order, which keeps the row order
            # (and the order of tied bars) reproducible between runs.
            lambda _cell: list(
                dict.fromkeys(
                    part.strip()
                    for part in re.sub(r"\([^)]*\)", "", _cell).split(";")
                )
            )
        )
    )
    .explode("inequality")
    .drop_duplicates()
)

# Not used by the plot below, which orders bars by the filtered counts; kept so
# the variable stays defined for any code that reads it.
sort_order = inequality["inequality"].value_counts().index
# Drop labels mentioning structural/institutional/agency. na=True makes missing
# labels count as matches so that they are dropped as well.
i = inequality[
    ~inequality["inequality"].str.contains(
        r"(?:structural|institutional|agency)", na=True
    )
]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(
    data=i,
    x="inequality",
    hue="projected",
    order=i["inequality"].value_counts().index,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
```
|
|
|
|
# Preliminary findings
|
|
|
|
```{python}
|
|
#| echo: false
|
|
#| label: fig-inequality-types
|
|
#| fig-cap: Finished and projected inequality types
|
|
# Extracted studies (projected=0) stacked on top of those tagged library entries
# in `pi` whose keywords contain "relevant" (projected=1), then split into one
# row per distinct inequality label. `pi` is the frame built by the preceding chunk.
inequality = (
    pd.concat(
        [
            bib_df.groupby(["author", "year", "title"])
            .agg({"inequality": lambda _col: "; ".join(_col)})
            .assign(projected=0)
            .reset_index()
            .drop_duplicates(),
            # na=False: entries without keywords are not considered relevant.
            pi[pi["keywords"].str.contains("relevant", na=False)],
        ]
    )
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            # [^)]* stops at the first closing parenthesis, so labels standing
            # between two parenthesised remarks are kept. dict.fromkeys
            # de-duplicates while preserving order, which keeps the row order
            # (and the order of tied bars) reproducible between runs.
            lambda _cell: list(
                dict.fromkeys(
                    part.strip()
                    for part in re.sub(r"\([^)]*\)", "", _cell).split(";")
                )
            )
        )
    )
    .explode("inequality")
    .drop_duplicates()
)

# Not used by the plot below, which orders bars by the filtered counts; kept so
# the variable stays defined for any code that reads it.
sort_order = inequality["inequality"].value_counts().index
# Drop labels mentioning structural/institutional/agency. na=True makes missing
# labels count as matches so that they are dropped as well.
i = inequality[
    ~inequality["inequality"].str.contains(
        r"(?:structural|institutional|agency)", na=True
    )
]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(
    data=i,
    x="inequality",
    hue="projected",
    order=i["inequality"].value_counts().index,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
```
|
|
|
|
- interventions most strongly target gender-income divide
|
|
- most studies here recommend further scale-integration between agency/structural approaches
|
|
- most studies also only focus on analysing a single scale however
|
|
- interventions often have intersectional impacts even if not targeted at them
|
|
- most visible for institutional/structural interventions and spatial inequalities
|
|
- studies analysing intersectional inequalities near unanimously recommend intersectional targeting
|
|
|
|
- individual agency-based interventions (training, subsidies, maternity benefits, transfers, microcredit, etc):
|
|
- seem most effective for targeting WoW outcomes of disability inequalities
|
|
- seem marginally effective for targeting WoW outcomes of gender inequalities
|
|
- require additional mediating scales for other inequalities
|
|
- more structural interventions (education, infrastructural, ubi, trade liberalization, collective action):
|
|
- seem most effective for spatial, income, education-generational inequalities
|
|
- often show longer-term impacts, requiring longer periods of analyses
|
|
- can work without additional agency-based interventions; few studies analyse both at the same time
|
|
|
|
# Preliminary limitations
|
|
|
|
```{python}
|
|
#| echo: false
|
|
#| label: fig-intervention-types
|
|
#| fig-cap: Finished and projected intervention types
|
|
|
|
# Intervention types tagged in the Zotero library: one row per library entry
# whose keywords carry a `type::<intervention>` tag, marked as projected=1.
pi = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in ("doi", "times-cited", "usage", "keywords")
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Strip backslashes from the keyword string, then take the first
        # type tag. \w already includes the underscore.
        intervention=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"type::(\w+)", expand=False)
    )
    .dropna(subset=["intervention"])
    .assign(
        intervention=lambda _df: _df["intervention"].str.replace("_", " ", regex=False),
        projected=1,
    )
    .reset_index()
)

# Extracted studies (projected=0) stacked on top of the tagged entries whose
# keywords contain "relevant", then split into one row per distinct intervention label.
by_intervention = (
    pd.concat(
        [
            bib_df.groupby(["author", "year", "title"])
            .agg({"intervention": lambda _col: "; ".join(_col)})
            .assign(projected=0)
            .reset_index()
            .drop_duplicates(),
            # na=False: entries without keywords are not considered relevant.
            pi[pi["keywords"].str.contains("relevant", na=False)],
        ]
    )
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            # [^)]* stops at the first closing parenthesis, so labels standing
            # between two parenthesised remarks are kept. dict.fromkeys
            # de-duplicates while preserving order, which keeps the row order
            # (and the order of tied bars) reproducible between runs.
            lambda _cell: list(
                dict.fromkeys(
                    part.strip()
                    for part in re.sub(r"\([^)]*\)", "", _cell).split(";")
                )
            )
        )
    )
    .explode("intervention")
    .drop_duplicates()
)

# Not used by the plot below, which orders bars by the filtered counts; kept so
# the variable stays defined for any code that reads it.
sort_order = by_intervention["intervention"].value_counts().index
# Drop labels mentioning structural/institutional/agency. na=True makes missing
# labels count as matches so that they are dropped as well.
i = by_intervention[
    ~by_intervention["intervention"].str.contains(
        r"(?:structural|institutional|agency)", na=True
    )
]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(
    data=i,
    x="intervention",
    hue="projected",
    order=i["intervention"].value_counts().index,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
```
|
|
|
|
- stronger institutional-structural research focus in developed countries, with more structural-agency based in developing countries
|
|
- employment creation as a category is often subsumed in other structural/institutional analyses
|
|
- little evidence-based research on effect of interventions targeting education on world of work outcomes
|
|
- spatial inequality has the most evenly geographically spread evidence base
|
|
- empirical base on interventions targeting disability inequalities is strongly restricted to developed countries, especially the United States
|
|
|
|
```{python}
|
|
#| echo: false
|
|
#| label: fig-countries
|
|
#| fig-cap: Country spread
|
|
#| column: screen
|
|
|
|
# Countries tagged in the Zotero library: one row per library entry whose
# keywords carry a `country::<name>` tag, marked as projected=1.
pi = (
    pd.DataFrame(
        [
            [
                entry[field] if field in entry.fields_dict else None
                for field in ("doi", "times-cited", "usage", "keywords")
            ]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Strip backslashes from the keyword string, then take the first
        # country tag. \w already includes the underscore.
        country=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"country::(\w+)", expand=False)
    )
    .dropna(subset=["country"])
    .assign(
        # Expand the short tags by whole-value match only, so that names which
        # merely contain "US" or "Britain" are left untouched.
        country=lambda _df: _df["country"]
        .str.replace("_", " ", regex=False)
        .replace({"US": "United States", "Britain": "United Kingdom"}),
        projected=1,
    )
    .reset_index()
)

# Extracted studies (projected=0) stacked on top of the tagged entries whose
# keywords contain "relevant", then split into one row per distinct country.
by_country = (
    pd.concat(
        [
            bib_df.groupby(["author", "year", "title"])
            .agg({"country": lambda _col: "; ".join(_col)})
            .assign(projected=0)
            .reset_index()
            .drop_duplicates(),
            # na=False: entries without keywords are not considered relevant.
            pi[pi["keywords"].str.contains("relevant", na=False)],
        ]
    )
    .assign(
        country=lambda _df: _df["country"].apply(
            # [^)]* stops at the first closing parenthesis, so countries standing
            # between two parenthesised remarks are kept. dict.fromkeys
            # de-duplicates while preserving order, which keeps the row order
            # (and the order of tied bars) reproducible between runs.
            lambda _cell: list(
                dict.fromkeys(
                    part.strip()
                    for part in re.sub(r"\([^)]*\)", "", _cell).split(";")
                )
            )
        )
    )
    .explode("country")
    .drop_duplicates()
)

# Not used by the plot below, which orders bars by the filtered counts; kept so
# the variable stays defined for any code that reads it.
sort_order = by_country["country"].value_counts().index
# NOTE(review): this structural/institutional/agency filter mirrors the sibling
# figure chunks and presumably matches no country name -- confirm it is needed.
# na=True makes missing values count as matches so that they are dropped as well.
i = by_country[
    ~by_country["country"].str.contains(
        r"(?:structural|institutional|agency)", na=True
    )
]
fig = plt.figure()
fig.set_size_inches(12, 5)
ax = sns.countplot(
    data=i,
    x="country",
    hue="projected",
    order=i["country"].value_counts().index,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
|
|
```
|