---
bibliography: 02-data/intermediate/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: 12pt
geometry:
- left=2.2cm
- right=3.5cm
- top=2.5cm
- bottom=2.5cm
lang: en
title: "Scoping Review: Preliminary findings"
subtitle: Addressing inequalities in the World of Work
---

```{python}
#| echo: false
from pathlib import Path
import re

## standard imports
from IPython.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

# concatenate all raw bibliography files into a single database
bib_string = ""
for partial_bib in RAW_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample_raw_db = bibtexparser.parse_string(bib_string)

# concatenate all intermediate (tagged and screened) bibliography files
bib_string = ""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
    with open(partial_bib) as f:
        bib_string += f.read() + "\n"
bib_sample = bibtexparser.parse_string(bib_string)

# load relevant studies
from src import data

# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")

# add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)

# release the temporary lookup frames
zot_df = None
df_country_groups = None
```
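
The same read-and-concatenate loop runs twice above, once per library directory. A small helper could keep both in sync; this is only a sketch, and `read_bib_folder` is a hypothetical name rather than part of the project's `src` module:

```python
from pathlib import Path

import bibtexparser

def read_bib_folder(folder: Path):
    """Parse every .bib file below `folder` into a single bibtexparser library."""
    bib_string = ""
    for partial_bib in folder.glob("**/*.bib"):
        # read_text() keeps each file's own newlines; add one separator between files
        bib_string += partial_bib.read_text() + "\n"
    return bibtexparser.parse_string(bib_string)

# hypothetical usage, replacing the two loops above:
# bib_sample_raw_db = read_bib_folder(RAW_DATA)
# bib_sample = read_bib_folder(WORKING_DATA)
```
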

# The data sample

```{python}
#| echo: false
#| output: asis

FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396

nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - len(bib_sample.entries)
nr_other_sources = (len(bib_sample.entries) + nr_out_duplicates) - nr_database_query_raw

all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict]
# manual offsets cover records screened out before keyword tagging began (see FIXME below)
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + 400
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + 400
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + 300
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])

t3 = "`" * 3
# FIXME use 02-data/supplementary un-deduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
{t3}{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

starting_sample -- "Duplicate removal ({nr_out_duplicates} removed)" --> dedup["Records after duplicates removed (n={len(bib_sample.entries)})"];

dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={len(bib_sample.entries) - nr_out_title})"];

title_screened -- "Abstract screening ({nr_out_abstract} excluded)" --> abstract_screened["Records after abstracts screened (n={len(bib_sample.entries) - nr_out_title - nr_out_abstract})"];

abstract_screened -- "Language screening ({nr_out_language} excluded)" --> language_screened["Records after language screened (n={len(bib_sample.entries) - nr_out_title - nr_out_abstract - nr_out_language})"];

language_screened -- "Full-text screening ({nr_out_fulltext} excluded)" --> fulltext_screened["Full-text articles assessed for eligibility (n={nr_extraction_done}) STILL OUTSTANDING: {len(bib_sample.entries) - nr_out_title - nr_out_abstract - nr_out_language - nr_extraction_done}"];
{t3}
""")
```
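
The screening tallies above all run the same substring test over the keyword lists. A tiny helper would make the PRISMA arithmetic easier to audit; a sketch only, with `count_tag` as a hypothetical name (the manual offsets would still be added explicitly):

```python
def count_tag(keywords: list[str], tag: str) -> int:
    """Count keyword strings that contain the given screening tag."""
    return sum(1 for kw in keywords if tag in kw)

# hypothetical usage, mirroring the counts above:
# nr_out_title = count_tag(all_keywords, "out::title") + 400
# nr_out_language = count_tag(all_keywords, "out::language")
```
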

- strongest focus on income inequality (vertical), with many horizontal inequality studies also including aspects of income inequality
- horizontal inequalities: strongest focus on gender-income inequalities
- interventions:
  - strongest research base on labour rights protection interventions
  - second on infrastructural interventions
  - third on agency-strengthening ones: training, financial access, education programmes

- formalization & social protection research rarely goes into inequality outcomes beyond 'income' effects; most such studies were excluded for that reason

```{python}
#| echo: false
#| label: fig-inequality-types-whole-sample
#| fig-cap: Overall inequality types in sample

# extract the inequality-type keywords from the zotero library entries
pi = (pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"])
    .drop_duplicates("doi")
    .assign(
        inequality=lambda _df: _df["keywords"].str.replace("\\", "").str.extract(r"inequality::([\w_]+),?", expand=False)
    ).dropna(subset="inequality")
    .assign(
        inequality=lambda _df: _df["inequality"].str.replace("_", " "),
        projected=1,
    ).reset_index()
)
# pi  # (debug) preview the extracted tags

inequality = (pd.concat([
    bib_df.groupby(["author", "year", "title"])
        .agg({"inequality": lambda _col: "; ".join(_col)})
        .assign(projected=0)
        .reset_index()
        .drop_duplicates(),
    pi])
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set(x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";"))
        ),
    )
    .explode("inequality")
    .drop_duplicates()
)

# drop the scale categories, keeping only concrete inequality types
i = inequality[~inequality["inequality"].str.contains(r"(?:structural|institutional|agency)", na=True)]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(i, x="inequality", hue="projected", order=i["inequality"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```
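
This cell, and the intervention- and country-spread figures below, rebuild the same DOI/keyword frame and differ mainly in which `prefix::value` tag they extract. The shared pattern could be parameterized along these lines; a sketch under the same data assumptions, with `tagged_zotero_df` as a hypothetical helper, not part of the existing `src` module:

```python
import pandas as pd

def tagged_zotero_df(bib_sample, column: str, prefix: str) -> pd.DataFrame:
    """Build the deduplicated Zotero frame and extract `prefix::value` tags into `column`."""
    rows = [
        [
            entry["doi"] if "doi" in entry.fields_dict else None,
            entry["keywords"] if "keywords" in entry.fields_dict else None,
        ]
        for entry in bib_sample.entries
    ]
    return (
        pd.DataFrame(rows, columns=["doi", "keywords"])
        .drop_duplicates("doi")
        .assign(**{column: lambda _df: _df["keywords"].str.replace("\\", "")
                   .str.extract(rf"{prefix}::([\w_]+),?", expand=False)})
        .dropna(subset=column)
        .assign(**{column: lambda _df: _df[column].str.replace("_", " ")}, projected=1)
        .reset_index()
    )

# hypothetical usage for the three figures:
# pi = tagged_zotero_df(bib_sample, "inequality", "inequality")
# pi = tagged_zotero_df(bib_sample, "intervention", "type")
# pi = tagged_zotero_df(bib_sample, "country", "country")
```

The sketch keeps only the columns the figures actually use; the citation and usage fields would be added back if needed.
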

# Preliminary findings

```{python}
#| echo: false
#| label: fig-inequality-types
#| fig-cap: Finished and projected inequality types
inequality = (pd.concat([
    bib_df.groupby(["author", "year", "title"])
        .agg({"inequality": lambda _col: "; ".join(_col)})
        .assign(projected=0)
        .reset_index()
        .drop_duplicates(),
    pi[pi["keywords"].str.contains("relevant", na=False)]])
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: set(x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";"))
        ),
    )
    .explode("inequality")
    .drop_duplicates()
)

# drop the scale categories, keeping only concrete inequality types
i = inequality[~inequality["inequality"].str.contains(r"(?:structural|institutional|agency)", na=True)]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(i, x="inequality", hue="projected", order=i["inequality"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

- interventions most strongly target the gender-income divide
  - most studies here recommend further integration across scales between agency-based and structural approaches
  - however, most studies only focus on analysing a single scale
- interventions often have intersectional impacts even if not targeted at them
  - most visible for institutional/structural interventions and spatial inequalities
  - studies analysing intersectional inequalities near-unanimously recommend intersectional targeting

- individual agency-based interventions (training, subsidies, maternity benefits, transfers, microcredit, etc.):
  - seem most effective for targeting WoW outcomes of disability inequalities
  - seem marginally effective for targeting WoW outcomes of gender inequalities
  - require additional mediating scales for other inequalities
- more structural interventions (education, infrastructure, UBI, trade liberalization, collective action):
  - seem most effective for spatial, income, and education-generational inequalities
  - often show longer-term impacts, requiring longer periods of analysis
  - can work without additional agency-based interventions, though few studies analyse both at the same time

# Preliminary limitations

```{python}
#| echo: false
#| label: fig-intervention-types
#| fig-cap: Finished and projected intervention types

# extract the intervention-type keywords from the zotero library entries
pi = (pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"])
    .drop_duplicates("doi")
    .assign(
        intervention=lambda _df: _df["keywords"].str.replace("\\", "").str.extract(r"type::([\w_]+),?", expand=False)
    ).dropna(subset="intervention")
    .assign(
        intervention=lambda _df: _df["intervention"].str.replace("_", " "),
        projected=1,
    ).reset_index()
)
# pi  # (debug) preview the extracted tags

by_intervention = (pd.concat([
    bib_df.groupby(["author", "year", "title"])
        .agg({"intervention": lambda _col: "; ".join(_col)})
        .assign(projected=0)
        .reset_index()
        .drop_duplicates(),
    pi[pi["keywords"].str.contains("relevant", na=False)]])
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: set(x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";"))
        ),
    )
    .explode("intervention")
    .drop_duplicates()
)

# drop the scale categories, keeping only concrete intervention types
i = by_intervention[~by_intervention["intervention"].str.contains(r"(?:structural|institutional|agency)", na=True)]
fig = plt.figure()
fig.set_size_inches(6, 3)
ax = sns.countplot(i, x="intervention", hue="projected", order=i["intervention"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

- stronger institutional-structural research focus in developed countries, with more structural-agency based research in developing countries
- employment creation as a category is often subsumed in other structural/institutional analyses
- little evidence-based research on the effects of education-targeting interventions on world of work outcomes
- spatial inequality has the most evenly geographically spread evidence base
- the empirical base on interventions targeting disability inequalities is strongly restricted to developed countries, especially the United States

```{python}
#| echo: false
#| label: fig-countries
#| fig-cap: Country spread
#| column: screen

# extract the country keywords from the zotero library entries
pi = (pd.DataFrame([
    [
        entry["doi"] if "doi" in entry.fields_dict else None,
        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
        entry["usage"] if "usage" in entry.fields_dict else None,
        entry["keywords"] if "keywords" in entry.fields_dict else None,
    ]
    for entry in bib_sample.entries
], columns=["doi", "cited", "usage", "keywords"])
    .drop_duplicates("doi")
    .assign(
        country=lambda _df: _df["keywords"].str.replace("\\", "").str.extract(r"country::([\w_]+),?", expand=False)
    ).dropna(subset="country")
    .assign(
        # map tag shorthands to full names; Series.replace matches whole
        # values only, avoiding accidental substring hits
        country=lambda _df: _df["country"].str.replace("_", " ")
            .replace({"US": "United States", "Britain": "United Kingdom"}),
        projected=1,
    ).reset_index()
)
# pi  # (debug) preview the extracted tags

by_country = (pd.concat([
    bib_df.groupby(["author", "year", "title"])
        .agg({"country": lambda _col: "; ".join(_col)})
        .assign(projected=0)
        .reset_index()
        .drop_duplicates(),
    pi[pi["keywords"].str.contains("relevant", na=False)]])
    .assign(
        country=lambda _df: _df["country"].apply(
            lambda _cell: set(x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";"))
        ),
    )
    .explode("country")
    .drop_duplicates()
)

# drop the scale categories, keeping only actual countries
i = by_country[~by_country["country"].str.contains(r"(?:structural|institutional|agency)", na=True)]
fig = plt.figure()
fig.set_size_inches(12, 5)
ax = sns.countplot(i, x="country", hue="projected", order=i["country"].value_counts().index)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```