wow-inequalities/manuscript/meeting_eoy.qmd

---
bibliography: ../data/references/zotero-library.bib
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
papersize: A4
linestretch: 1.5
fontfamily: lmodern
fontsize: "12"
geometry:
    - left=2.2cm
    - right=3.5cm
    - top=2.5cm
    - bottom=2.5cm
lang: en
title: "Scoping Review: Preliminary findings"
subtitle: Addressing inequalities in the World of Work
---

```{python}
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

from src import globals as g

from src import bib_sample

# load relevant studies
from src.extract import load_data

# Zotero-derived metadata per DOI: citation count, usage count and keywords.
# Every bibliography field is optional, so an absent field becomes None.
_zot_fields = ("doi", "times-cited", "usage", "keywords")
_zot_rows = [
    [entry[name] if name in entry.fields_dict else None for name in _zot_fields]
    for entry in bib_sample.entries
]
zot_df = pd.DataFrame(_zot_rows, columns=["doi", "cited", "usage", "keywords"])
# One row per DOI, indexed by DOI so the columns can be used as lookup tables.
zot_df = zot_df.drop_duplicates("doi").set_index("doi")

# World Bank country grouping definitions (income group, world region),
# indexed by the "Economy" column.
WB_COUNTRY_GROUPS_FILE = Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
df_country_groups = df_country_groups.set_index("Economy")

# Extracted study data, enriched in stages:
#   1. DOI parsed out of the study URI,
#   2. Zotero metadata looked up by that DOI,
#   3. `year` parsed into a datetime, then normalised back to the year number,
#   4. World Bank region and income group looked up by country,
# and finally restricted to studies from the year 2000 onwards.
_extracted = load_data.from_yml(f"{g.EXTRACTED_DATA}")
bib_df = (
    _extracted
    .assign(
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
    )
    .assign(
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
    )
    .assign(date=lambda _df: pd.to_datetime(_df["year"], format="%Y"))
    .assign(
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(df_country_groups["Income group"]),
    )
    .query("year >= 2000")
)

# The lookup tables are no longer needed; rebind both names to None.
zot_df = df_country_groups = None
```

# The data sample

```{python}
#| echo: false
#| output: asis

# Hard-coded size of the complete raw sample, before anything was removed.
FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396

# NOTE(review): `bib_sample_raw_db` is neither defined nor imported in any
# chunk visible in this file - confirm where it is supposed to come from.
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_deduplicated = len(bib_sample.entries)
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - nr_deduplicated
nr_other_sources = (nr_deduplicated + nr_out_duplicates) - nr_database_query_raw

# Keyword strings of all entries that carry a keywords field.
all_keywords = [
    entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict
]


def _nr_tagged(tag):
    """Return how many keyword strings contain *tag*."""
    return sum(tag in kw for kw in all_keywords)


# NOTE(review): the constants added below (400 / 400 / 300) are not explained
# anywhere in this file - presumably manual estimates of untagged exclusions;
# confirm before relying on the numbers.
nr_out_title = _nr_tagged("out::title") + 400
nr_out_abstract = _nr_tagged("out::abstract") + 400
nr_out_fulltext = _nr_tagged("out::full-text") + 300
nr_out_language = _nr_tagged("out::language")
nr_extraction_done = _nr_tagged("done::extracted")

# Records remaining after each successive screening step.
nr_after_title = nr_deduplicated - nr_out_title
nr_after_abstract = nr_after_title - nr_out_abstract
nr_after_language = nr_after_abstract - nr_out_language
nr_outstanding = nr_after_language - nr_extraction_done

# The closing fence of the emitted mermaid block is built at runtime so that
# no line consisting only of three backticks appears inside this chunk.
t3 = "`" * 3
# FIXME use data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
prisma_chart = f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
    search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

    starting_sample -- "Duplicate removal ({nr_out_duplicates} removed) "--> dedup["Records after duplicates removed (n={nr_deduplicated})"];

    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={nr_after_title})"];

    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={nr_after_abstract})"];

    abstract_screened -- "  Language screening ({nr_out_language} excluded)  "--> language_screened["Records after language screened (n={nr_after_language})"];

    language_screened -- "  Full-text screening ({nr_out_fulltext} excluded)  "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done}) STILL OUTSTANDING: {nr_outstanding}"];
{t3}
"""
print(prisma_chart)
```

- strongest focus on income inequality (vertical), with many horizontal inequality studies including an aspect of income inequality
- horizontal inequalities: strongest focus on income-gender inequalities
- interventions:
    - strongest research base on labour rights protection interventions
    - second on infrastructural interventions
    - third on agency-strengthening ones: training, financial access, education programmes

- formalization & social protection research rarely goes into inequality outcomes beyond 'income' effects; most studies were excluded for that reason

```{python}
#| echo: false
#| label: fig-inequality-types-whole-sample
#| fig-cap: Overall inequality types in sample

# Keyword-tagged studies: read the inequality type from the
# `inequality::<type>` tag in each entry's keywords. These rows are marked
# projected=1; rows taken from the extracted data (bib_df) get projected=0.
_bib_fields = ("doi", "times-cited", "usage", "keywords")
pi = (
    pd.DataFrame(
        [
            [entry[name] if name in entry.fields_dict else None for name in _bib_fields]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Remove literal backslashes before matching the tag. `regex=False`
        # is explicit because a lone backslash is not a valid regex and the
        # default of `regex` differs between pandas versions.
        inequality=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"inequality::([\w\_]+),?", expand=False)
    )
    # Entries without an inequality tag (or without keywords) are dropped.
    .dropna(subset="inequality")
    .assign(
        inequality=lambda _df: _df["inequality"].str.replace("_", " ", regex=False),
        projected=1,
    )
    .reset_index()
)

# Extracted studies: one row per (author, year, title) with all of the
# study's inequality labels joined into a single "; "-separated string.
_extracted_inequality = (
    bib_df.groupby(["author", "year", "title"])
    .agg({"inequality": lambda _col: "; ".join(_col)})
    .assign(projected=0)
    .reset_index()
    .drop_duplicates()
)

# Combine both sources, strip parenthesised remarks, split the labels on ";"
# and give every distinct label of a study its own row. Labels are
# de-duplicated with dict.fromkeys (not a set) so row order is reproducible.
inequality = (
    pd.concat([_extracted_inequality, pi])
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: list(
                dict.fromkeys(
                    label.strip() for label in re.sub(r"\(.*\)", "", _cell).split(";")
                )
            )
        ),
    )
    .explode("inequality")
    .drop_duplicates()
)

# Keep only labels that do not mention structural/institutional/agency;
# `na=True` also drops rows with a missing label, as `== False` did before.
_excluded = inequality["inequality"].str.contains(
    r"(?:structural|institutional|agency)", na=True
)
i = inequality[~_excluded]

fig = plt.figure()
fig.set_size_inches(6, 3)
# Bars are ordered by overall frequency and split by the `projected` flag.
ax = sns.countplot(
    i, x="inequality", hue="projected", order=i["inequality"].value_counts().index
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

# Preliminary findings

```{python}
#| echo: false
#| label: fig-inequality-types
#| fig-cap: Finished and projected inequality types
# Extracted studies: one row per (author, year, title) with all of the
# study's inequality labels joined into a single "; "-separated string.
_extracted_inequality = (
    bib_df.groupby(["author", "year", "title"])
    .agg({"inequality": lambda _col: "; ".join(_col)})
    .assign(projected=0)
    .reset_index()
    .drop_duplicates()
)

# Of the keyword-tagged studies in `pi` (built in the preceding chunk, with
# an `inequality` column), keep only those whose keywords contain "relevant".
_relevant_projected = pi[pi["keywords"].str.contains("relevant", na=False)]

# Combine both sources, strip parenthesised remarks, split the labels on ";"
# and give every distinct label of a study its own row. Labels are
# de-duplicated with dict.fromkeys (not a set) so row order is reproducible.
inequality = (
    pd.concat([_extracted_inequality, _relevant_projected])
    .assign(
        inequality=lambda _df: _df["inequality"].apply(
            lambda _cell: list(
                dict.fromkeys(
                    label.strip() for label in re.sub(r"\(.*\)", "", _cell).split(";")
                )
            )
        ),
    )
    .explode("inequality")
    .drop_duplicates()
)

# Keep only labels that do not mention structural/institutional/agency;
# `na=True` also drops rows with a missing label, as `== False` did before.
_excluded = inequality["inequality"].str.contains(
    r"(?:structural|institutional|agency)", na=True
)
i = inequality[~_excluded]

fig = plt.figure()
fig.set_size_inches(6, 3)
# Bars are ordered by overall frequency and split by the `projected` flag.
ax = sns.countplot(
    i, x="inequality", hue="projected", order=i["inequality"].value_counts().index
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

- interventions most strongly target gender-income divide
    - most studies here recommend further scale-integration between agency/structural approaches
    - however, most studies also focus on analysing only a single scale
- interventions often have intersectional impacts even if not targeted at them
    - most visible for institutional/structural interventions and spatial inequalities
    - studies analysing intersectional inequalities near unanimously recommend intersectional targeting

- individual agency-based interventions (training, subsidies, maternity benefits, transfers, microcredit, etc):
    - seem most effective for targeting WoW outcomes of disability inequalities
    - seem marginally effective for targeting WoW outcomes of gender inequalities
    - require additional mediating scales for other inequalities
- more structural interventions (education, infrastructural, UBI, trade liberalization, collective action):
    - seem most effective for spatial, income, education-generational inequalities
    - often show longer-term impacts, requiring longer periods of analysis
    - can work without additional agency-based interventions, though few studies analyse both at the same time

# Preliminary limitations

```{python}
#| echo: false
#| label: fig-intervention-types
#| fig-cap: Finished and projected intervention types

# Keyword-tagged studies: read the intervention type from the `type::<type>`
# tag in each entry's keywords. These rows are marked projected=1; rows taken
# from the extracted data (bib_df) get projected=0.
_bib_fields = ("doi", "times-cited", "usage", "keywords")
pi = (
    pd.DataFrame(
        [
            [entry[name] if name in entry.fields_dict else None for name in _bib_fields]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Remove literal backslashes before matching the tag. `regex=False`
        # is explicit because a lone backslash is not a valid regex and the
        # default of `regex` differs between pandas versions.
        intervention=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"type::([\w\_]+),?", expand=False)
    )
    # Entries without a type tag (or without keywords) are dropped.
    .dropna(subset="intervention")
    .assign(
        intervention=lambda _df: _df["intervention"].str.replace("_", " ", regex=False),
        projected=1,
    )
    .reset_index()
)

# Extracted studies: one row per (author, year, title) with all of the
# study's intervention labels joined into a single "; "-separated string.
_extracted_intervention = (
    bib_df.groupby(["author", "year", "title"])
    .agg({"intervention": lambda _col: "; ".join(_col)})
    .assign(projected=0)
    .reset_index()
    .drop_duplicates()
)

# Of the keyword-tagged studies keep only those whose keywords contain
# "relevant".
_relevant_projected = pi[pi["keywords"].str.contains("relevant", na=False)]

# Combine both sources, strip parenthesised remarks, split the labels on ";"
# and give every distinct label of a study its own row. Labels are
# de-duplicated with dict.fromkeys (not a set) so row order is reproducible.
by_intervention = (
    pd.concat([_extracted_intervention, _relevant_projected])
    .assign(
        intervention=lambda _df: _df["intervention"].apply(
            lambda _cell: list(
                dict.fromkeys(
                    label.strip() for label in re.sub(r"\(.*\)", "", _cell).split(";")
                )
            )
        ),
    )
    .explode("intervention")
    .drop_duplicates()
)

# Keep only labels that do not mention structural/institutional/agency;
# `na=True` also drops rows with a missing label, as `== False` did before.
_excluded = by_intervention["intervention"].str.contains(
    r"(?:structural|institutional|agency)", na=True
)
i = by_intervention[~_excluded]

fig = plt.figure()
fig.set_size_inches(6, 3)
# Bars are ordered by overall frequency and split by the `projected` flag.
ax = sns.countplot(
    i, x="intervention", hue="projected", order=i["intervention"].value_counts().index
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```

- stronger institutional-structural research focus in developed countries, with a more structural-agency-based focus in developing countries
- employment creation as a category is often subsumed in other structural/institutional analyses
- little evidence-based research on the effect of interventions targeting education on world of work outcomes
- spatial inequality has the most evenly geographically spread evidence base
- the empirical base on interventions targeting disability inequalities is strongly restricted to developed countries, especially the United States

```{python}
#| echo: false
#| label: fig-countries
#| fig-cap: Country spread
#| column: screen

# Keyword-tagged studies: read the country from the `country::<name>` tag in
# each entry's keywords. These rows are marked projected=1; rows taken from
# the extracted data (bib_df) get projected=0.
_bib_fields = ("doi", "times-cited", "usage", "keywords")
pi = (
    pd.DataFrame(
        [
            [entry[name] if name in entry.fields_dict else None for name in _bib_fields]
            for entry in bib_sample.entries
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    .drop_duplicates("doi")
    .assign(
        # Remove literal backslashes before matching the tag. `regex=False`
        # is explicit because a lone backslash is not a valid regex and the
        # default of `regex` differs between pandas versions.
        country=lambda _df: _df["keywords"]
        .str.replace("\\", "", regex=False)
        .str.extract(r"country::([\w\_]+),?", expand=False)
    )
    # Entries without a country tag (or without keywords) are dropped.
    .dropna(subset="country")
    .assign(
        # Expand the short forms used in the tags to full country names.
        # NOTE(review): both replacements are substring matches, so any tag
        # that merely contains "US" or "Britain" is rewritten as well -
        # confirm that no such tags exist in the library.
        country=lambda _df: _df["country"]
        .str.replace("_", " ", regex=False)
        .str.replace("US", "United States", regex=False)
        .str.replace("Britain", "United Kingdom", regex=False),
        projected=1,
    )
    .reset_index()
)

# Extracted studies: one row per (author, year, title) with all of the
# study's countries joined into a single "; "-separated string.
_extracted_country = (
    bib_df.groupby(["author", "year", "title"])
    .agg({"country": lambda _col: "; ".join(_col)})
    .assign(projected=0)
    .reset_index()
    .drop_duplicates()
)

# Of the keyword-tagged studies keep only those whose keywords contain
# "relevant".
_relevant_projected = pi[pi["keywords"].str.contains("relevant", na=False)]

# Combine both sources, strip parenthesised remarks, split the countries on
# ";" and give every distinct country of a study its own row. Countries are
# de-duplicated with dict.fromkeys (not a set) so row order is reproducible.
by_country = (
    pd.concat([_extracted_country, _relevant_projected])
    .assign(
        country=lambda _df: _df["country"].apply(
            lambda _cell: list(
                dict.fromkeys(
                    name.strip() for name in re.sub(r"\(.*\)", "", _cell).split(";")
                )
            )
        ),
    )
    .explode("country")
    .drop_duplicates()
)

# NOTE(review): this filter looks copied from the inequality/intervention
# chunks; it presumably matches no country name and only removes rows with a
# missing country - confirm whether it can be dropped.
_excluded = by_country["country"].str.contains(
    r"(?:structural|institutional|agency)", na=True
)
i = by_country[~_excluded]

fig = plt.figure()
fig.set_size_inches(12, 5)
# Bars are ordered by overall frequency and split by the `projected` flag.
ax = sns.countplot(
    i, x="country", hue="projected", order=i["country"].value_counts().index
)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.show()
```