feat(notes): Add interim findings and progress
Created small up-to-date quick glance document for findings and data set.
This commit is contained in:
parent
b5e467e016
commit
9855256b00
2 changed files with 370 additions and 0 deletions
|
@ -4,6 +4,7 @@ project:
|
|||
render:
|
||||
- presentation_summary.md
|
||||
- notes.qmd
|
||||
- meeting_eoy.qmd
|
||||
- scoping_review.qmd
|
||||
|
||||
toc: true
|
||||
|
|
369
meeting_eoy.qmd
Normal file
369
meeting_eoy.qmd
Normal file
|
@ -0,0 +1,369 @@
|
|||
---
|
||||
bibliography: 02-data/supplementary/lib.bib
|
||||
csl: /home/marty/documents/library/utilities/styles/APA-7.csl
|
||||
papersize: A4
|
||||
linestretch: 1.5
|
||||
fontfamily: lmodern
|
||||
fontsize: "12"
|
||||
geometry:
|
||||
- left=2.2cm
|
||||
- right=3.5cm
|
||||
- top=2.5cm
|
||||
- bottom=2.5cm
|
||||
lang: en
|
||||
title: "Scoping Review: Preliminary findings"
|
||||
subtitle: Addressing inequalities in the World of Work
|
||||
---
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
from pathlib import Path
|
||||
import re
|
||||
## standard imports
|
||||
from IPython.core.display import Markdown as md
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from matplotlib import pyplot as plt
|
||||
import seaborn as sns
|
||||
from tabulate import tabulate
|
||||
import bibtexparser
|
||||
|
||||
sns.set_style("whitegrid")
|
||||
|
||||
DATA_DIR=Path("./02-data")
|
||||
RAW_DATA=DATA_DIR.joinpath("raw")
|
||||
WORKING_DATA=DATA_DIR.joinpath("intermediate")
|
||||
PROCESSED_DATA=DATA_DIR.joinpath("processed")
|
||||
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
|
||||
|
||||
bib_string=""
|
||||
for partial_bib in RAW_DATA.glob("**/*.bib"):
|
||||
with open(partial_bib) as f:
|
||||
bib_string+="\n".join(f.readlines())
|
||||
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
|
||||
|
||||
bib_string=""
|
||||
for partial_bib in WORKING_DATA.glob("**/*.bib"):
|
||||
with open(partial_bib) as f:
|
||||
bib_string+="\n".join(f.readlines())
|
||||
bib_sample = bibtexparser.parse_string(bib_string)
|
||||
|
||||
# load relevant studies
|
||||
from src import data
|
||||
|
||||
# load zotero-based metadata: citations and uses
|
||||
zot_df = pd.DataFrame([
|
||||
[
|
||||
entry["doi"] if "doi" in entry.fields_dict else None,
|
||||
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
|
||||
entry["usage"] if "usage" in entry.fields_dict else None,
|
||||
entry["keywords"] if "keywords" in entry.fields_dict else None,
|
||||
]
|
||||
for entry in bib_sample.entries
|
||||
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
|
||||
|
||||
# Add WB country grouping definitions (income group, world region)
|
||||
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
|
||||
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
|
||||
|
||||
bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
|
||||
.assign(
|
||||
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
|
||||
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
|
||||
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
|
||||
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
|
||||
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
|
||||
year = lambda _df: _df["date"].dt.year,
|
||||
region = lambda _df: _df["country"].map(df_country_groups["Region"]),
|
||||
income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
|
||||
)
|
||||
.query("year >= 2000")
|
||||
)
|
||||
zot_df = None
|
||||
df_country_groups = None
|
||||
```
|
||||
|
||||
# The data sample
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| output: asis
|
||||
|
||||
FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396
|
||||
nr_database_query_raw = len(bib_sample_raw_db.entries)
|
||||
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - len(bib_sample.entries)
|
||||
nr_other_sources = (len(bib_sample.entries) + nr_out_duplicates) - nr_database_query_raw
|
||||
|
||||
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
|
||||
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + 400
|
||||
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + 400
|
||||
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + 300
|
||||
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
|
||||
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
|
||||
|
||||
t3 = "`" * 3
|
||||
# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
|
||||
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
|
||||
print(f"""
|
||||
```{{mermaid}}
|
||||
%%| label: fig-prisma
|
||||
%%| fig-cap: "Sample sorting process through identification and screening"
|
||||
%%| fig-width: 6
|
||||
flowchart TD;
|
||||
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
|
||||
search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
|
||||
|
||||
starting_sample -- "Duplicate removal ({nr_out_duplicates} removed) "--> dedup["Records after duplicates removed (n={len(bib_sample.entries)})"];
|
||||
|
||||
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={len(bib_sample.entries) - nr_out_title})"];
|
||||
|
||||
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract})"];
|
||||
|
||||
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language})"];
|
||||
|
||||
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done}) STILL OUTSTANDING: {len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language - nr_extraction_done}"];
|
||||
{t3}
|
||||
""")
|
||||
```
|
||||
|
||||
- strongest focus on income inequality (vertical), with many horizontal inequality studies including aspect of income inequality
|
||||
- horizontal inequalities: strongest focus on income - gender inequalities (horizontal)
|
||||
- interventions:
|
||||
- strongest research base on labour rights protection interventions
|
||||
- second on infrastructural interventions
|
||||
- third on agency-strengthening ones: training, financial access, education programmes
|
||||
|
||||
- formalization & social protection research rarely goes into inequality outcomes beyond 'income' effects; most excluded for that reason
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| label: fig-inequality-types-whole-sample
|
||||
#| fig-cap: Overall inequality types in sample
|
||||
|
||||
# load zotero-based metadata: citations and uses
|
||||
pi = (pd.DataFrame([
|
||||
[
|
||||
entry["doi"] if "doi" in entry.fields_dict else None,
|
||||
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
|
||||
entry["usage"] if "usage" in entry.fields_dict else None,
|
||||
entry["keywords"] if "keywords" in entry.fields_dict else None,
|
||||
]
|
||||
for entry in bib_sample.entries
|
||||
], columns = ["doi", "cited", "usage", "keywords"])
|
||||
.drop_duplicates("doi")
|
||||
.assign(
|
||||
inequality=lambda _df: _df["keywords"].str.replace("\\", "").str.extract('inequality::([\w\_]+),?')
|
||||
).dropna(subset="inequality")
|
||||
.assign(
|
||||
inequality=lambda _df: _df["inequality"].str.replace("_", " "),
|
||||
projected = 1
|
||||
).reset_index()
|
||||
)
|
||||
pi
|
||||
|
||||
inequality = (pd.concat([
|
||||
bib_df.groupby(["author", "year", "title"])
|
||||
.agg(
|
||||
{
|
||||
"inequality": lambda _col: "; ".join(_col),
|
||||
}
|
||||
)
|
||||
.assign(
|
||||
projected=0
|
||||
|
||||
)
|
||||
.reset_index()
|
||||
.drop_duplicates() , pi])
|
||||
.assign( inequality=lambda _df: _df["inequality"].apply(
|
||||
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
||||
),
|
||||
)
|
||||
.explode("inequality")
|
||||
.drop_duplicates()
|
||||
)
|
||||
|
||||
sort_order = inequality["inequality"].value_counts().index
|
||||
i = inequality[inequality["inequality"].str.contains(r"(?:structural|institutional|agency)") == False]
|
||||
fig = plt.figure()
|
||||
fig.set_size_inches(6, 3)
|
||||
ax = sns.countplot(i, x="inequality", hue="projected" ,order=i["inequality"].value_counts().index)
|
||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||
rotation_mode="anchor")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
# Preliminary findings
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| label: fig-inequality-types
|
||||
#| fig-cap: Finished and projected inequality types
|
||||
inequality = (pd.concat([
|
||||
bib_df.groupby(["author", "year", "title"])
|
||||
.agg(
|
||||
{
|
||||
"inequality": lambda _col: "; ".join(_col),
|
||||
}
|
||||
)
|
||||
.assign(
|
||||
projected=0
|
||||
|
||||
)
|
||||
.reset_index()
|
||||
.drop_duplicates() , pi[pi["keywords"].str.contains("relevant") == True]])
|
||||
.assign( inequality=lambda _df: _df["inequality"].apply(
|
||||
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
||||
),
|
||||
)
|
||||
.explode("inequality")
|
||||
.drop_duplicates()
|
||||
)
|
||||
|
||||
sort_order = inequality["inequality"].value_counts().index
|
||||
i = inequality[inequality["inequality"].str.contains(r"(?:structural|institutional|agency)") == False]
|
||||
fig = plt.figure()
|
||||
fig.set_size_inches(6, 3)
|
||||
ax = sns.countplot(i, x="inequality", hue="projected" ,order=i["inequality"].value_counts().index)
|
||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||
rotation_mode="anchor")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
- interventions most strongly target gender-income divide
|
||||
- most studies here recommend further scale-integration between agency/structural approaches
|
||||
- most studies also only focus on analysing a single scale however
|
||||
- interventions often have intersectional impacts even if not targeted at them
|
||||
- most visible for institutional/structural interventions and spatial inequalities
|
||||
- studies analysing intersectional inequalities near unanimously recommend intersectional targeting
|
||||
|
||||
- individual agency-based interventions (training, subsidies, maternity benefits, transfers, microcredit, etc):
|
||||
- seem most effective for targeting WoW outcomes of disability inequalities
|
||||
- seem marginally effective for targeting WoW outcomes of gender inequalities
|
||||
- require additional mediating scales for other inequalities
|
||||
- more structural interventions (education, infrastructural, ubi, trade liberalization, collective action):
|
||||
- seem most effective for spatial, income, education-generational inequalities
|
||||
- often show longer-term impacts, requiring longer periods of analyses
|
||||
- can work without additional agency-based interventions, few studies analyse both at same time
|
||||
|
||||
# Preliminary limitations
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| label: fig-intervention-types
|
||||
#| fig-cap: Finished and projected intervention types
|
||||
|
||||
# load zotero-based metadata: citations and uses
|
||||
pi = (pd.DataFrame([
|
||||
[
|
||||
entry["doi"] if "doi" in entry.fields_dict else None,
|
||||
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
|
||||
entry["usage"] if "usage" in entry.fields_dict else None,
|
||||
entry["keywords"] if "keywords" in entry.fields_dict else None,
|
||||
]
|
||||
for entry in bib_sample.entries
|
||||
], columns = ["doi", "cited", "usage", "keywords"])
|
||||
.drop_duplicates("doi")
|
||||
.assign(
|
||||
intervention=lambda _df: _df["keywords"].str.replace("\\", "").str.extract('type::([\w\_]+),?')
|
||||
).dropna(subset="intervention")
|
||||
.assign(
|
||||
intervention=lambda _df: _df["intervention"].str.replace("_", " "),
|
||||
projected = 1
|
||||
).reset_index()
|
||||
)
|
||||
pi
|
||||
|
||||
by_intervention = (pd.concat([
|
||||
bib_df.groupby(["author", "year", "title"])
|
||||
.agg(
|
||||
{
|
||||
"intervention": lambda _col: "; ".join(_col),
|
||||
}
|
||||
)
|
||||
.assign(
|
||||
projected=0
|
||||
|
||||
)
|
||||
.reset_index()
|
||||
.drop_duplicates() , pi[pi["keywords"].str.contains("relevant") == True]])
|
||||
.assign( intervention=lambda _df: _df["intervention"].apply(
|
||||
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
||||
),
|
||||
)
|
||||
.explode("intervention")
|
||||
.drop_duplicates()
|
||||
)
|
||||
|
||||
sort_order = by_intervention["intervention"].value_counts().index
|
||||
i = by_intervention[by_intervention["intervention"].str.contains(r"(?:structural|institutional|agency)") == False]
|
||||
fig = plt.figure()
|
||||
fig.set_size_inches(6, 3)
|
||||
ax = sns.countplot(i, x="intervention", hue="projected" ,order=i["intervention"].value_counts().index)
|
||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||
rotation_mode="anchor")
|
||||
plt.show()
|
||||
```
|
||||
|
||||
- stronger institutional-structural research focus in developed countries, with more structural-agency based in developing countries
|
||||
- employment creation as a category is often subsumed in other structural/institutional analyses
|
||||
- little evidence-based research on effect of interventions targeting education on world of work outcomes
|
||||
- spatial inequality has the most geographically even spread in its evidence base
|
||||
- empirical base on interventions targeting disability inequalities is strongly restricted to developed countries, especially the United States
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| label: fig-countries
|
||||
#| fig-cap: Country spread
|
||||
#| column: screen
|
||||
|
||||
# load zotero-based metadata: citations and uses
|
||||
pi = (pd.DataFrame([
|
||||
[
|
||||
entry["doi"] if "doi" in entry.fields_dict else None,
|
||||
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
|
||||
entry["usage"] if "usage" in entry.fields_dict else None,
|
||||
entry["keywords"] if "keywords" in entry.fields_dict else None,
|
||||
]
|
||||
for entry in bib_sample.entries
|
||||
], columns = ["doi", "cited", "usage", "keywords"])
|
||||
.drop_duplicates("doi")
|
||||
.assign(
|
||||
country=lambda _df: _df["keywords"].str.replace("\\", "").str.extract('country::([\w\_]+),?')
|
||||
).dropna(subset="country")
|
||||
.assign(
|
||||
country=lambda _df: _df["country"].str.replace("_", " ").str.replace("US", "United States").str.replace("Britain", "United Kingdom"),
|
||||
projected = 1
|
||||
).reset_index()
|
||||
)
|
||||
pi
|
||||
|
||||
by_country = (pd.concat([
|
||||
bib_df.groupby(["author", "year", "title"])
|
||||
.agg(
|
||||
{
|
||||
"country": lambda _col: "; ".join(_col),
|
||||
}
|
||||
)
|
||||
.assign(
|
||||
projected=0
|
||||
|
||||
)
|
||||
.reset_index()
|
||||
.drop_duplicates() , pi[pi["keywords"].str.contains("relevant") == True]])
|
||||
.assign( country=lambda _df: _df["country"].apply(
|
||||
lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
|
||||
),
|
||||
)
|
||||
.explode("country")
|
||||
.drop_duplicates()
|
||||
)
|
||||
|
||||
sort_order = by_country["country"].value_counts().index
|
||||
i = by_country[by_country["country"].str.contains(r"(?:structural|institutional|agency)") == False]
|
||||
fig = plt.figure()
|
||||
fig.set_size_inches(12, 5)
|
||||
ax = sns.countplot(i, x="country", hue="projected" ,order=i["country"].value_counts().index)
|
||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
||||
rotation_mode="anchor")
|
||||
plt.show()
|
||||
```
|
Loading…
Reference in a new issue