From b5c0df16e350804acb7a79a162352822c9048b7d Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Wed, 6 Dec 2023 16:07:22 +0100
Subject: [PATCH] feat(script): Add dynamic screening flowchart

Added mermaid flowchart showing the screening numbers, based on dynamic
calculations done at document rendering. Currently missing some
calculations (un-deduplicated numbers and final relevant number).
---
 scoping_review.qmd | 97 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 78 insertions(+), 19 deletions(-)

diff --git a/scoping_review.qmd b/scoping_review.qmd
index c0aa270..7e7e3e1 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -29,7 +29,8 @@ zotero:
 #| echo: false
 from pathlib import Path
 DATA_DIR=Path("./02-data")
-BIB_PATH = DATA_DIR.joinpath("raw/01_wos-sample_2023-11-02")
+RAW_SAMPLE=DATA_DIR.joinpath("raw")
+WORKING_SAMPLE=DATA_DIR.joinpath("intermediate")
 
 ## standard imports
 from IPython.core.display import Markdown as md
@@ -42,6 +43,24 @@ from tabulate import tabulate
 sns.set_style("whitegrid")
 ```
 
+```{python}
+#| echo: false
+# load and parse overall bibtex sample
+import bibtexparser
+
+bib_string=""
+for partial_bib in RAW_SAMPLE.glob("**/*.bib"):
+    with open(partial_bib) as f:
+        bib_string+="\n".join(f.readlines())
+sample_raw = bibtexparser.parse_string(bib_string)
+
+bib_string=""
+for partial_bib in WORKING_SAMPLE.glob("**/*.bib"):
+    with open(partial_bib) as f:
+        bib_string+="\n".join(f.readlines())
+sample = bibtexparser.parse_string(bib_string)
+```
+
 # Introduction
 
 This section will introduce the reader to the concern of inequality in the World of Work (WoW),
@@ -278,18 +297,6 @@ This section will discuss the systematic scoping review methodology that is prop
 Unlike purely systematic reviews which typically focus on specific policy questions and interventions,
 systematic scoping reviews focus on a wider spectrum of policies, where different study designs and research questions can be investigated.
 Since scoping reviews allow both broad and in-depth analyses, they are the most appropriate rigorous method to make a synthesis of the current evidence in this area [@Arksey2005].
-```{python}
-#| echo: false
-# load and parse overall bibtex sample
-import bibtexparser
-
-bib_string=""
-for partial_bib in BIB_PATH.glob("*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-sample = bibtexparser.parse_string(bib_string)
-```
-
 The scoping review allows broad focus to be given to a subject for which no unified path with clear edges has been laid out yet by prior reviews, as remains the case with policies targeting inequalities in the world of work.
 It does so through a breadth-first approach through a search protocol which favours working through a large body of literature to subsequently move toward a depth-favouring approach once the literature has been sufficiently delimited.
 Its purpose, clearly mapping a body of literature on a (broad) topic area, is thereby useful on its own or in combination with a systematic approach [@Arksey2005].
@@ -303,7 +310,7 @@ Each of the clusters contains synonymous terms as well as term-adjacent phrase c
 
 The search protocol then follows a three-staged process of execution: identification, screening and extraction.
 First, in identification, the above categorizations are combined through Boolean operators to conduct a search through the database repository Web of Science.
-The search itself is conducted with English-language search queries only.
+While the resulting study pools could be screened for in multiple languages, the search queries themselves are passed to the databases in English only.
 
 Relevant results are then complemented through the adoption of a 'snowballing' technique,
 which analyses an array of published reviews for their reference lists to find cross-references of potentially missing literature.
@@ -359,7 +366,7 @@ wow_terms_cluster = {
 }
 
 df = pd.DataFrame(wow_terms_cluster)
-md(tabulate(df.fillna(""), headers=wow_terms_cluster.keys(), showindex=False, tablefmt="grid"))
+md(tabulate(df.fillna(""), headers=list(wow_terms_cluster.keys()), showindex=False, tablefmt="grid"))
 ```
 
 The world of work cluster, like the inequality and policy intervention clusters below, is made up of a general signifier (such as "work", "inequality" or "intervention") which has to be labelled in a study to form part of the sample,
@@ -536,6 +543,51 @@ Should any literature reviews be identified as relevant during this screening pr
 they will in turn be crawled for cited sources in a 'snowballing' process,
 and the sources will be added to the sample to undergo the same screening process explained above.
 
+```{python}
+#| echo: false
+#| output: asis
+
+sample_out_title = []
+sample_out_abstract = []
+sample_out_fulltext = []
+sample_out_language = []
+sample_relevant_done = []
+
+for e in sample.entries:
+    if "keywords" in e.fields_dict.keys():
+        if "out::title" in e["keywords"]:
+            sample_out_title.append(e)
+        elif "out::abstract" in e["keywords"]:
+            sample_out_abstract.append(e)
+        elif "out::full-text" in e["keywords"]:
+            sample_out_fulltext.append(e)
+        elif "done::extracted" in e["keywords"] and "relevant" in e["keywords"]:
+            sample_relevant_done.append(e)
+
+t3 = "`" * 3
+# FIXME use 02-data/supplementary un-deduplicated counts to get database starting and snowballing counts
+# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
+print(f"""
+```{{mermaid}}
+%%| label: fig-prisma
+%%| fig-cap: "Sample sorting process through identification and screening"
+flowchart TD;
+    search_db["Records identified through database searching (n=1643)"] --> starting_sample;
+    search_prev["Records identified through other sources (n=753)"] --> starting_sample["Starting sample (n=2396)"];
+
+    starting_sample -- "Duplicate removal ({2396 - len(sample.entries)} removed) "--> dedup["Records after duplicates removed (n={len(sample.entries)})"];
+
+    dedup -- "Title screening ({len(sample_out_title)} excluded)" --> title_screened["Records after titles screened (n={len(sample.entries)-len(sample_out_title)})"];
+
+    title_screened -- "Abstract screening ({len(sample_out_abstract)} excluded)"--> abstract_screened["Records after abstracts screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)})"];
+
+    abstract_screened -- " Language screening ({len(sample_out_language)} excluded) "--> language_screened["Records after language screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)-len(sample_out_language)})"];
+
+    language_screened -- " Full-text screening ({len(sample_out_fulltext)} excluded) "--> fulltext_screened["Full-text articles assessed for eligibility (n={len(sample_relevant_done)})"];
+{t3}
+""")
+```
+
 All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.
 The results to be identified in the matrix include a study’s: i) key outcome measures (dependent variables), ii) main findings, iii) main policy interventions (independent variables), iv) study design and sample size, v) dataset and methods of evaluation, vi) direction of relation and level of representativeness, vii) level of statistical significance, viii) main limitations.
 
@@ -544,10 +596,17 @@
 
 ```{python}
 #| echo: false
-sample_size = len(sample.entries)
+sample_size_all = len(sample_raw.entries)
+
+sample_relevant = []
+for e in sample.entries:
+    if "keywords" in e.fields_dict.keys() and "relevant" in e["keywords"]:
+        sample_relevant.append(e)
+
 md(f"""
-The exploratory execution of queries results in an initial sample of {sample_size} potential studies after the identification process.
-This contains all identified studies without duplicate removal, controlling for literature that has been superseded or any other screening criteria.
+The query execution results in an initial sample of {sample_size_all} potential studies after the identification process.
+This accounts for all identified studies without duplicate removal, without controlling for literature that has been superseded or applying any other screening criteria.
+Of these, {len(sample_relevant)} have been identified as relevant studies for the purposes of this scoping review.
 """)
 ```
 
@@ -560,7 +619,7 @@ Keeping in mind that these results are not yet screened for their full relevance
 #| label: fig-publications-per-year
 #| fig-cap: Publications per year
 reformatted = []
-for e in sample.entries:
+for e in sample_raw.entries:
     reformatted.append([e["Year"], e["Author"], e["Title"], e["Type"], e["Times-Cited"], e["Usage-Count-Since-2013"]])
 bib_df = pd.DataFrame(reformatted, columns = ["Year", "Author", "Title", "Type", "Cited", "Usage"])
 bib_df["Date"] = pd.to_datetime(bib_df["Year"], format="%Y")