chore(script): Refactor screening flowchart calculations

Made it much clearer and simpler how numbers are calculated for the
screening flowchart. Now we just keep the actual numbers in memory
and not a copy of the whole bibtex library for each calculation step.

Also renamed the bibtex variables to be more sane, `bib_sample_raw_db`
(the raw, unaltered sample returned from querying the databases), and
`bib_sample` for our working sample including database queries and
snowballing studies but already deduplicated (since we can't keep
an unduplicated version on Zotero).
This commit is contained in:
Marty Oehme 2023-12-09 21:56:44 +01:00
parent 708fa90d29
commit 3f05283f6d
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -46,13 +46,13 @@ bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"): for partial_bib in RAW_DATA.glob("**/*.bib"):
with open(partial_bib) as f: with open(partial_bib) as f:
bib_string+="\n".join(f.readlines()) bib_string+="\n".join(f.readlines())
sample_raw = bibtexparser.parse_string(bib_string) bib_sample_raw_db = bibtexparser.parse_string(bib_string)
bib_string="" bib_string=""
for partial_bib in WORKING_DATA.glob("**/*.bib"): for partial_bib in WORKING_DATA.glob("**/*.bib"):
with open(partial_bib) as f: with open(partial_bib) as f:
bib_string+="\n".join(f.readlines()) bib_string+="\n".join(f.readlines())
sample = bibtexparser.parse_string(bib_string) bib_sample = bibtexparser.parse_string(bib_string)
``` ```
# Introduction # Introduction
@ -388,22 +388,17 @@ and the sources will be added to the sample to undergo the same screening proces
#| echo: false #| echo: false
#| output: asis #| output: asis
sample_out_title = [] FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396
sample_out_abstract = [] nr_database_query_raw = len(bib_sample_raw_db.entries)
sample_out_fulltext = [] nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - len(bib_sample.entries)
sample_out_language = [] nr_other_sources = (len(bib_sample.entries) + nr_out_duplicates) - nr_database_query_raw
sample_relvant_done = []
for e in sample.entries: all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
if "keywords" in e.fields_dict.keys(): nr_out_title = len([1 for kw in all_keywords if "out::title" in kw])
if "out::title" in e["keywords"]: nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw])
sample_out_title.append(e) nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw])
elif "out::abstract" in e["keywords"]: nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
sample_out_abstract.append(e) nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
elif "out::full-text" in e["keywords"]:
sample_out_fulltext.append(e)
elif "done::extracted" in e["keywords"] and "relevant" in e["keywords"]:
sample_relvant_done.append(e)
t3 = "`" * 3 t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts # FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
@ -414,18 +409,18 @@ print(f"""
%%| fig-cap: "Sample sorting process through identification and screening" %%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6 %%| fig-width: 6
flowchart TD; flowchart TD;
search_db["Records identified through database searching (n=1643)"] --> starting_sample; search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n=753)"] --> starting_sample["Starting sample (n=2396)"]; search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({2396 - len(sample.entries)} removed) "--> dedup["Records after duplicates removed (n={len(sample.entries)})"]; starting_sample -- "Duplicate removal ({nr_out_duplicates} removed) "--> dedup["Records after duplicates removed (n={len(bib_sample.entries)})"];
dedup -- "Title screening ({len(sample_out_title)} excluded)" --> title_screened["Records after titles screened (n={len(sample.entries)-len(sample_out_title)})"]; dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={len(bib_sample.entries) - nr_out_title})"];
title_screened -- "Abstract screening ({len(sample_out_abstract)} excluded)"--> abstract_screened["Records after abstracts screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)}"]; title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract}"];
abstract_screened -- " Language screening ({len(sample_out_language)} excluded) "--> language_screened["Records after language screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)-len(sample_out_language)})"]; abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({len(sample_out_fulltext)} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={len(sample_relvant_done)})"]; language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3} {t3}
""") """)
``` ```
@ -437,18 +432,13 @@ The results to be identified in the matrix include a study's: i) key outcome m
```{python} ```{python}
#| echo: false #| echo: false
# TODO Remove redundant 'relevant' studies observation below once all studies are extracted.
sample_size_all = len(sample_raw.entries) nr_relevant = len([1 for kw in all_keywords if "relevant" in kw])
sample_relevant = []
for e in sample.entries:
if "keywords" in e.fields_dict.keys() and "relevant" in e["keywords"]:
sample_relevant.append(e)
md(f""" md(f"""
The query execution results in an initial sample of {sample_size_all} potential studies after the identification process. The query execution results in an initial sample of {nr_database_query_raw} potential studies identified from the database search as well as {nr_other_sources} potential studies from other sources, leading to a total initial number of {FULL_RAW_SAMPLE_NOTHING_REMOVED}.
This accounts for all identified studies without duplicate removal, without controlling for literature that has been superseded or applying any other screening criteria. This accounts for all identified studies without duplicate removal, without controlling for literature that has been superseded or applying any other screening criteria.
Of these, {len(sample_relevant)} have been identified as relevant studies for the purposes of this scoping review. Of these, {nr_relevant} have been identified as potentially relevant studies for the purposes of this scoping review, from which {nr_extraction_done} have been extracted.
""") """)
``` ```