chore(script): Refactor screening flowchart calculations
Made the calculation of the numbers for the screening flowchart much clearer and simpler. We now keep only the actual counts in memory instead of a copy of the whole BibTeX library for each calculation step. Also renamed the BibTeX variables to be more sensible: `bib_sample_raw_db` (the raw, unaltered sample returned from querying the databases) and `bib_sample` (our working sample, which includes database queries and snowballing studies but is already deduplicated, since we can't keep a non-deduplicated version on Zotero).
This commit is contained in:
parent
708fa90d29
commit
3f05283f6d
1 changed files with 23 additions and 33 deletions
|
@ -46,13 +46,13 @@ bib_string=""
|
||||||
for partial_bib in RAW_DATA.glob("**/*.bib"):
|
for partial_bib in RAW_DATA.glob("**/*.bib"):
|
||||||
with open(partial_bib) as f:
|
with open(partial_bib) as f:
|
||||||
bib_string+="\n".join(f.readlines())
|
bib_string+="\n".join(f.readlines())
|
||||||
sample_raw = bibtexparser.parse_string(bib_string)
|
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
|
||||||
|
|
||||||
bib_string=""
|
bib_string=""
|
||||||
for partial_bib in WORKING_DATA.glob("**/*.bib"):
|
for partial_bib in WORKING_DATA.glob("**/*.bib"):
|
||||||
with open(partial_bib) as f:
|
with open(partial_bib) as f:
|
||||||
bib_string+="\n".join(f.readlines())
|
bib_string+="\n".join(f.readlines())
|
||||||
sample = bibtexparser.parse_string(bib_string)
|
bib_sample = bibtexparser.parse_string(bib_string)
|
||||||
```
|
```
|
||||||
|
|
||||||
# Introduction
|
# Introduction
|
||||||
|
@ -388,22 +388,17 @@ and the sources will be added to the sample to undergo the same screening proces
|
||||||
#| echo: false
|
#| echo: false
|
||||||
#| output: asis
|
#| output: asis
|
||||||
|
|
||||||
sample_out_title = []
|
FULL_RAW_SAMPLE_NOTHING_REMOVED = 2396
|
||||||
sample_out_abstract = []
|
nr_database_query_raw = len(bib_sample_raw_db.entries)
|
||||||
sample_out_fulltext = []
|
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - len(bib_sample.entries)
|
||||||
sample_out_language = []
|
nr_other_sources = (len(bib_sample.entries) + nr_out_duplicates) - nr_database_query_raw
|
||||||
sample_relvant_done = []
|
|
||||||
|
|
||||||
for e in sample.entries:
|
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
|
||||||
if "keywords" in e.fields_dict.keys():
|
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw])
|
||||||
if "out::title" in e["keywords"]:
|
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw])
|
||||||
sample_out_title.append(e)
|
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw])
|
||||||
elif "out::abstract" in e["keywords"]:
|
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
|
||||||
sample_out_abstract.append(e)
|
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
|
||||||
elif "out::full-text" in e["keywords"]:
|
|
||||||
sample_out_fulltext.append(e)
|
|
||||||
elif "done::extracted" in e["keywords"] and "relevant" in e["keywords"]:
|
|
||||||
sample_relvant_done.append(e)
|
|
||||||
|
|
||||||
t3 = "`" * 3
|
t3 = "`" * 3
|
||||||
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
|
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
|
||||||
|
@ -414,18 +409,18 @@ print(f"""
|
||||||
%%| fig-cap: "Sample sorting process through identification and screening"
|
%%| fig-cap: "Sample sorting process through identification and screening"
|
||||||
%%| fig-width: 6
|
%%| fig-width: 6
|
||||||
flowchart TD;
|
flowchart TD;
|
||||||
search_db["Records identified through database searching (n=1643)"] --> starting_sample;
|
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
|
||||||
search_prev["Records identified through other sources (n=753)"] --> starting_sample["Starting sample (n=2396)"];
|
search_prev["Records identified through other sources (n={nr_other_sources})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
|
||||||
|
|
||||||
starting_sample -- "Duplicate removal ({2396 - len(sample.entries)} removed) "--> dedup["Records after duplicates removed (n={len(sample.entries)})"];
|
starting_sample -- "Duplicate removal ({nr_out_duplicates} removed) "--> dedup["Records after duplicates removed (n={len(bib_sample.entries)})"];
|
||||||
|
|
||||||
dedup -- "Title screening ({len(sample_out_title)} excluded)" --> title_screened["Records after titles screened (n={len(sample.entries)-len(sample_out_title)})"];
|
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={len(bib_sample.entries) - nr_out_title})"];
|
||||||
|
|
||||||
title_screened -- "Abstract screening ({len(sample_out_abstract)} excluded)"--> abstract_screened["Records after abstracts screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)}"];
|
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract}"];
|
||||||
|
|
||||||
abstract_screened -- " Language screening ({len(sample_out_language)} excluded) "--> language_screened["Records after language screened (n={len(sample.entries)-len(sample_out_title)-len(sample_out_abstract)-len(sample_out_language)})"];
|
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={len(bib_sample.entries)-nr_out_title-nr_out_abstract-nr_out_language})"];
|
||||||
|
|
||||||
language_screened -- " Full-text screening ({len(sample_out_fulltext)} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={len(sample_relvant_done)})"];
|
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
|
||||||
{t3}
|
{t3}
|
||||||
""")
|
""")
|
||||||
```
|
```
|
||||||
|
@ -437,18 +432,13 @@ The results to be identified in the matrix include a study’s: i) key outcome m
|
||||||
|
|
||||||
```{python}
|
```{python}
|
||||||
#| echo: false
|
#| echo: false
|
||||||
|
# TODO Remove redundant 'relevant' studies observation below once all studies are extracted.
|
||||||
sample_size_all = len(sample_raw.entries)
|
nr_relevant = len([1 for kw in all_keywords if "relevant" in kw])
|
||||||
|
|
||||||
sample_relevant = []
|
|
||||||
for e in sample.entries:
|
|
||||||
if "keywords" in e.fields_dict.keys() and "relevant" in e["keywords"]:
|
|
||||||
sample_relevant.append(e)
|
|
||||||
|
|
||||||
md(f"""
|
md(f"""
|
||||||
The query execution results in an initial sample of {sample_size_all} potential studies after the identification process.
|
The query execution results in an initial sample of {nr_database_query_raw} potential studies identified from the database search as well as {nr_other_sources} potential studies from other sources, leading to a total initial number of {FULL_RAW_SAMPLE_NOTHING_REMOVED}.
|
||||||
This accounts for all identified studies without duplicate removal, without controlling for literature that has been superseded or applying any other screening criteria.
|
This accounts for all identified studies without duplicate removal, without controlling for literature that has been superseded or applying any other screening criteria.
|
||||||
Of these, {len(sample_relevant)} have been identified as relevant studies for the purposes of this scoping review.
|
Of these, {nr_relevant} have been identified as potentially relevant studies for the purposes of this scoping review, from which {nr_extraction_done} have been extracted.
|
||||||
""")
|
""")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue