feat(script): Move big code chunks out of script
parent 76578e99d3
commit ed6c8550b6

4 changed files with 88 additions and 173 deletions
01-codechunks/_prep-data.py (new file, +37)

@@ -0,0 +1,37 @@
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
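The extracted chunk is wired back into each document through Quarto's include shortcode placed inside the executed cell, so the Python source now lives in a single file. A minimal sketch of the consuming cell, with the cell options taken from the article.qmd hunk further down:

```{python}
#| label: load-data
#| echo: false
#| output: false
{{< include 01-codechunks/_prep-data.py >}}
```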
01-codechunks/_prisma-flowchart.py (new file, +45)

@@ -0,0 +1,45 @@
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240

all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])

FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded

NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348

nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])

t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed)" --> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];

    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];

    title_screened -- "Abstract screening ({nr_out_abstract} excluded)" --> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract})"];

    abstract_screened -- "Language screening ({nr_out_language} excluded)" --> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract - nr_out_language})"];

    language_screened -- "Full-text screening ({nr_out_fulltext} excluded)" --> full_text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
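This chunk uses the pattern from the quarto-cli discussion linked above: with the host cell set to `#| output: asis`, whatever the cell prints is rendered as raw Markdown, and the closing backtick fence is assembled at runtime (`t3`) so it cannot prematurely terminate the surrounding cell. A stripped-down sketch of that pattern; the node names and count are illustrative only, not from the commit:

# host cell option assumed: #| output: asis
t3 = "`" * 3  # three-backtick fence built at runtime so it does not close the host cell
n_total = 42  # hypothetical count, for illustration only
print(f"""
{t3}{{mermaid}}
flowchart TD;
    identified["Records identified (n={n_total})"] --> screened["Records screened"];
{t3}
""")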
article.qmd (90 lines changed)

@@ -38,51 +38,16 @@ crossref: # to fix the appendix crossrefs being separate from main
latex-list-of-description: Appendix B Table
---

{{< portrait >}}

```{python}
#| label: load-data
#| echo: false
#| output: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
{{< include 01-codechunks/_prep-data.py >}}
```

{{< portrait >}}

# Introduction

* Context and statement of the problem

@@ -124,8 +89,6 @@ with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
::: {#tbl-inclusion-criteria}

```{python}
#| label: tbl-inclusion-criteria

inclusion_criteria = pd.read_csv("02-data/supplementary/inclusion-criteria.tsv", sep="\t")
md(tabulate(inclusion_criteria, showindex=False, headers="keys", tablefmt="grid"))
```

@@ -177,52 +140,7 @@ ultimately resulting in the process represented in the PRISMA chart in @fig-prisma
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis

nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240

all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])

FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded

NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348

nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])

t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed)" --> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];

    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];

    title_screened -- "Abstract screening ({nr_out_abstract} excluded)" --> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract})"];

    abstract_screened -- "Language screening ({nr_out_language} excluded)" --> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract - nr_out_language})"];

    language_screened -- "Full-text screening ({nr_out_fulltext} excluded)" --> full_text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
{{< include 01-codechunks/_prisma-flowchart.py >}}
```

All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.

@@ -21,47 +21,7 @@ crossref:
latex-list-of-description: Appendix Table
---

```{python}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser

sns.set_style("whitegrid")

DATA_DIR = Path("./02-data")
RAW_DATA = DATA_DIR.joinpath("raw")
WORKING_DATA = DATA_DIR.joinpath("intermediate")
PROCESSED_DATA = DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")

from src import prep_data

# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# load relevant studies
from src import load_data

bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
{{< include 01-codechunks/_prep-data.py >}}
```

<!-- pagebreak to separate from TOC -->
{{< pagebreak >}}

@@ -415,52 +375,7 @@ and the sources will be added to the sample to undergo the same screening process
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis

nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240

all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])

FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded

NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348

nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])

t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];

    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed)" --> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];

    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];

    title_screened -- "Abstract screening ({nr_out_abstract} excluded)" --> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract})"];

    abstract_screened -- "Language screening ({nr_out_language} excluded)" --> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title - nr_out_abstract - nr_out_language})"];

    language_screened -- "Full-text screening ({nr_out_fulltext} excluded)" --> full_text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
{{< include 01-codechunks/_prisma-flowchart.py >}}
```

All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.