feat(script): Move big code chunks out of script

Marty Oehme 2024-07-14 20:18:57 +02:00
parent 76578e99d3
commit ed6c8550b6
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
4 changed files with 88 additions and 173 deletions

01-codechunks/_prep-data.py (new file)

@@ -0,0 +1,37 @@
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
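# data directory layout used by the loading steps below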
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
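# project-local helpers (src/) for building the bibliography and observation data frames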
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
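# clear intermediate objects; zot_df and df_country_groups appear to be holdovers from the earlier inline version of this chunk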
raw_observations = None
zot_df = None
df_country_groups = None

01-codechunks/_prisma-flowchart.py (new file)

@@ -0,0 +1,45 @@
nr_database_query_raw = len(bib_sample_raw_db.entries)
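# snowballed records before deduplication; apparently not tracked in the .bib exports, so the count is hard-coded for now (see FIXME below)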
nr_snowballing_raw = 2240
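# one keywords field per entry that has one; sampling and screening status is encoded as tags such as 'sample::database' or 'out::title'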
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
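# exclusions that were presumably never captured in the Zotero keyword workflow, entered here as fixed tallies (see FIXME below)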
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
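# per-stage exclusion counts, derived from the keyword tags plus the fixed tallies above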
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
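# triple backticks for closing the generated mermaid fence, built programmatically (presumably so a literal fence cannot prematurely close the surrounding .qmd chunk)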
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
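# the qmd chunk including this file runs with '#| output: asis', so the printed fenced block is rendered by Quarto as a mermaid cell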
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")


@@ -38,51 +38,16 @@ crossref: # to fix the appendix crossrefs being separate from main
latex-list-of-description: Appendix B Table
---
{{< portrait >}}
```{python}
#| label: load-data
#| echo: false
#| output: false
from pathlib import Path
{{< include 01-codechunks/_prep-data.py >}}
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```
{{< portrait >}}
# Introduction
* Context and statement of the problem
@@ -124,8 +89,6 @@ with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
::: {#tbl-inclusion-criteria}
```{python}
#| label: tbl-inclusion-criteria
inclusion_criteria = pd.read_csv("02-data/supplementary/inclusion-criteria.tsv", sep="\t")
md(tabulate(inclusion_criteria, showindex=False, headers="keys", tablefmt="grid"))
```
@@ -177,52 +140,7 @@ ultimately resulting in the process represented in the PRISMA chart in @fig-prisma
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis
{{< include 01-codechunks/_prisma-flowchart.py >}}
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
```
All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.


@@ -21,47 +21,7 @@ crossref:
latex-list-of-description: Appendix Table
---
```{python}
{{< include 01-codechunks/_prep-data.qmd >}}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```
<!-- pagebreak to separate from TOC -->
{{< pagebreak >}}
@@ -415,52 +375,7 @@ and the sources will be added to the sample to undergo the same screening process
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis
{{< include 01-codechunks/_prisma-flowchart.py >}}
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
```
All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.