feat(script): Move big code chunks out of script

Marty Oehme 2024-07-14 20:18:57 +02:00
parent 76578e99d3
commit ed6c8550b6
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
4 changed files with 88 additions and 173 deletions

01-codechunks/_prep-data.py (new file)

@@ -0,0 +1,37 @@
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
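# data directory layout used by the loading steps below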
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
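# project-local helpers (src/) for building the bibliography and observation data frames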
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
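# clear intermediate objects; zot_df and df_country_groups appear to be holdovers from the earlier inline version of this chunk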
raw_observations = None
zot_df = None
df_country_groups = None

01-codechunks/_prisma-flowchart.py (new file)

@@ -0,0 +1,45 @@
nr_database_query_raw = len(bib_sample_raw_db.entries)
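# snowballed records before deduplication; apparently not tracked in the .bib exports, so the count is hard-coded for now (see FIXME below)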
nr_snowballing_raw = 2240
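# one keywords field per entry that has one; sampling and screening status is encoded as tags such as 'sample::database' or 'out::title'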
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
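# exclusions that were presumably never captured in the Zotero keyword workflow, entered here as fixed tallies (see FIXME below)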
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
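# per-stage exclusion counts, derived from the keyword tags plus the fixed tallies above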
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
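# triple backticks for closing the generated mermaid fence, built programmatically (presumably so a literal fence cannot prematurely close the surrounding .qmd chunk)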
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
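# the qmd chunk including this file runs with '#| output: asis', so the printed fenced block is rendered by Quarto as a mermaid cell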
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")


@@ -38,51 +38,16 @@ crossref: # to fix the appendix crossrefs being separate from main
latex-list-of-description: Appendix B Table
---
{{< portrait >}}
```{python}
#| label: load-data
#| echo: false
#| output: false
from pathlib import Path
{{< include 01-codechunks/_prep-data.py >}}
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```
{{< portrait >}}
# Introduction
* Context and statement of the problem
@@ -124,8 +89,6 @@ with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
::: {#tbl-inclusion-criteria}
```{python}
#| label: tbl-inclusion-criteria
inclusion_criteria = pd.read_csv("02-data/supplementary/inclusion-criteria.tsv", sep="\t")
md(tabulate(inclusion_criteria, showindex=False, headers="keys", tablefmt="grid"))
```
@@ -177,52 +140,7 @@ ultimately resulting in the process represented in the PRISMA chart in @fig-prisma
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis
{{< include 01-codechunks/_prisma-flowchart.py >}}
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
```
All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.


@@ -21,47 +21,7 @@ crossref:
latex-list-of-description: Appendix Table
---
```{python}
{{< include 01-codechunks/_prep-data.qmd >}}
#| label: load-data
#| echo: false
from pathlib import Path
import re
## standard imports
from IPython.core.display import Markdown as md
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tabulate import tabulate
import bibtexparser
sns.set_style("whitegrid")
DATA_DIR=Path("./02-data")
RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
from src import prep_data
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
)
raw_observations = None
zot_df = None
df_country_groups = None
```
<!-- pagebreak to separate from TOC -->
{{< pagebreak >}}
@@ -415,52 +375,7 @@ and the sources will be added to the sample to undergo the same screening process
#| label: calculate-scoping-flowchart
#| echo: false
#| output: asis
{{< include 01-codechunks/_prisma-flowchart.py >}}
nr_database_query_raw = len(bib_sample_raw_db.entries)
nr_snowballing_raw = 2240
all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
t3 = "`" * 3
# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
print(f"""
```{{mermaid}}
%%| label: fig-prisma
%%| fig-cap: "Sample sorting process through identification and screening"
%%| fig-width: 6
flowchart TD;
search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
{t3}
""")
```
All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.