feat(script): Move big code chunks out of script

2024-07-14 20:18:57 +02:00 · 2024-07-14 20:18:57 +02:00 · ed6c8550b6
commit ed6c8550b6
parent 76578e99d3
4 changed files with 88 additions and 173 deletions
--- a/01-codechunks/_prep-data.py
+++ b/01-codechunks/_prep-data.py
@ -0,0 +1,37 @@
+from pathlib import Path
+import re
+## standard imports
+from IPython.core.display import Markdown as md
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from tabulate import tabulate
+import bibtexparser
+
+sns.set_style("whitegrid")
+
+DATA_DIR=Path("./02-data")
+RAW_DATA=DATA_DIR.joinpath("raw")
+WORKING_DATA=DATA_DIR.joinpath("intermediate")
+PROCESSED_DATA=DATA_DIR.joinpath("processed")
+SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
+
+from src import prep_data
+
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
+
+# load relevant studies
+from src import load_data
+
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = load_data.from_yml(PROCESSED_DATA),
+    study_metadata = prep_data.bib_metadata_df(bib_sample),
+    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
+)
+raw_observations = None
+zot_df = None
+df_country_groups = None
--- a/01-codechunks/_prisma-flowchart.py
+++ b/01-codechunks/_prisma-flowchart.py
@ -0,0 +1,45 @@
+nr_database_query_raw = len(bib_sample_raw_db.entries)
+nr_snowballing_raw = 2240
+
+all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
+nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
+nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
+nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
+
+FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
+FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
+
+NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
+NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
+NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
+
+nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
+nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
+nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
+nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
+nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
+nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
+
+t3 = "`" * 3
+# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
+# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
+print(f"""
+```{{mermaid}}
+%%| label: fig-prisma
+%%| fig-cap: "Sample sorting process through identification and screening"
+%%| fig-width: 6
+flowchart TD;
+    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
+    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
+
+    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
+
+    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
+
+    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
+
+    abstract_screened -- "  Language screening ({nr_out_language} excluded)  "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
+
+    language_screened -- "  Full-text screening ({nr_out_fulltext} excluded)  "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
+{t3}
+""")
--- a/article.qmd
+++ b/article.qmd
@ -38,51 +38,16 @@ crossref: # to fix the appendix crossrefs being separate from main
      latex-list-of-description: Appendix B Table
 ---

-{{< portrait >}}

 ```{python}
 #| label: load-data
 #| echo: false
 #| output: false
-from pathlib import Path
-import re
-## standard imports
-from IPython.core.display import Markdown as md
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-import seaborn as sns
-from tabulate import tabulate
-import bibtexparser
-
-sns.set_style("whitegrid")
-
-DATA_DIR=Path("./02-data")
-RAW_DATA=DATA_DIR.joinpath("raw")
-WORKING_DATA=DATA_DIR.joinpath("intermediate")
-PROCESSED_DATA=DATA_DIR.joinpath("processed")
-SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
-
-from src import prep_data
-
-# raw database-search results
-bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
-# the complete library of sampled (and working) literature
-bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
-
-# load relevant studies
-from src import load_data
-
-bib_df = prep_data.observations_with_metadata_df(
-    raw_observations = load_data.from_yml(PROCESSED_DATA),
-    study_metadata = prep_data.bib_metadata_df(bib_sample),
-    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
-raw_observations = None
-zot_df = None
-df_country_groups = None
+{{< include 01-codechunks/_prep-data.py >}}
 ```

+{{< portrait >}}
+
 # Introduction

 * Context and statement of the problem
@ -124,8 +89,6 @@ with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
 ::: {#tbl-inclusion-criteria}

 ```{python}
-#| label: tbl-inclusion-criteria
-
 inclusion_criteria = pd.read_csv("02-data/supplementary/inclusion-criteria.tsv", sep="\t")
 md(tabulate(inclusion_criteria, showindex=False, headers="keys", tablefmt="grid"))
 ```
@ -177,52 +140,7 @@ ultimately resulting in the process represented in the PRISMA chart in @fig-pris
 #| label: calculate-scoping-flowchart
 #| echo: false
 #| output: asis
-
-nr_database_query_raw = len(bib_sample_raw_db.entries)
-nr_snowballing_raw = 2240
-
-all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
-nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
-nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
-nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
-
-FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
-FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
-
-NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
-NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
-NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
-
-nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
-nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
-nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
-nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
-nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
-nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
-
-t3 = "`" * 3
-# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
-# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
-print(f"""
-```{{mermaid}}
-%%| label: fig-prisma
-%%| fig-cap: "Sample sorting process through identification and screening"
-%%| fig-width: 6
-flowchart TD;
-    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
-    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
-
-    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
-
-    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
-
-    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
-
-    abstract_screened -- "  Language screening ({nr_out_language} excluded)  "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
-
-    language_screened -- "  Full-text screening ({nr_out_fulltext} excluded)  "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
-{t3}
-""")
+{{< include 01-codechunks/_prisma-flowchart.py >}}
 ```

 All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@ -21,47 +21,7 @@ crossref:
      latex-list-of-description: Appendix Table
 ---

-```{python}
-#| label: load-data
-#| echo: false
-from pathlib import Path
-import re
-## standard imports
-from IPython.core.display import Markdown as md
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-import seaborn as sns
-from tabulate import tabulate
-import bibtexparser
-
-sns.set_style("whitegrid")
-
-DATA_DIR=Path("./02-data")
-RAW_DATA=DATA_DIR.joinpath("raw")
-WORKING_DATA=DATA_DIR.joinpath("intermediate")
-PROCESSED_DATA=DATA_DIR.joinpath("processed")
-SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
-
-from src import prep_data
-
-# raw database-search results
-bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
-# the complete library of sampled (and working) literature
-bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
-
-# load relevant studies
-from src import load_data
-
-bib_df = prep_data.observations_with_metadata_df(
-    raw_observations = load_data.from_yml(PROCESSED_DATA),
-    study_metadata = prep_data.bib_metadata_df(bib_sample),
-    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
-raw_observations = None
-zot_df = None
-df_country_groups = None
-```
+{{< include 01-codechunks/_prep-data.qmd >}}

 <!-- pagebreak to separate from TOC -->
 {{< pagebreak >}}
@ -415,52 +375,7 @@ and the sources will be added to the sample to undergo the same screening proces
 #| label: calculate-scoping-flowchart
 #| echo: false
 #| output: asis
-
-nr_database_query_raw = len(bib_sample_raw_db.entries)
-nr_snowballing_raw = 2240
-
-all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
-nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
-nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
-nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
-
-FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
-FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
-
-NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
-NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
-NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
-
-nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
-nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
-nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
-nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
-nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
-nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
-
-t3 = "`" * 3
-# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
-# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
-print(f"""
-```{{mermaid}}
-%%| label: fig-prisma
-%%| fig-cap: "Sample sorting process through identification and screening"
-%%| fig-width: 6
-flowchart TD;
-    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
-    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
-
-    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
-
-    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
-
-    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
-
-    abstract_screened -- "  Language screening ({nr_out_language} excluded)  "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
-
-    language_screened -- "  Full-text screening ({nr_out_fulltext} excluded)  "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
-{t3}
-""")
+{{< include 01-codechunks/_prisma-flowchart.py >}}
 ```

 All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.