From ed6c8550b688abc7094727720d647fba42df9e50 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sun, 14 Jul 2024 20:18:57 +0200
Subject: [PATCH] feat(script): Move big code chunks out of script

---
 01-codechunks/_prep-data.py        | 37 ++++++++++++
 01-codechunks/_prisma-flowchart.py | 47 +++++++++++++++
 article.qmd                        | 90 ++----------------------------
 scoping_review.qmd                 | 93 +----------------------------
 4 files changed, 94 insertions(+), 173 deletions(-)
 create mode 100644 01-codechunks/_prep-data.py
 create mode 100644 01-codechunks/_prisma-flowchart.py

diff --git a/01-codechunks/_prep-data.py b/01-codechunks/_prep-data.py
new file mode 100644
index 0000000..8b3f088
--- /dev/null
+++ b/01-codechunks/_prep-data.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+import re
+## standard imports
+from IPython.core.display import Markdown as md
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from tabulate import tabulate
+import bibtexparser
+
+sns.set_style("whitegrid")
+
+DATA_DIR=Path("./02-data")
+RAW_DATA=DATA_DIR.joinpath("raw")
+WORKING_DATA=DATA_DIR.joinpath("intermediate")
+PROCESSED_DATA=DATA_DIR.joinpath("processed")
+SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
+
+from src import prep_data
+
+# raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
+# the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
+
+# load relevant studies
+from src import load_data
+
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = load_data.from_yml(PROCESSED_DATA),
+    study_metadata = prep_data.bib_metadata_df(bib_sample),
+    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
+)
+raw_observations = None
+zot_df = None
+df_country_groups = None
diff --git a/01-codechunks/_prisma-flowchart.py b/01-codechunks/_prisma-flowchart.py
new file mode 100644
index 0000000..83b76f7
--- /dev/null
+++ b/01-codechunks/_prisma-flowchart.py
@@ -0,0 +1,47 @@
+nr_database_query_raw = len(bib_sample_raw_db.entries)
+nr_snowballing_raw = 2240
+
+all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
+nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
+nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
+nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
+
+FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
+FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
+
+NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
+NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
+NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
+
+nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
+nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
+nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
+nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
+nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
+nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
+
+t3 = "`" * 3
+# FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
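+# NOTE: run via `include` inside a {python} cell with `#| output: asis`, so the
+# mermaid block printed below is picked up by Quarto and rendered as a diagram.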
+# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
+print(f"""
+```{{mermaid}}
+%%| label: fig-prisma
+%%| fig-cap: "Sample sorting process through identification and screening"
+%%| fig-width: 6
+flowchart TD;
+    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
+    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
+
+    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
+
+    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
+
+    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
+
+    abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
+
+    language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
+{t3}
+""")
diff --git a/article.qmd b/article.qmd
index 0cbd99a..d70885f 100644
--- a/article.qmd
+++ b/article.qmd
@@ -38,51 +38,16 @@ crossref: # to fix the appendix crossrefs being separate from main
   latex-list-of-description: Appendix B Table
 ---
 
-{{< portrait >}}
 
 ```{python}
 #| label: load-data
 #| echo: false
 #| output: false
-from pathlib import Path
-import re
-## standard imports
-from IPython.core.display import Markdown as md
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-import seaborn as sns
-from tabulate import tabulate
-import bibtexparser
-
-sns.set_style("whitegrid")
-
-DATA_DIR=Path("./02-data")
-RAW_DATA=DATA_DIR.joinpath("raw")
-WORKING_DATA=DATA_DIR.joinpath("intermediate")
-PROCESSED_DATA=DATA_DIR.joinpath("processed")
-SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
-
-from src import prep_data
-
-# raw database-search results
-bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
-# the complete library of sampled (and working) literature
-bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
-
-# load relevant studies
-from src import load_data
-
-bib_df = prep_data.observations_with_metadata_df(
-    raw_observations = load_data.from_yml(PROCESSED_DATA),
-    study_metadata = prep_data.bib_metadata_df(bib_sample),
-    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
-raw_observations = None
-zot_df = None
-df_country_groups = None
+{{< include 01-codechunks/_prep-data.py >}}
 ```
 
+{{< portrait >}}
+
 # Introduction
 
 * Context and statement of the problem
@@ -124,8 +89,6 @@ with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
 ::: {#tbl-inclusion-criteria}
 ```{python}
-#| label: tbl-inclusion-criteria
-
 inclusion_criteria = pd.read_csv("02-data/supplementary/inclusion-criteria.tsv", sep="\t")
 md(tabulate(inclusion_criteria, showindex=False, headers="keys", tablefmt="grid"))
 ```
 
@@ -177,52 +140,7 @@ ultimately resulting in the process represented in the PRISMA chart in @fig-pris
 ```{python}
 #| label: calculate-scoping-flowchart
 #| echo: false
 #| output: asis
-
-nr_database_query_raw = len(bib_sample_raw_db.entries)
-nr_snowballing_raw = 2240
-
-all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
-nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
-nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
-nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
-
-FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
-FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
-
-NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
-NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
-NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
-
-nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
-nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
-nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
-nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
-nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
-nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
-
-t3 = "`" * 3
-# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
-# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
-print(f"""
-```{{mermaid}}
-%%| label: fig-prisma
-%%| fig-cap: "Sample sorting process through identification and screening"
-%%| fig-width: 6
-flowchart TD;
-    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
-    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
-
-    starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"];
-
-    dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"];
-
-    title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"];
-
-    abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"];
-
-    language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"];
-{t3}
-""")
+{{< include 01-codechunks/_prisma-flowchart.py >}}
 ```
 All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.
diff --git a/scoping_review.qmd b/scoping_review.qmd
index 85605fa..a2e50bc 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -21,47 +21,11 @@ crossref:
   latex-list-of-description: Appendix Table
 ---
 
-```{python}
-#| label: load-data
-#| echo: false
-from pathlib import Path
-import re
-## standard imports
-from IPython.core.display import Markdown as md
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-import seaborn as sns
-from tabulate import tabulate
-import bibtexparser
-
-sns.set_style("whitegrid")
-
-DATA_DIR=Path("./02-data")
-RAW_DATA=DATA_DIR.joinpath("raw")
-WORKING_DATA=DATA_DIR.joinpath("intermediate")
-PROCESSED_DATA=DATA_DIR.joinpath("processed")
-SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
-
-from src import prep_data
-
-# raw database-search results
-bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
-# the complete library of sampled (and working) literature
-bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
-
-# load relevant studies
-from src import load_data
-
-bib_df = prep_data.observations_with_metadata_df(
-    raw_observations = load_data.from_yml(PROCESSED_DATA),
-    study_metadata = prep_data.bib_metadata_df(bib_sample),
-    country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
-raw_observations = None
-zot_df = None
-df_country_groups = None
-```
+```{python}
+#| label: load-data
+#| echo: false
+{{< include 01-codechunks/_prep-data.py >}}
+```
 
 {{< pagebreak >}}
 
@@ -415,52 +379,7 @@ and the sources will be added to the sample to undergo the same screening proces
 ```{python}
 #| label: calculate-scoping-flowchart
 #| echo: false
 #| output: asis
-
-nr_database_query_raw = len(bib_sample_raw_db.entries)
-nr_snowballing_raw = 2240
-
-all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
-nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
-nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
-nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
-
-FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
-FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
-
-NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
-NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
-NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
-
-nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
-nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
-nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
-nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
-nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
-nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
-
-t3 = "`" * 3
-# FIXME use 02-data/supplementary undeduplciated counts to get database starting and snowballing counts
-# from: https://github.com/quarto-dev/quarto-cli/discussions/6508
-print(f"""
-```{{mermaid}}
-%%| label: fig-prisma
-%%| fig-cap: "Sample sorting process through identification and screening"
-%%| fig-width: 6
-flowchart TD;
-    search_db["Records identified through database searching (n={nr_database_query_raw})"] --> starting_sample;
-    search_prev["Records identified through other sources (n={nr_snowballing_raw})"] --> starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
starting_sample["Starting sample (n={FULL_RAW_SAMPLE_NOTHING_REMOVED})"]; - - starting_sample -- "Duplicate removal ({nr_out_duplicates+nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={FULL_SAMPLE_DUPLICATES_REMOVED})"]; - - dedup -- "Title screening ({nr_out_title} excluded)" --> title_screened["Records after titles screened (n={FULL_SAMPLE_DUPLICATES_REMOVED - nr_out_title})"]; - - title_screened -- "Abstract screening ({nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract})"]; - - abstract_screened -- " Language screening ({nr_out_language} excluded) "--> language_screened["Records after language screened (n={FULL_SAMPLE_DUPLICATES_REMOVED-nr_out_title-nr_out_abstract-nr_out_language})"]; - - language_screened -- " Full-text screening ({nr_out_fulltext} excluded) "--> full-text_screened["Full-text articles assessed for eligibility (n={nr_extraction_done})"]; -{t3} -""") +{{< include 01-codechunks/_prisma-flowchart.py >}} ``` All relevant data concerning both their major findings and statistical significance are then extracted from the individual studies into a collective results matrix.