feat(code): Add prisma calculation

2024-07-15 20:54:58 +02:00 · 2024-07-15 20:54:58 +02:00 · 38254d1605
commit 38254d1605
parent 0d05ed981a
7 changed files with 99 additions and 62 deletions
--- a/src/model/prisma.py
+++ b/src/model/prisma.py
@ -0,0 +1,52 @@
+from src.process import add_metadata as meta
+from src import globals as g
+
+bib_sample_raw_db = meta.bib_library_from_dir(g.RAW_DATA)  # library loaded from g.RAW_DATA; only its number of entries is used below
+bib_sample = meta.bib_library_from_dir(g.WORKING_DATA)  # library loaded from g.WORKING_DATA; the keywords of its entries drive the counts below
+
+class PrismaNumbers:
+    nr_database_query_raw = len(bib_sample_raw_db.entries)
+    nr_snowballing_raw = 2240
+
+    all_keywords = [entry["keywords"] for entry in bib_sample.entries if "keywords" in entry.fields_dict.keys()]
+    nr_database_deduplicated = len([1 for kw in all_keywords if "sample::database" in kw])
+    nr_snowballing_deduplicated = len([1 for kw in all_keywords if "sample::snowballing" in kw])
+    nr_out_superseded = len([1 for kw in all_keywords if "out::superseded" in kw])
+
+    FULL_RAW_SAMPLE_NOTHING_REMOVED = nr_database_query_raw + nr_snowballing_raw
+    FULL_SAMPLE_DUPLICATES_REMOVED = nr_database_deduplicated + nr_snowballing_deduplicated + nr_out_superseded
+
+    NON_ZOTERO_CAPTURE_TITLE_REMOVAL = 1150
+    NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL = 727
+    NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL = 348
+
+    nr_out_duplicates = FULL_RAW_SAMPLE_NOTHING_REMOVED - FULL_SAMPLE_DUPLICATES_REMOVED
+    nr_out_title = len([1 for kw in all_keywords if "out::title" in kw]) + NON_ZOTERO_CAPTURE_TITLE_REMOVAL
+    nr_out_abstract = len([1 for kw in all_keywords if "out::abstract" in kw]) + NON_ZOTERO_CAPTURE_ABSTRACT_REMOVAL
+    nr_out_fulltext = len([1 for kw in all_keywords if "out::full-text" in kw]) + NON_ZOTERO_CAPTURE_FULLTEXT_REMOVAL
+    nr_out_language = len([1 for kw in all_keywords if "out::language" in kw])
+    nr_extraction_done = len([1 for kw in all_keywords if "done::extracted" in kw])
+
+
+del bib_sample, bib_sample_raw_db  # drop the module-level library references; PrismaNumbers already holds the values derived from them
+
+if __name__ == "__main__":  # when run as a script: print the screening flow as Mermaid-style "flowchart TD" text
+    nr = PrismaNumbers()  # every count is a class attribute; the instance is only a short handle for the template below
+
+    # FIXME use 02-data/supplementary undeduplicated counts to get database starting and snowballing counts
+    outp = f"""
+    flowchart TD;
+        search_db["Records identified through database searching (n={nr.nr_database_query_raw})"] --> starting_sample;
+        search_prev["Records identified through other sources (n={nr.nr_snowballing_raw})"] --> starting_sample["Starting sample (n={nr.FULL_RAW_SAMPLE_NOTHING_REMOVED})"];
+
+        starting_sample -- "Duplicate removal ({nr.nr_out_duplicates+nr.nr_out_superseded} removed) "--> dedup["Records after duplicates removed (n={nr.FULL_SAMPLE_DUPLICATES_REMOVED})"];
+
+        dedup -- "Title screening ({nr.nr_out_title} excluded)" --> title_screened["Records after titles screened (n={nr.FULL_SAMPLE_DUPLICATES_REMOVED - nr.nr_out_title})"];
+
+        title_screened -- "Abstract screening ({nr.nr_out_abstract} excluded)"--> abstract_screened["Records after abstracts screened (n={nr.FULL_SAMPLE_DUPLICATES_REMOVED-nr.nr_out_title-nr.nr_out_abstract})"];
+
+        abstract_screened -- "  Language screening ({nr.nr_out_language} excluded)  "--> language_screened["Records after language screened (n={nr.FULL_SAMPLE_DUPLICATES_REMOVED-nr.nr_out_title-nr.nr_out_abstract-nr.nr_out_language})"];
+
+        language_screened -- "  Full-text screening ({nr.nr_out_fulltext} excluded)  "--> full-text_screened["Full-text articles assessed for eligibility (n={nr.nr_extraction_done})"];
+    """
+    print(outp)  # NOTE(review): the duplicate-removal label adds nr_out_superseded to the removed count, yet FULL_SAMPLE_DUPLICATES_REMOVED still includes it, so "starting - removed" differs from the next node's n by nr_out_superseded; the last node reports nr_extraction_done rather than a running difference -- confirm both are intended