From 8f64604defdc9461be0470f4122678581e8ee4c6 Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Fri, 19 Jul 2024 18:09:07 +0200
Subject: [PATCH] chore(repo): Expose dataframes directly from source

---
 src/__init__.py                    |  13 ++++
 src/model/prisma.py                |   6 +-
 src/process/generate_dataframes.py | 114 +++++++++++++++++++----------
 3 files changed, 90 insertions(+), 43 deletions(-)
 create mode 100644 src/__init__.py

diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..b1857ec
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,13 @@
+from src.process.generate_dataframes import bib_sample, bib_sample_db_raw, df_by_intervention, df_main, df_validities
+
+# each observation in a single dataframe
+df = df_main()
+
+# all observations but split per individual intervention
+df_by_intervention = df_by_intervention()
+
+# Calc study validities (internal & external separated)
+validities = df_validities()
+
+bib_sample = bib_sample()
+bib_sample_db_raw = bib_sample_db_raw()
diff --git a/src/model/prisma.py b/src/model/prisma.py
index fe29532..576b14c 100644
--- a/src/model/prisma.py
+++ b/src/model/prisma.py
@@ -1,8 +1,8 @@
-from src.process.generate_dataframes import bib_sample, bib_sample_raw_db
+from src import bib_sample, bib_sample_db_raw
 
 
 class PrismaNumbers:
-    raw_db = len(bib_sample_raw_db.entries)
+    raw_db = len(bib_sample_db_raw.entries)
     raw_snowball = 2240
 
     # list of all keywords (semicolon-delimited string) for each entry in sample
@@ -43,7 +43,7 @@ class PrismaNumbers:
     final_extracted = len([1 for kw in all_kw if "done::extracted" in kw])
 
 
-del bib_sample, bib_sample_raw_db
+del bib_sample, bib_sample_db_raw
 
 if __name__ == "__main__":
     prisma = PrismaNumbers()
diff --git a/src/process/generate_dataframes.py b/src/process/generate_dataframes.py
index 9f6b498..72f6950 100644
--- a/src/process/generate_dataframes.py
+++ b/src/process/generate_dataframes.py
@@ -1,56 +1,90 @@
-from pathlib import Path
 import re
+
+import pandas as pd
+from bibtexparser import Library
+
 import src.globals as g
+from src.extract import load_data as load
+from src.model import validity
 
 ## Creates 3 important data structures:
 # df: The main dataframe containing all final sample studies
 # df_by_intervention: The same dataframe but split up by individual interventions per study
 # validities: The studies with their validities, containing only quasi-/experimental studies
-from src.process import add_metadata as meta
+
+# the complete library of sampled (and working) literature
+def bib_sample() -> Library:
+    return meta.bib_library_from_file(g.REFERENCE_DATA.joinpath("zotero-library.bib"))
+
 
 # raw database-search results
-bib_sample_raw_db = meta.bib_library_from_dir(g.REFERENCE_DATA.joinpath("db"))
-# the complete library of sampled (and working) literature
-bib_sample = meta.bib_library_from_file(g.REFERENCE_DATA.joinpath("zotero-library.bib"))
+def bib_sample_db_raw() -> Library:
+    return meta.bib_library_from_dir(g.REFERENCE_DATA.joinpath("db"))
 
-# load relevant studies
-from src.extract import load_data as load
+
 
-# each observation in a single dataframe
-df = meta.observations_with_metadata_df(
-    raw_observations = load.from_yml(g.EXTRACTED_DATA),
-    study_metadata = meta.bib_metadata_df(bib_sample),
-    country_groups = meta.country_groups_df(Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
-
-# all observations but split per individual intervention
-df_by_intervention = (
-    df
-    .fillna("")
-    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
-    .agg(
-        {
-            "intervention": lambda _col: "; ".join(_col),
-        }
-    )
-    .reset_index()
-    .drop_duplicates()
-    .assign(
-        intervention=lambda _df: _df["intervention"].apply(
-            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
+def df_main() -> pd.DataFrame:
+    df = meta.observations_with_metadata_df(
+        raw_observations=load.from_yml(g.EXTRACTED_DATA),
+        study_metadata=meta.bib_metadata_df(bib_sample()),
+        country_groups=meta.country_groups_df(
+            g.SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx")
         ),
     )
-    .explode("intervention")
-)
+    return df
 
-# Calc study validities (internal & external separated)
-from src.model import validity
-validities = validity.calculate(df_by_intervention)
-validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
-validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
-#validities["external_validity"] = validities["external_validity"].astype('category')
-validities["internal_validity"] = validities["internal_validity"].astype('category')
-validities["External Validity"] = validities["external_validity"]
-validities["Internal Validity"] = validities["internal_validity"]
+from src.process import add_metadata as meta
+
+
+def df_by_intervention() -> pd.DataFrame:
+    df_by_intervention = (
+        df_main()
+        .fillna("")
+        .groupby(
+            [
+                "author",
+                "year",
+                "title",
+                "design",
+                "method",
+                "representativeness",
+                "citation",
+            ]
+        )
+        .agg(
+            {
+                "intervention": lambda _col: "; ".join(_col),
+            }
+        )
+        .reset_index()
+        .drop_duplicates()
+        .assign(
+            intervention=lambda _df: _df["intervention"].apply(
+                lambda _cell: set(
+                    [x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")]
+                )
+            ),
+        )
+        .explode("intervention")
+    )
+    return df_by_intervention
+
+
+def df_validities() -> pd.DataFrame:
+    validities = validity.calculate(df_by_intervention())
+    validities["identifier"] = (
+        validities["author"].str.replace(r",.*$", "", regex=True)
+        + " ("
+        + validities["year"].astype(str)
+        + ")"
+    )
+    validities = validities.loc[
+        (validities["design"] == "quasi-experimental")
+        | (validities["design"] == "experimental")
+    ]
+    # validities["external_validity"] = validities["external_validity"].astype('category')
+    validities["internal_validity"] = validities["internal_validity"].astype("category")
+    validities["External Validity"] = validities["external_validity"]
+    validities["Internal Validity"] = validities["internal_validity"]
+    return validities
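
Usage sketch (reviewer note, not part of the commit): with this patch
applied, downstream code pulls the prepared objects straight from the
package root, the same way src/model/prisma.py now does. A minimal,
hypothetical consumer script, assuming the repository's reference data
under g.REFERENCE_DATA, g.EXTRACTED_DATA and g.SUPPLEMENTARY_DATA is in
place, since src/__init__.py materializes everything once at import time:

    # hypothetical consumer script, not part of this patch
    from src import bib_sample, bib_sample_db_raw, df, df_by_intervention, validities

    print(len(bib_sample.entries))         # sampled Zotero library size
    print(len(bib_sample_db_raw.entries))  # raw database-search results
    print(df.shape)                        # one row per extracted observation
    print(df_by_intervention["intervention"].nunique())  # distinct interventions
    print(validities[["Internal Validity", "External Validity"]].head())

The trade-off of this design: every import of the src package pays the full
cost of parsing the BibTeX libraries and assembling the dataframes up front,
after which all later accesses are free.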