chore(code): Refactor data preparation into process module

This commit is contained in:
Marty Oehme 2024-07-15 20:42:11 +02:00
parent 562b1eb6a0
commit d2c25a9033
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 7 additions and 4 deletions

View file

@ -18,8 +18,12 @@ RAW_DATA=DATA_DIR.joinpath("raw")
WORKING_DATA=DATA_DIR.joinpath("intermediate") WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed") PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary") SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
## Creates three important data structures:
# df: The main dataframe containing all studies of the final sample
# df_by_intervention: The same dataframe, but split up by the individual interventions of each study
# validities: The studies with their validities, containing only quasi-experimental and experimental studies
from src import prep_data from src.process import add_metadata as meta
# raw database-search results # raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA) bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
@ -27,13 +31,13 @@ bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA) bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies # load relevant studies
from src import load_data from src.extract import load_data as load
# each observation in a single dataframe # each observation in a single dataframe
df = prep_data.observations_with_metadata_df(
raw_observations = load_data.from_yml(PROCESSED_DATA), raw_observations = load_data.from_yml(PROCESSED_DATA),
study_metadata = prep_data.bib_metadata_df(bib_sample), study_metadata = prep_data.bib_metadata_df(bib_sample),
country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")), country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
df = meta.observations_with_metadata_df(
) )
# all observations but split per individual intervention # all observations but split per individual intervention
@ -66,4 +70,3 @@ validities = validities.loc[(validities["design"] == "quasi-experimental") | (va
validities["internal_validity"] = validities["internal_validity"].astype('category') validities["internal_validity"] = validities["internal_validity"].astype('category')
validities["External Validity"] = validities["external_validity"] validities["External Validity"] = validities["external_validity"]
validities["Internal Validity"] = validities["internal_validity"] validities["Internal Validity"] = validities["internal_validity"]