feat(script): Move all data transformations to single chunk

This commit is contained in:
Marty Oehme 2024-07-14 21:51:18 +02:00
parent ed6c8550b6
commit ad71859ded
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 84 additions and 51 deletions

View file

@@ -27,11 +27,41 @@ bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
bib_df = prep_data.observations_with_metadata_df(
# All observations collected into a single dataframe, built from the raw
# observations, the bibliographic study metadata and the country groupings.
df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=prep_data.bib_metadata_df(bib_sample),
    country_groups=prep_data.country_groups_df(
        Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")
    ),
)
# Rebind these three names to None in one chained assignment
# (presumably to drop references to earlier intermediates -- confirm).
raw_observations = zot_df = df_country_groups = None
# All observations, but split so that every row carries exactly one
# intervention.
def _split_interventions(cell):
    """Return the sorted, de-duplicated intervention names found in *cell*.

    *cell* is a ";"-separated string; any parenthesised text is removed
    before splitting and each part is stripped of surrounding whitespace.
    """
    # Non-greedy match on purpose: a greedy ``\(.*\)`` applied to a cell such
    # as "a (x); b (y)" removes everything from the first "(" to the last ")"
    # and thereby silently drops intervention "b".
    without_parens = re.sub(r"\(.*?\)", "", cell)
    # A sorted list (instead of a set) keeps the row order produced by
    # explode() reproducible between runs.
    return sorted({part.strip() for part in without_parens.split(";")})


df_by_intervention = (
    df
    # Missing values become empty strings so the grouping keys and the
    # string join below only ever see strings.
    .fillna("")
    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
    # Concatenate the interventions of all rows that share the same keys.
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    # Turn the joined string into a list of unique intervention names ...
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_split_interventions),
    )
    # ... and give each of those names a row of its own.
    .explode("intervention")
)
# Calculate study validities (internal & external kept in separate columns).
from src.model import validity

validities = validity.calculate(df_by_intervention)
# Short label per study: the "author" value with everything from the first
# comma onwards removed, followed by the year in parentheses.
validities["identifier"] = (
    validities["author"].str.replace(r",.*$", "", regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)
# Keep only experimental and quasi-experimental designs. The explicit copy()
# makes the filtered result an independent frame, so the column assignments
# below do not write to a slice of the unfiltered frame
# (avoids pandas' SettingWithCopyWarning).
validities = validities.loc[
    validities["design"].isin(["quasi-experimental", "experimental"])
].copy()
# NOTE(review): the categorical conversion of external_validity is disabled in
# the original; the reason is not visible here.
#validities["external_validity"] = validities["external_validity"].astype('category')
validities["internal_validity"] = validities["internal_validity"].astype("category")
# Title-cased aliases of the two validity columns (presumably used as display
# labels -- confirm against downstream use).
validities["External Validity"] = validities["external_validity"]
validities["Internal Validity"] = validities["internal_validity"]