chore(code): Move dataframe generation to module

This commit is contained in:
Marty Oehme 2024-07-15 21:12:39 +02:00
parent 38254d1605
commit 284a3b9281
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
3 changed files with 32 additions and 15 deletions

View file

@ -0,0 +1,57 @@
from pathlib import Path
import re
import seaborn as sns
import src.globals as g
## Creates 3 important data structures:
# df: The main dataframe containing all final sample studies
# df_by_intervention: The same dataframe but split up by individual interventions per study
# validities: The studies with their validities, containing only quasi-/experimental studies
from src.process import add_metadata as meta
# Load both bibliographic libraries with the same loader, in this order:
#   bib_sample_raw_db - raw database-search results (g.RAW_DATA)
#   bib_sample        - the complete library of sampled (and working)
#                       literature (g.WORKING_DATA)
bib_sample_raw_db, bib_sample = map(
    meta.bib_library_from_dir,
    (g.RAW_DATA, g.WORKING_DATA),
)
# load relevant studies
from src.extract import load_data as load
# Main dataframe: every observation in a single frame.
# The three inputs are prepared first (in the same order the call used to
# evaluate them) and then handed to the metadata join.
_raw_observations = load.from_yml(g.PROCESSED_DATA)
_study_metadata = meta.bib_metadata_df(bib_sample)
_country_groups = meta.country_groups_df(
    Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")
)
df = meta.observations_with_metadata_df(
    raw_observations=_raw_observations,
    study_metadata=_study_metadata,
    country_groups=_country_groups,
)
# All observations, but split per individual intervention:
# the observations of a study are first collapsed into one record (their
# interventions joined with "; "), then each distinct intervention is fanned
# back out into its own row.

# Columns that together identify one study record.
_STUDY_KEYS = [
    "author",
    "year",
    "title",
    "design",
    "method",
    "representativeness",
    "citation",
]

# Matches an innermost "(...)" remark, i.e. one without nested parentheses.
_PARENTHETICAL = re.compile(r"\([^()]*\)")


def _distinct_interventions(joined: str) -> list:
    """Return the unique intervention names of a ``;``-separated string.

    Parenthesised remarks are removed before splitting. Each name is
    whitespace-stripped and duplicates are dropped while keeping the order of
    first appearance.

    A single greedy ``\\(.*\\)`` must not be used here: on a joined string
    such as ``"a (x); b (y)"`` it would delete everything between the first
    ``(`` and the last ``)`` and silently lose intervention ``b``.
    """
    text = joined
    # Peel off innermost remarks until none are left, so nested parentheses
    # are still removed completely.
    while True:
        text, removed = _PARENTHETICAL.subn("", text)
        if not removed:
            break
    names = (part.strip() for part in text.split(";"))
    # dict.fromkeys de-duplicates like a set but keeps a stable order;
    # exploding a set would yield a non-deterministic row order.
    return list(dict.fromkeys(names))


df_by_intervention = (
    df.fillna("")  # empty strings so grouping and joining never see NaN
    .groupby(_STUDY_KEYS)
    .agg(
        {
            "intervention": lambda _col: "; ".join(_col),
        }
    )
    .reset_index()
    .drop_duplicates()
    .assign(
        intervention=lambda _df: _df["intervention"].apply(_distinct_interventions),
    )
    .explode("intervention")
)
# Calc study validities (internal & external separated)
from src.model import validity
validities = validity.calculate(df_by_intervention)

# Short label per study: author text up to the first comma, then the year in
# parentheses.
validities["identifier"] = (
    validities["author"].str.replace(r",.*$", "", regex=True)
    + " ("
    + validities["year"].astype(str)
    + ")"
)

# Keep only quasi-experimental and experimental designs. The explicit copy
# makes the filtered frame independent of the unfiltered one, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
validities = validities.loc[
    validities["design"].isin(["quasi-experimental", "experimental"])
].copy()

# NOTE(review): only internal_validity is cast to a categorical; the matching
# cast for external_validity was disabled in the original code - confirm that
# this asymmetry is intended.
validities["internal_validity"] = validities["internal_validity"].astype("category")

# Title-cased duplicates of the two validity columns.
# NOTE(review): presumably used as display labels - confirm against consumers.
validities["External Validity"] = validities["external_validity"]
validities["Internal Validity"] = validities["internal_validity"]