wow-inequalities/src/process/generate_dataframes.py

90 lines
2.7 KiB
Python

import re
import pandas as pd
from bibtexparser import Library
import src.globals as g
from src.extract import load_data as load
from src.model import validity
## Provides three important data structures, each built by a function below:
# df_main(): the main dataframe containing all final sample studies
# df_by_intervention(): the same dataframe, split up by individual interventions per study
# df_validities(): the studies with their validities, containing only quasi-/experimental studies
# the complete library of sampled (and working) literature
def bib_sample() -> Library:
    """Load the bibliography library of the literature sample.

    Returns:
        Whatever ``meta.bib_library_from_file`` builds from the file
        ``zotero-library.bib`` located in ``g.REFERENCE_DATA``.
    """
    library_file = g.REFERENCE_DATA.joinpath("zotero-library.bib")
    return meta.bib_library_from_file(library_file)
# raw database-search results
def bib_sample_db_raw() -> Library:
    """Load the bibliography library of the raw database-search results.

    Returns:
        Whatever ``meta.bib_library_from_dir`` builds from the ``db``
        directory located in ``g.REFERENCE_DATA``.
    """
    search_results_dir = g.REFERENCE_DATA.joinpath("db")
    return meta.bib_library_from_dir(search_results_dir)
def df_main() -> pd.DataFrame:
    """Build the main dataframe of observations combined with their metadata.

    Three inputs are gathered and handed to
    ``meta.observations_with_metadata_df``:

    * the raw observations loaded from ``g.EXTRACTED_DATA``,
    * the study metadata derived from the sample library (``bib_sample()``),
    * the country groupings read from ``wb-country-groupings.xlsx`` in
      ``g.SUPPLEMENTARY_DATA``.

    Returns:
        The dataframe produced by ``meta.observations_with_metadata_df``.
    """
    # Inputs are prepared in the same order the original call evaluated them.
    observations = load.from_yml(g.EXTRACTED_DATA)
    metadata = meta.bib_metadata_df(bib_sample())
    groupings_file = g.SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx")
    groupings = meta.country_groups_df(groupings_file)
    return meta.observations_with_metadata_df(
        raw_observations=observations,
        study_metadata=metadata,
        country_groups=groupings,
    )
from src.process import add_metadata as meta
def df_by_intervention() -> pd.DataFrame:
    """Return the main dataframe with one row per study and intervention.

    All rows of ``df_main()`` that share the same study-level columns are
    merged, their ``intervention`` cells are joined with ``"; "``, any
    parenthesised remarks are stripped, and the result is exploded so that
    every distinct intervention of a study ends up on its own row.

    Returns:
        A dataframe with the grouping columns plus a single ``intervention``
        column holding one intervention name per row. Rows of the same study
        share an index label, as usual after ``DataFrame.explode``.
    """
    study_columns = [
        "author",
        "year",
        "title",
        "design",
        "method",
        "representativeness",
        "citation",
    ]
    per_study = (
        df_main()
        # groupby drops rows whose keys are NaN by default, and str.join
        # needs strings, so missing values become empty strings first.
        .fillna("")
        .groupby(study_columns)
        .agg({"intervention": "; ".join})
        .reset_index()
        .drop_duplicates()
    )
    per_study["intervention"] = per_study["intervention"].apply(
        _split_interventions
    )
    return per_study.explode("intervention")


# One parenthesised remark. The match is non-greedy on purpose: a greedy
# ``\(.*\)`` would span from the first "(" to the last ")" of the joined cell
# and thereby delete every intervention listed in between.
# Nested parentheses are not handled.
_PARENTHESISED_REMARK = re.compile(r"\(.*?\)")


def _split_interventions(cell: str) -> list:
    """Split a ';'-separated cell into unique, stripped intervention names.

    Parenthesised remarks are removed before splitting. Duplicates are dropped
    while keeping the order of first appearance, so exploding the result is
    deterministic (exploding a ``set`` is not).
    """
    without_remarks = _PARENTHESISED_REMARK.sub("", cell)
    names = (part.strip() for part in without_remarks.split(";"))
    return list(dict.fromkeys(names))
def df_validities() -> pd.DataFrame:
    """Return the validity scores of the quasi-/experimental studies.

    The per-intervention dataframe is passed through ``validity.calculate``,
    an ``identifier`` column of the form ``"<author up to first comma>
    (<year>)"`` is added, and only rows whose ``design`` is
    ``"quasi-experimental"`` or ``"experimental"`` are kept.

    Returns:
        The filtered dataframe, with ``internal_validity`` cast to a
        categorical dtype and the additional display columns
        ``"External Validity"`` and ``"Internal Validity"``.
    """
    validities = validity.calculate(df_by_intervention())
    # Keep only the text before the first comma of the author field.
    first_author = validities["author"].str.replace(r",.*$", "", regex=True)
    validities["identifier"] = (
        first_author + " (" + validities["year"].astype(str) + ")"
    )

    is_experimental = validities["design"].isin(
        ["quasi-experimental", "experimental"]
    )
    # Take an explicit copy: assigning columns to a filtered slice would
    # otherwise raise pandas' SettingWithCopyWarning.
    validities = validities.loc[is_experimental].copy()

    # Only internal_validity is cast to categorical; external_validity keeps
    # the dtype it has when returned by validity.calculate.
    validities["internal_validity"] = validities["internal_validity"].astype(
        "category"
    )
    validities["External Validity"] = validities["external_validity"]
    validities["Internal Validity"] = validities["internal_validity"]
    return validities