chore(repo): Expose dataframes directly from source

Marty Oehme 2024-07-19 18:09:07 +02:00
parent f61da38837
commit 8f64604def
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
3 changed files with 90 additions and 43 deletions

src/__init__.py (new file)

@@ -0,0 +1,13 @@
+from src.process.generate_dataframes import bib_sample, bib_sample_db_raw, df_by_intervention, df_main, df_validities
+
+# each observation in a single dataframe
+df = df_main()
+
+# all observations but split per individual intervention
+df_by_intervention = df_by_intervention()
+
+# Calc study validities (internal & external separated)
+validities = df_validities()
+
+bib_sample = bib_sample()
+bib_sample_db_raw = bib_sample_db_raw()
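
With this `__init__.py`, downstream code can import the prepared data structures straight from the package root instead of reaching into `src.process.generate_dataframes`. A minimal consumer sketch (the inspection calls are illustrative only; the imported names and the two validity columns are taken from the diffs in this commit):

from src import df, df_by_intervention, validities

print(df.shape)                                      # one row per extracted observation
print(df_by_intervention["intervention"].nunique())  # distinct interventions across studies
print(validities[["Internal Validity", "External Validity"]].head())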

(PrismaNumbers module; filename not captured)

@@ -1,8 +1,8 @@
-from src.process.generate_dataframes import bib_sample, bib_sample_raw_db
+from src import bib_sample, bib_sample_db_raw
 
 class PrismaNumbers:
-    raw_db = len(bib_sample_raw_db.entries)
+    raw_db = len(bib_sample_db_raw.entries)
     raw_snowball = 2240
 
     # list of all keywords (semicolon-delimited string) for each entry in sample
@@ -43,7 +43,7 @@ class PrismaNumbers:
     final_extracted = len([1 for kw in all_kw if "done::extracted" in kw])
 
-del bib_sample, bib_sample_raw_db
+del bib_sample, bib_sample_db_raw
 
 if __name__ == "__main__":
     prisma = PrismaNumbers()
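
Because the names are now re-exported from the package root, `from src import bib_sample, bib_sample_db_raw` executes `src/__init__.py` on first import and materializes all five structures eagerly; the trailing `del` then only removes the two library aliases from this module's namespace once the `PrismaNumbers` counts have been computed. A rough illustration of that import-time behavior, assuming the layout from this commit:

import time

start = time.perf_counter()
from src import bib_sample_db_raw  # first import runs the whole pipeline in src/__init__.py
print(f"pipeline built in {time.perf_counter() - start:.1f}s")

import src
print(hasattr(src, "df"))  # True: the built dataframes live on the package itself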

src/process/generate_dataframes.py

@@ -1,34 +1,57 @@
-from pathlib import Path
 import re
 
+import pandas as pd
+from bibtexparser import Library
+
 import src.globals as g
+from src.extract import load_data as load
+from src.model import validity
+from src.process import add_metadata as meta
 
 ## Creates 3 important data structures:
 # df: The main dataframe containing all final sample studies
 # df_by_intervention: The same dataframe but split up by individual interventions per study
 # validities: The studies with their validities, containing only quasi-/experimental studies
 
-from src.process import add_metadata as meta
-
-# raw database-search results
-bib_sample_raw_db = meta.bib_library_from_dir(g.REFERENCE_DATA.joinpath("db"))
-# the complete library of sampled (and working) literature
-bib_sample = meta.bib_library_from_file(g.REFERENCE_DATA.joinpath("zotero-library.bib"))
-
-# load relevant studies
-from src.extract import load_data as load
+# the complete library of sampled (and working) literature
+def bib_sample() -> Library:
+    return meta.bib_library_from_file(g.REFERENCE_DATA.joinpath("zotero-library.bib"))
+
+
+# raw database-search results
+def bib_sample_db_raw() -> Library:
+    return meta.bib_library_from_dir(g.REFERENCE_DATA.joinpath("db"))
+
 
 # each observation in a single dataframe
-df = meta.observations_with_metadata_df(
-    raw_observations = load.from_yml(g.EXTRACTED_DATA),
-    study_metadata = meta.bib_metadata_df(bib_sample),
-    country_groups = meta.country_groups_df(Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
-)
+def df_main() -> pd.DataFrame:
+    df = meta.observations_with_metadata_df(
+        raw_observations=load.from_yml(g.EXTRACTED_DATA),
+        study_metadata=meta.bib_metadata_df(bib_sample()),
+        country_groups=meta.country_groups_df(
+            g.SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx")
+        ),
+    )
+    return df
+
 
 # all observations but split per individual intervention
-df_by_intervention = (
-    df
-    .fillna("")
-    .groupby(["author", "year", "title", "design", "method", "representativeness", "citation"])
-    .agg(
-        {
-            "intervention": lambda _col: "; ".join(_col),
+def df_by_intervention() -> pd.DataFrame:
+    df_by_intervention = (
+        df_main()
+        .fillna("")
+        .groupby(
+            [
+                "author",
+                "year",
+                "title",
+                "design",
+                "method",
+                "representativeness",
+                "citation",
+            ]
+        )
+        .agg(
+            {
+                "intervention": lambda _col: "; ".join(_col),
@@ -38,19 +61,30 @@ df_by_intervention = (
-    .drop_duplicates()
-    .assign(
-        intervention=lambda _df: _df["intervention"].apply(
-            lambda _cell: set([x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")])
-        ),
-    )
-    .explode("intervention")
-)
+        .drop_duplicates()
+        .assign(
+            intervention=lambda _df: _df["intervention"].apply(
+                lambda _cell: set(
+                    [x.strip() for x in re.sub(r"\(.*\)", "", _cell).split(";")]
+                )
+            ),
+        )
+        .explode("intervention")
+    )
+    return df_by_intervention
+
 
 # Calc study validities (internal & external separated)
-from src.model import validity
-validities = validity.calculate(df_by_intervention)
-validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
-validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
-#validities["external_validity"] = validities["external_validity"].astype('category')
-validities["internal_validity"] = validities["internal_validity"].astype('category')
-validities["External Validity"] = validities["external_validity"]
-validities["Internal Validity"] = validities["internal_validity"]
+def df_validities() -> pd.DataFrame:
+    validities = validity.calculate(df_by_intervention())
+    validities["identifier"] = (
+        validities["author"].str.replace(r",.*$", "", regex=True)
+        + " ("
+        + validities["year"].astype(str)
+        + ")"
+    )
+    validities = validities.loc[
+        (validities["design"] == "quasi-experimental")
+        | (validities["design"] == "experimental")
+    ]
+    # validities["external_validity"] = validities["external_validity"].astype('category')
+    validities["internal_validity"] = validities["internal_validity"].astype("category")
+    validities["External Validity"] = validities["external_validity"]
+    validities["Internal Validity"] = validities["internal_validity"]
+    return validities
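
The least obvious step in `df_by_intervention()` is the intervention round-trip: the `.agg` joins each study's interventions into one "; "-separated string, `re.sub` strips parenthetical qualifiers, the string is split back apart, `set` de-duplicates, and `.explode` emits one row per study-intervention pair. A self-contained sketch of that pattern on made-up data (only the column name matches the code above):

import re
import pandas as pd

toy = pd.DataFrame({"intervention": ["cash transfer (pilot); training; cash transfer"]})
toy["intervention"] = toy["intervention"].apply(
    lambda cell: {x.strip() for x in re.sub(r"\(.*\)", "", cell).split(";")}
)
print(toy.explode("intervention"))  # two rows: "cash transfer" and "training"

Two caveats follow from the code itself: the greedy `\(.*\)` also deletes any text sitting between two parenthesis groups in the same joined string, so it is only safe while each cell contains at most one such group; and since `df_validities()` calls `df_by_intervention()`, which calls `df_main()`, the new `src/__init__.py` rebuilds the main dataframe three times at import, which `functools.cache` on `df_main` would avoid if that ever becomes a bottleneck.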