chore(repo): Expose dataframes directly from source
This commit is contained in:
parent
f61da38837
commit
8f64604def
3 changed files with 90 additions and 43 deletions
13
src/__init__.py
Normal file
13
src/__init__.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
from src.process.generate_dataframes import (
    bib_sample,
    bib_sample_db_raw,
    df_by_intervention,
    df_main,
    df_validities,
)

# Every loader imported above is called once at import time. Where the
# target name equals the loader name, the name is rebound to the object
# the loader returned, so `from src import <name>` yields data directly.

# each observation in a single dataframe
df = df_main()

# all observations but split per individual intervention
df_by_intervention = df_by_intervention()

# study validities (internal & external separated)
validities = df_validities()

# bibliographic libraries: working sample and raw database-search results
bib_sample = bib_sample()
bib_sample_db_raw = bib_sample_db_raw()
|
|
@ -1,8 +1,8 @@
|
||||||
from src.process.generate_dataframes import bib_sample, bib_sample_raw_db
|
from src import bib_sample, bib_sample_db_raw
|
||||||
|
|
||||||
|
|
||||||
class PrismaNumbers:
|
class PrismaNumbers:
|
||||||
raw_db = len(bib_sample_raw_db.entries)
|
raw_db = len(bib_sample_db_raw.entries)
|
||||||
raw_snowball = 2240
|
raw_snowball = 2240
|
||||||
|
|
||||||
# list of all keywords (semicolon-delimited string) for each entry in sample
|
# list of all keywords (semicolon-delimited string) for each entry in sample
|
||||||
|
@ -43,7 +43,7 @@ class PrismaNumbers:
|
||||||
final_extracted = len([1 for kw in all_kw if "done::extracted" in kw])
|
final_extracted = len([1 for kw in all_kw if "done::extracted" in kw])
|
||||||
|
|
||||||
|
|
||||||
del bib_sample, bib_sample_raw_db
|
del bib_sample, bib_sample_db_raw
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
prisma = PrismaNumbers()
|
prisma = PrismaNumbers()
|
||||||
|
|
|
@ -1,56 +1,90 @@
|
||||||
from pathlib import Path
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from bibtexparser import Library
|
||||||
|
|
||||||
import src.globals as g
|
import src.globals as g
|
||||||
|
from src.extract import load_data as load
|
||||||
|
from src.model import validity
|
||||||
|
|
||||||
## Creates 3 important data structures:
|
## Creates 3 important data structures:
|
||||||
# df: The main dataframe containing all final sample studies
|
# df: The main dataframe containing all final sample studies
|
||||||
# df_by_intervention: The same dataframe but split up by individual interventions per study
|
# df_by_intervention: The same dataframe but split up by individual interventions per study
|
||||||
# validities: The studies with their validities, containing only quasi-/experimental studies
|
# validities: The studies with their validities, containing only quasi-/experimental studies
|
||||||
|
|
||||||
from src.process import add_metadata as meta
|
|
||||||
|
# the complete library of sampled (and working) literature
|
||||||
|
def bib_sample() -> Library:
    """Load the bibliography stored in ``zotero-library.bib``.

    The file is looked up directly below ``g.REFERENCE_DATA`` and parsed
    with ``meta.bib_library_from_file``.

    Returns:
        The library object produced by ``meta.bib_library_from_file``.
    """
    library_file = g.REFERENCE_DATA.joinpath("zotero-library.bib")
    return meta.bib_library_from_file(library_file)
|
||||||
|
|
||||||
|
|
||||||
# raw database-search results
|
# raw database-search results
|
||||||
bib_sample_raw_db = meta.bib_library_from_dir(g.REFERENCE_DATA.joinpath("db"))
|
def bib_sample_db_raw() -> Library:
    """Load the raw database-search results from the ``db`` directory.

    The directory is looked up directly below ``g.REFERENCE_DATA`` and
    read with ``meta.bib_library_from_dir``.

    Returns:
        The library object produced by ``meta.bib_library_from_dir``.
    """
    search_results_dir = g.REFERENCE_DATA.joinpath("db")
    return meta.bib_library_from_dir(search_results_dir)
|
||||||
bib_sample = meta.bib_library_from_file(g.REFERENCE_DATA.joinpath("zotero-library.bib"))
|
|
||||||
|
|
||||||
# load relevant studies
|
|
||||||
from src.extract import load_data as load
|
|
||||||
|
|
||||||
# each observation in a single dataframe
|
def df_main() -> pd.DataFrame:
    """Assemble the main dataframe with each observation in a single frame.

    Three inputs are prepared and handed to
    ``meta.observations_with_metadata_df``:

    * the observations read via ``load.from_yml`` from ``g.EXTRACTED_DATA``,
    * the study metadata derived from ``bib_sample()``,
    * the country groupings read from ``wb-country-groupings.xlsx`` below
      ``g.SUPPLEMENTARY_DATA``.

    Returns:
        The dataframe returned by ``meta.observations_with_metadata_df``.
    """
    # Inputs are prepared in the same order the keyword arguments are listed.
    observations = load.from_yml(g.EXTRACTED_DATA)
    metadata = meta.bib_metadata_df(bib_sample())
    groupings_file = g.SUPPLEMENTARY_DATA.joinpath("wb-country-groupings.xlsx")
    return meta.observations_with_metadata_df(
        raw_observations=observations,
        study_metadata=metadata,
        country_groups=meta.country_groups_df(groupings_file),
    )
|
||||||
)
|
|
||||||
|
|
||||||
# Calc study validities (internal & external separated)
|
|
||||||
from src.model import validity
|
|
||||||
|
|
||||||
validities = validity.calculate(df_by_intervention)
|
from src.process import add_metadata as meta
|
||||||
validities["identifier"] = validities["author"].str.replace(r',.*$', '', regex=True) + " (" + validities["year"].astype(str) + ")"
|
|
||||||
validities = validities.loc[(validities["design"] == "quasi-experimental") | (validities["design"] == "experimental")]
|
|
||||||
#validities["external_validity"] = validities["external_validity"].astype('category')
|
def df_by_intervention() -> pd.DataFrame:
    """Return all observations split up per individual intervention.

    The rows of ``df_main()`` are grouped by their study-level columns and
    the ``intervention`` strings of each group are joined with ``"; "``.
    Parenthesised remarks are then removed, the string is split on ``";"``
    into distinct intervention names, and the frame is exploded so that
    each name of a study gets its own row.

    Returns:
        A dataframe holding the grouping columns plus an ``intervention``
        column with one intervention name per row.
    """
    study_columns = [
        "author",
        "year",
        "title",
        "design",
        "method",
        "representativeness",
        "citation",
    ]
    return (
        df_main()
        .fillna("")
        .groupby(study_columns)
        .agg({"intervention": lambda _col: "; ".join(_col)})
        .reset_index()
        .drop_duplicates()
        .assign(
            intervention=lambda _df: _df["intervention"].apply(
                _distinct_interventions
            ),
        )
        .explode("intervention")
    )


def _strip_parenthesised(text: str) -> str:
    """Remove every balanced ``(...)`` group from *text*, nested ones included."""
    # A greedy r"\(.*\)" would delete everything between the FIRST "(" and
    # the LAST ")" of the joined string and thereby swallow whole
    # interventions listed in between. Removing innermost groups until
    # nothing changes strips each remark on its own.
    while True:
        reduced = re.sub(r"\([^()]*\)", "", text)
        if reduced == text:
            return reduced
        text = reduced


def _distinct_interventions(cell: str) -> list[str]:
    """Split a ``;``-delimited cell into unique, whitespace-trimmed names."""
    names = (part.strip() for part in _strip_parenthesised(cell).split(";"))
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so
    # the exploded rows come out in the same order on every run.
    return list(dict.fromkeys(names))
|
||||||
|
|
||||||
|
|
||||||
|
def df_validities() -> pd.DataFrame:
    """Return the study validities for quasi-/experimental studies only.

    ``validity.calculate`` is applied to ``df_by_intervention()``. The
    result gets an ``identifier`` column made of the ``author`` text before
    its first comma followed by the year in parentheses, and is then
    restricted to rows whose ``design`` is ``"quasi-experimental"`` or
    ``"experimental"``.

    Returns:
        The filtered dataframe, with ``internal_validity`` cast to a
        categorical dtype and both validity columns additionally exposed
        under the names ``"External Validity"`` and ``"Internal Validity"``.
    """
    validities = validity.calculate(df_by_intervention())
    first_author = validities["author"].str.replace(r",.*$", "", regex=True)
    validities["identifier"] = (
        first_author + " (" + validities["year"].astype(str) + ")"
    )

    is_experimental = validities["design"].isin(
        ["quasi-experimental", "experimental"]
    )
    # Take an explicit copy: the columns assigned below must land on an
    # independent frame, not on a selection of the unfiltered one (which
    # makes pandas emit SettingWithCopyWarning).
    validities = validities.loc[is_experimental].copy()

    # NOTE(review): only internal_validity is cast to category; the same
    # cast for external_validity was commented out in the original, so
    # confirm whether that is deliberate.
    validities["internal_validity"] = validities["internal_validity"].astype(
        "category"
    )
    validities["External Validity"] = validities["external_validity"]
    validities["Internal Validity"] = validities["internal_validity"]
    return validities
|
||||||
|
|
Loading…
Reference in a new issue