chore(repo): Move yml files to extracted data dir

Author: Marty Oehme 2024-07-16 16:26:11 +02:00
parent 3ec7dcd1bb
commit c5df5c01a2
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
66 changed files with 21 additions and 14 deletions

@@ -1,18 +1,21 @@
 # Scoping Review: Inequalities on the Labour Market

 This repository contains all data, modelling and processing source code and the complete textual content to reproduce the scoping review study.
 The most up-to-date version of this repository can always be found [here](https://git.martyoeh.me/professional/wow-inequalities).

-Raw, intermediate and processed data can all be found in the `data/` directory:
-Raw data include the unmodified database queries using the scoping review search terms.
-Intermediate data are made up of the bibtex file produced by Zotero, after tagging and sorting in a Zotero library, ready to be re-imported into the application.
-Processed data include the fully extracted studies which make up the main sample for the review.
+Raw references, extracted and processed data can all be found in the `data/` directory:
+Reference data include the unmodified database queries using the scoping review search terms,
+and the bibtex file produced by Zotero after tagging and sorting in a Zotero library, ready to be re-imported into the application.
+Extracted data include the fully extracted studies which make up the main sample for the review.
+Processed data are ready to import into a dataframe or visualize in a report.

-The full article text and code can be found in the `scoping_review.qmd` file.
-It makes use of supplementary processing code which resides in the `src/` directory,
+All full texts and visualization code reside in the `manuscripts/` directory.
+The full working paper text and code can be found in the `scoping_review.qmd` file.
+The full article text can be found in the `article.qmd` file.
+They both make use of supplementary extraction and processing code which resides in the `src/` directory,
 mainly to load processed data from the `data/` directory and turn it into `.csv` data,
-as well as pre-processing those for visualization and validity ranking within the study.
+as well as pre-processing for visualization and validity ranking within the study.

 ## Execution and Reproduction
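
The README now states that processed data are dataframe-ready. A minimal sketch of what that import looks like, assuming pandas; the file name `studies.csv` is a hypothetical placeholder, not an actual repository path:

```python
from pathlib import Path

import pandas as pd

# "studies.csv" is an illustrative placeholder standing in for whatever
# actually lives under data/processed/.
df = pd.read_csv(Path("data/processed") / "studies.csv")
print(df.head())
```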

@@ -52,7 +52,7 @@ zot_df = pd.DataFrame([
 WB_COUNTRY_GROUPS_FILE = Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
 df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
-bib_df = (load_data.from_yml(f"{g.PROCESSED_DATA}/relevant")
+bib_df = (load_data.from_yml(f"{g.EXTRACTED_DATA}")
     .assign(
         doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
         zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
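
The `.assign` chain above derives a `doi` column from each entry's `uri` and then maps Zotero citation counts onto it. The regex step shown in isolation, with made-up sample URIs:

```python
import pandas as pd

# Sample URIs are invented; the pattern strips the scheme and the optional
# "dx." prefix, leaving the bare DOI as the single capture group.
uris = pd.Series([
    "https://doi.org/10.1000/xyz123",
    "http://dx.doi.org/10.5555/abc.42",
])
dois = uris.str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
print(dois.tolist())  # ['10.1000/xyz123', '10.5555/abc.42']
```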

@@ -9,7 +9,11 @@ try:
 except ModuleNotFoundError:
     import yml as yaml # for directly running the package
-DEFAULT_YAML_PATH = Path("data/processed")
+try:
+    import src.globals as g
+    DEFAULT_YAML_PATH = g.EXTRACTED_DATA
+except ModuleNotFoundError:
+    DEFAULT_YAML_PATH = Path("data")

 def to_tsv(studies: list[dict]) -> str:
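
The hunk ends at the signature of `to_tsv`; its body is not part of this diff. A hedged sketch of a serializer with that shape, assuming all study dicts share the same keys:

```python
import csv
import io

def to_tsv_sketch(studies: list[dict]) -> str:
    # Hypothetical stand-in for to_tsv, whose body this diff does not show:
    # serialize a list of uniform dicts as tab-separated text.
    if not studies:
        return ""
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=list(studies[0]), delimiter="\t")
    writer.writeheader()
    writer.writerows(studies)
    return buffer.getvalue()
```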

@@ -5,7 +5,7 @@ PROJECT_DIR = Path(os.getenv("QUARTO_PROJECT_DIR", "."))
 DATA_DIR = PROJECT_DIR.joinpath("data")
-PROCESSED_DATA = DATA_DIR.joinpath("processed")
-SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")
+EXTRACTED_DATA = DATA_DIR.joinpath("extracted")
+REFERENCE_DATA = DATA_DIR.joinpath("references")
+SUPPLEMENTARY_DATA = DATA_DIR.joinpath("supplementary")
+PROCESSED_DATA = DATA_DIR.joinpath("processed")
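
All of these constants hang off `PROJECT_DIR`, which Quarto populates through the `QUARTO_PROJECT_DIR` environment variable during a render; the `"."` fallback anchors them to the working directory otherwise. The resolution idiom in isolation:

```python
import os
from pathlib import Path

# Anchor data paths to the project root when rendered by Quarto,
# to the current working directory when run standalone.
project_dir = Path(os.getenv("QUARTO_PROJECT_DIR", "."))
extracted_data = project_dir.joinpath("data", "extracted")
print(extracted_data.resolve())  # e.g. .../wow-inequalities/data/extracted
```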

@@ -20,7 +20,7 @@ from src.extract import load_data as load
 # each observation in a single dataframe
 df = meta.observations_with_metadata_df(
-    raw_observations = load.from_yml(g.PROCESSED_DATA),
+    raw_observations = load.from_yml(g.EXTRACTED_DATA),
     study_metadata = meta.bib_metadata_df(bib_sample),
     country_groups = meta.country_groups_df(Path(f"{g.SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")),
 )
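
`load.from_yml` itself is not shown in this diff; a minimal sketch of a loader with a compatible call shape, assuming one YAML document of observations per `.yml` file in the given directory:

```python
from pathlib import Path

import yaml  # PyYAML

def from_yml_sketch(directory: str | Path) -> list[dict]:
    # Hypothetical stand-in for load.from_yml: collect every observation
    # from the .yml files directly below the given directory.
    observations: list[dict] = []
    for path in sorted(Path(directory).glob("*.yml")):
        content = yaml.safe_load(path.read_text(encoding="utf-8"))
        if isinstance(content, list):
            observations.extend(content)
        elif content is not None:
            observations.append(content)
    return observations
```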