refactor(code): Move observation data prep to src

Moved the overall metadata and country-grouping code to the src directory,
somewhat trimming the prep code at the beginning of the manuscript
itself.
This commit is contained in:
Marty Oehme 2024-02-17 17:30:58 +01:00
parent 71d02ff8dd
commit 42222447dc
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 168 additions and 52 deletions

View file

@ -46,70 +46,32 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
PROCESSED_DATA=DATA_DIR.joinpath("processed")
SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
bib_string=""
for partial_bib in RAW_DATA.glob("**/*.bib"):
with open(partial_bib) as f:
bib_string+="\n".join(f.readlines())
with all_logging_disabled():
bib_sample_raw_db = bibtexparser.parse_string(bib_string)
from src import prep_data
bib_string=""
for partial_bib in WORKING_DATA.glob("**/*.bib"):
with open(partial_bib) as f:
bib_string+="\n".join(f.readlines())
with all_logging_disabled():
bib_sample = bibtexparser.parse_string(bib_string)
# raw database-search results
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
# the complete library of sampled (and working) literature
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
# load relevant studies
from src import load_data
raw_observations_df = load_data.from_yml(PROCESSED_DATA)
# load zotero-based metadata: citations and uses
zot_df = pd.DataFrame([
[
entry["doi"] if "doi" in entry.fields_dict else None,
entry["times-cited"] if "times-cited" in entry.fields_dict else None,
entry["usage"] if "usage" in entry.fields_dict else None,
entry["keywords"] if "keywords" in entry.fields_dict else None,
]
for entry in bib_sample.entries
], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
zot_df = prep_data.bib_metadata_df(bib_sample)
# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
df_country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx"))
def countries_to_regions(countries:str):
    """Return the regions of the given countries as one ';'-joined string.

    ``countries`` is a semicolon-separated list of names. Each name is looked
    up in the index of the module-level ``df_country_groups`` and names that
    are not found are ignored. Every distinct region appears once.
    """
    # Strip each name so any amount of whitespace around ';' is tolerated.
    names = [part.strip() for part in countries.split(";")]
    regions = {
        df_country_groups.at[name, "Region"]
        for name in names
        if name in df_country_groups.index
    }
    # Sets of strings iterate in a different order on every interpreter run
    # (hash randomisation); sorting keeps the label identical across runs.
    return ";".join(sorted(regions))
def countries_to_income_groups(countries:str):
    """Return the income groups of the given countries as one ';'-joined string.

    ``countries`` is a semicolon-separated list of names. Each name is looked
    up in the index of the module-level ``df_country_groups`` and names that
    are not found are ignored. Every distinct income group appears once.
    """
    # Strip each name so any amount of whitespace around ';' is tolerated.
    names = [part.strip() for part in countries.split(";")]
    income_groups = {
        df_country_groups.at[name, "Income group"]
        for name in names
        if name in df_country_groups.index
    }
    # Sets of strings iterate in a different order on every interpreter run
    # (hash randomisation); sorting keeps the label identical across runs.
    return ";".join(sorted(income_groups))
bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
.assign(
doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year = lambda _df: _df["date"].dt.year,
region = lambda _df: _df["country"].map(countries_to_regions),
income_group = lambda _df: _df["country"].map(countries_to_income_groups),
)
.query("year >= 2000")
bib_df = prep_data.observations_with_metadata_df(
    raw_observations=raw_observations_df,
    study_metadata=zot_df,
    country_groups=df_country_groups,
)
# Drop the references to the intermediate frames that were passed in above.
# The observations are bound to `raw_observations_df`; `raw_observations` is
# only the keyword-argument name, so assigning None to it would create a new
# variable and leave the loaded frame referenced.
raw_observations_df = None
zot_df = None
df_country_groups = None
```
<!-- pagebreak to separate from TOC -->

154
src/prep_data.py Normal file
View file

@ -0,0 +1,154 @@
import logging
from contextlib import contextmanager, nullcontext
from pathlib import Path
import bibtexparser
import pandas as pd
@contextmanager
def all_logging_disabled(highest_level=logging.CRITICAL):
    """Temporarily disable logging at and below ``highest_level``.

    The disable threshold that is active on entry is remembered and put back
    on exit, also when the managed body raises.
    """
    threshold_on_entry = logging.root.manager.disable
    logging.disable(highest_level)
    try:
        yield
    finally:
        # Restore whatever was configured before, not simply "enabled".
        logging.disable(threshold_on_entry)
def bib_library_from_dir(
    dir: Path, disable_warnings: bool = True
) -> bibtexparser.Library:
    """Parse every ``*.bib`` file below ``dir`` into a single bibtex library.

    The directory is searched recursively. Files are read in sorted path
    order so that the combined input does not depend on the order in which
    the file system happens to list them.

    Args:
        dir: Directory to search for ``.bib`` files.
        disable_warnings: If true (the default), logging is disabled while
            the combined text is parsed; otherwise logging is left as
            configured.

    Returns:
        The result of ``bibtexparser.parse_string`` on the concatenated
        contents of all files found.
    """
    # Read each file verbatim. Lines returned by ``readlines()`` keep their
    # own terminator, so joining them with "\n" would insert a blank line
    # after every line of the input, including inside multi-line values.
    bib_contents = [
        partial_bib.read_text(encoding="utf-8")
        for partial_bib in sorted(dir.glob("**/*.bib"))
    ]
    # Separate files with a newline so that a file lacking a final newline is
    # not glued to the first entry of the next file.
    # NOTE(review): presumably the parser needs entry headers to start on
    # their own line -- confirm against the bibtexparser version in use.
    bib_string = "\n".join(bib_contents)

    cm = all_logging_disabled if disable_warnings else nullcontext
    with cm():
        return bibtexparser.parse_string(bib_string)
def bib_metadata_df(sample: bibtexparser.Library) -> pd.DataFrame:
    """Collect per-entry metadata of a bibtex library in a DataFrame.

    One row is built per library entry from its ``doi``, ``times-cited``,
    ``usage`` and ``keywords`` fields, with ``None`` wherever an entry lacks
    the field. Rows that repeat an already seen doi are dropped and the doi
    becomes the index; the remaining columns are named ``cited``, ``usage``
    and ``keywords``.
    """
    source_fields = ("doi", "times-cited", "usage", "keywords")
    rows = []
    for entry in sample.entries:
        available = entry.fields_dict
        rows.append(
            [entry[name] if name in available else None for name in source_fields]
        )
    frame = pd.DataFrame(rows, columns=["doi", "cited", "usage", "keywords"])
    deduplicated = frame.drop_duplicates("doi")
    return deduplicated.set_index("doi")
def country_groups_df(wb_xlsx_file: Path) -> pd.DataFrame:
    """Return the WB country group definitions, indexed by economy name.

    The spreadsheet is read with ``pandas.read_excel`` and one synthetic row
    is appended for the economy ``global`` (code ``WLD``). That row can be
    used for observations that are not country specific, e.g. wide
    comparative studies using 30+ developing/developed country datasets.
    Its region is the ';'-joined list of all regions; its income group and
    lending category are empty strings.

    Args:
        wb_xlsx_file: Path of the World Bank country groupings spreadsheet.
            It is expected to provide at least an ``Economy`` column, which
            becomes the index of the result.

    Returns:
        The spreadsheet rows followed by the ``global`` row, indexed by
        ``Economy``.
    """
    # Every region exactly once. The list previously repeated
    # "Europe & Central Asia" and left out "Middle East & North Africa".
    # NOTE(review): spellings are assumed to match the "Region" column of the
    # spreadsheet -- confirm against the file in use.
    all_regions = (
        "Europe & Central Asia",
        "South Asia",
        "North America",
        "East Asia & Pacific",
        "Sub-Saharan Africa",
        "Middle East & North Africa",
        "Latin America & Caribbean",
    )
    global_row = pd.DataFrame(
        data={
            "Economy": ["global"],
            "Code": ["WLD"],
            "Region": [";".join(all_regions)],
            "Income group": [""],
            "Lending category": [""],
        }
    )
    return pd.concat([pd.read_excel(wb_xlsx_file), global_row]).set_index("Economy")
def observations_with_metadata_df(
raw_observations: pd.DataFrame,
study_metadata: pd.DataFrame | None = None,
country_groups: pd.DataFrame | None = None,
) -> pd.DataFrame:
"""Returns observations with metadata and country information attached.
Returned DataFrame contains the raw observations data with one
observation per row, with correct study and WB country grouping metadata
attached for each row as well.
"""
df = raw_observations.assign(
doi=lambda _df: _df["uri"].str.extract(
r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
),
date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
year=lambda _df: _df["date"].dt.year,
).query("year >= 2000")
if study_metadata is not None:
df = df.assign(
zot_cited=lambda _df: _df["doi"].map(study_metadata["cited"]),
zot_usage=lambda _df: _df["doi"].map(study_metadata["usage"]),
zot_keywords=lambda _df: _df["doi"].map(study_metadata["keywords"]),
)
if country_groups is not None:
def c_to_region_closure(countries):
return countries_to_regions(country_groups, countries)
def c_to_income_closure(countries):
return countries_to_income_groups(country_groups, countries)
df = df.assign(
region=lambda _df: _df["country"].map(c_to_region_closure),
income_group=lambda _df: _df["country"].map(c_to_income_closure),
)
return df
def countries_to_regions(country_groups: pd.DataFrame, countries: str) -> str:
    """Return the regions of the given countries as one ';'-joined string.

    The result is a plain string so the function can be used directly in a
    pandas ``map``/``apply`` over a column of country lists.

    Args:
        country_groups: Frame indexed by country name with a ``Region``
            column.
        countries: Semicolon-separated country names; whitespace around a
            name is ignored.

    Returns:
        The distinct regions of all names found in the index, sorted and
        joined with ';'. Names not in the index are skipped, so the result
        is an empty string when nothing matches.
    """
    names = [part.strip() for part in countries.split(";")]
    regions = {
        country_groups.at[name, "Region"]
        for name in names
        if name in country_groups.index
    }
    # Sets of strings iterate in a different order on every interpreter run
    # (hash randomisation). Sorting gives the same label for the same set of
    # regions in every run and regardless of the order of the input names.
    return ";".join(sorted(regions))
def countries_to_income_groups(country_groups: pd.DataFrame, countries: str) -> str:
    """Return the income groups of the given countries as one ';'-joined string.

    The result is a plain string so the function can be used directly in a
    pandas ``map``/``apply`` over a column of country lists.

    Args:
        country_groups: Frame indexed by country name with an
            ``Income group`` column.
        countries: Semicolon-separated country names; whitespace around a
            name is ignored.

    Returns:
        The distinct income groups of all names found in the index, sorted
        and joined with ';'. Names not in the index are skipped, so the
        result is an empty string when nothing matches.
    """
    names = [part.strip() for part in countries.split(";")]
    income_groups = {
        country_groups.at[name, "Income group"]
        for name in names
        if name in country_groups.index
    }
    # Sets of strings iterate in a different order on every interpreter run
    # (hash randomisation). Sorting gives the same label for the same set of
    # income groups in every run and regardless of the order of the input.
    return ";".join(sorted(income_groups))