refactor(code): Move observation data prep to src
Moved the overall metadata and country-grouping code to the src directory, somewhat trimming the prep code at the beginning of the manuscript itself.
parent 71d02ff8dd
commit 42222447dc
2 changed files with 168 additions and 52 deletions
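For orientation, the manuscript-side prep after this change boils down to the condensed sketch below (assembled from the added lines in the diff that follows; `Path` and the path constants `RAW_DATA`, `WORKING_DATA`, `PROCESSED_DATA`, `SUPPLEMENTARY_DATA` are assumed to be defined earlier in the manuscript):

```python
from src import load_data, prep_data

# raw database-search results and the complete sampled library
bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)

# study metadata (citations, usage, keywords) and WB country groupings
zot_df = prep_data.bib_metadata_df(bib_sample)
df_country_groups = prep_data.country_groups_df(
    Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx")
)

# one row per observation, with study and country-grouping metadata attached
bib_df = prep_data.observations_with_metadata_df(
    raw_observations=load_data.from_yml(PROCESSED_DATA),
    study_metadata=zot_df,
    country_groups=df_country_groups,
)
```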
@@ -46,70 +46,32 @@ WORKING_DATA=DATA_DIR.joinpath("intermediate")
 PROCESSED_DATA=DATA_DIR.joinpath("processed")
 SUPPLEMENTARY_DATA=DATA_DIR.joinpath("supplementary")
 
-bib_string=""
-for partial_bib in RAW_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample_raw_db = bibtexparser.parse_string(bib_string)
+from src import prep_data
 
-bib_string=""
-for partial_bib in WORKING_DATA.glob("**/*.bib"):
-    with open(partial_bib) as f:
-        bib_string+="\n".join(f.readlines())
-with all_logging_disabled():
-    bib_sample = bibtexparser.parse_string(bib_string)
 # raw database-search results
+bib_sample_raw_db = prep_data.bib_library_from_dir(RAW_DATA)
 # the complete library of sampled (and working) literature
+bib_sample = prep_data.bib_library_from_dir(WORKING_DATA)
 
 # load relevant studies
 from src import load_data
+raw_observations_df = load_data.from_yml(PROCESSED_DATA)
 
 # load zotero-based metadata: citations and uses
-zot_df = pd.DataFrame([
-    [
-        entry["doi"] if "doi" in entry.fields_dict else None,
-        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
-        entry["usage"] if "usage" in entry.fields_dict else None,
-        entry["keywords"] if "keywords" in entry.fields_dict else None,
-    ]
-    for entry in bib_sample.entries
-], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+zot_df = prep_data.bib_metadata_df(bib_sample)
 
 # Add WB country grouping definitions (income group, world region)
-WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
-df_country_groups = pd.concat([pd.read_excel(WB_COUNTRY_GROUPS_FILE), pd.DataFrame(data={'Economy':['global'],'Code':['WLD'],'Region':['Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean'], 'Income group':[''], 'Lending category':['']})]).set_index("Economy")
+df_country_groups = prep_data.country_groups_df(Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx"))
 
-def countries_to_regions(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Region']
-            res.add(region)
-    return ";".join(res)
-
-def countries_to_income_groups(countries:str):
-    res = set()
-    for c in countries.replace(" ;", ";").replace("; ",";").split(";"):
-        if c in df_country_groups.index:
-            region = df_country_groups.at[c,'Income group']
-            res.add(region)
-    return ";".join(res)
-
-
-bib_df = (load_data.from_yml(f"{PROCESSED_DATA}")
-    .assign(
-        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
-        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
-        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
-        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
-        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
-        year = lambda _df: _df["date"].dt.year,
-        region = lambda _df: _df["country"].map(countries_to_regions),
-        income_group = lambda _df: _df["country"].map(countries_to_income_groups),
-    )
-    .query("year >= 2000")
+bib_df = prep_data.observations_with_metadata_df(
+    raw_observations = raw_observations_df,
+    study_metadata = zot_df,
+    country_groups = df_country_groups
+)
+raw_observations = None
+zot_df = None
+df_country_groups = None
 
 ```
 
 <!-- pagebreak to separate from TOC -->
src/prep_data.py (new file, +154)
@@ -0,0 +1,154 @@
+import logging
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import bibtexparser
+import pandas as pd
+
+
+@contextmanager
+def all_logging_disabled(highest_level=logging.CRITICAL):
+    previous_level = logging.root.manager.disable
+    logging.disable(highest_level)
+    try:
+        yield
+    finally:
+        logging.disable(previous_level)
+
+
+def bib_library_from_dir(
+    dir: Path, disable_warnings: bool = True
+) -> bibtexparser.Library:
+    bib_string = ""
+    for partial_bib in dir.glob("**/*.bib"):
+        with open(partial_bib) as f:
+            bib_string += "\n".join(f.readlines())
+    if disable_warnings:
+        cm = all_logging_disabled
+    else:
+        cm = nullcontext
+    with cm():
+        return bibtexparser.parse_string(bib_string)
+
+
+def bib_metadata_df(sample: bibtexparser.Library) -> pd.DataFrame:
+    """Returns dataframe with relevant metadata extracted from a bibtex library.
+
+    DataFrame contains: doi, the number of times it has been cited in the
+    literature (WOS statistic), its overall usage, and all keywords it
+    has been assigned (in the reference management software).
+    """
+    return (
+        pd.DataFrame(
+            [
+                [
+                    entry["doi"] if "doi" in entry.fields_dict else None,
+                    entry["times-cited"]
+                    if "times-cited" in entry.fields_dict
+                    else None,
+                    entry["usage"] if "usage" in entry.fields_dict else None,
+                    entry["keywords"] if "keywords" in entry.fields_dict else None,
+                ]
+                for entry in sample.entries
+            ],
+            columns=["doi", "cited", "usage", "keywords"],
+        )
+        .drop_duplicates("doi")
+        .set_index("doi")
+    )
+
+
+def country_groups_df(wb_xlsx_file: Path) -> pd.DataFrame:
+    """Returns a dataframe of WB country group definitions.
+
+    DataFrame contains the region of a country, the country code,
+    its income group, economy group and lending category.
+
+    Contains an additional entry for 'global' with the country
+    code 'WLD' which can be used for non-country specific
+    observations (e.g. wide comparative studies using 30+
+    developing/developed country datasets).
+    """
+    return pd.concat(
+        [
+            pd.read_excel(wb_xlsx_file),
+            pd.DataFrame(
+                data={
+                    "Economy": ["global"],
+                    "Code": ["WLD"],
+                    "Region": [
+                        "Europe & Central Asia;South Asia;North America;East Asia & Pacific;Sub-Saharan Africa;Europe & Central Asia;Latin America & Caribbean"
+                    ],
+                    "Income group": [""],
+                    "Lending category": [""],
+                }
+            ),
+        ]
+    ).set_index("Economy")
+
+
+def observations_with_metadata_df(
+    raw_observations: pd.DataFrame,
+    study_metadata: pd.DataFrame | None = None,
+    country_groups: pd.DataFrame | None = None,
+) -> pd.DataFrame:
+    """Returns observations with metadata and country information attached.
+
+    Returned DataFrame contains the raw observations data with one
+    observation per row, with correct study and WB country grouping metadata
+    attached for each row as well.
+    """
+    df = raw_observations.assign(
+        doi=lambda _df: _df["uri"].str.extract(
+            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
+        ),
+        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year=lambda _df: _df["date"].dt.year,
+    ).query("year >= 2000")
+    if study_metadata is not None:
+        df = df.assign(
+            zot_cited=lambda _df: _df["doi"].map(study_metadata["cited"]),
+            zot_usage=lambda _df: _df["doi"].map(study_metadata["usage"]),
+            zot_keywords=lambda _df: _df["doi"].map(study_metadata["keywords"]),
+        )
+    if country_groups is not None:
+
+        def c_to_region_closure(countries):
+            return countries_to_regions(country_groups, countries)
+
+        def c_to_income_closure(countries):
+            return countries_to_income_groups(country_groups, countries)
+
+        df = df.assign(
+            region=lambda _df: _df["country"].map(c_to_region_closure),
+            income_group=lambda _df: _df["country"].map(c_to_income_closure),
+        )
+    return df
+
+
+def countries_to_regions(country_groups: pd.DataFrame, countries: str) -> str:
+    """Returns the regions for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    res = set()
+    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
+        if c in country_groups.index:
+            region = country_groups.at[c, "Region"]
+            res.add(region)
+    return ";".join(res)
+
+
+def countries_to_income_groups(country_groups: pd.DataFrame, countries: str):
+    """Returns the income groups for countries passed in.
+
+    Return is in semicolon-concatenated string-list ready to be passed into
+    pandas for an apply or mapping method.
+    """
+    res = set()
+    for c in countries.replace(" ;", ";").replace("; ", ";").split(";"):
+        if c in country_groups.index:
+            region = country_groups.at[c, "Income group"]
+            res.add(region)
+    return ";".join(res)
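As a usage note, the semicolon-list convention used by the country-group helpers can be exercised directly. A minimal sketch follows; the workbook path is hypothetical, and 'Kenya'/'India' are assumed to appear as Economy entries in the WB workbook:

```python
from pathlib import Path

from src import prep_data

groups = prep_data.country_groups_df(
    Path("data/supplementary/wb-country-groupings.xlsx")  # hypothetical path
)

# Countries arrive as one ";"-separated string; stray spaces around the
# separator are stripped before lookup, and results are collected in a set,
# so duplicates collapse and the output order is not guaranteed.
print(prep_data.countries_to_regions(groups, "Kenya; India"))
# e.g. "Sub-Saharan Africa;South Asia"

# The synthetic 'global' row (code 'WLD') maps to the concatenation of all
# regions, so non-country-specific observations still receive region metadata.
print(prep_data.countries_to_regions(groups, "global"))
```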