chore(script): Refactor dataframe loading code

Improved the readability of the dataframe loading code by using method chaining
and a list comprehension, making it much less messy.
This commit is contained in:
Marty Oehme 2023-12-09 23:38:08 +01:00
parent 3f05283f6d
commit 8e7f99b20d
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A

View file

@ -55,6 +55,42 @@ for partial_bib in WORKING_DATA.glob("**/*.bib"):
bib_sample = bibtexparser.parse_string(bib_string) bib_sample = bibtexparser.parse_string(bib_string)
``` ```
```{python}
# load relevant studies
from src import data


# NOTE(review): the pre-refactor loader read the "usage-count-since-2013"
# field where this one reads "usage" -- confirm the bib files carry "usage".
def _zot_row(entry):
    """Return [doi, times-cited, usage, keywords] for one bibtex entry.

    A field that is absent from the entry is returned as None.
    """
    # read fields_dict once per entry rather than once (or twice) per field
    fields = entry.fields_dict
    return [
        fields[key].value if key in fields else None
        for key in ("doi", "times-cited", "usage", "keywords")
    ]


# load zotero-based metadata: citations and uses
zot_df = (
    pd.DataFrame(
        [_zot_row(entry) for entry in bib_sample.entries],
        columns=["doi", "cited", "usage", "keywords"],
    )
    # Entries without a DOI can never be matched to a study. Without this
    # step drop_duplicates() would collapse them into a single row whose
    # index label is a missing value, which may then be picked up by studies
    # that have no DOI either.
    .dropna(subset=["doi"])
    .drop_duplicates("doi")
    .set_index("doi")
)

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = (
    Path(SUPPLEMENTARY_DATA) / "wb-country-groupings.xlsx"
).resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")

# assign() evaluates its keyword arguments in order, so "doi" exists before
# the zot_* lookups run and "date" exists before "year" is derived from it.
bib_df = (
    data.from_yml(f"{PROCESSED_DATA}/relevant")
    .assign(
        doi=lambda _df: _df["uri"].str.extract(
            r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
        ),
        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
        date=lambda _df: pd.to_datetime(_df["year"], format="%Y"),
        year=lambda _df: _df["date"].dt.year,
        region=lambda _df: _df["country"].map(df_country_groups["Region"]),
        income_group=lambda _df: _df["country"].map(
            df_country_groups["Income group"]
        ),
    )
    # only keep newer entries
    .query("year >= 2000")
)

# drop the references to the lookup frames; the names stay defined (as None)
zot_df = None
df_country_groups = None
```
# Introduction # Introduction
This section will introduce the reader to the concern of inequality in the World of Work (WoW), This section will introduce the reader to the concern of inequality in the World of Work (WoW),
@ -363,7 +399,6 @@ It restricts studies to those that comprise primary research published after 200
with a focus on the narrowing criteria specified in @tbl-inclusion-criteria. with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
```{python} ```{python}
#| echo: false
#| label: tbl-inclusion-criteria #| label: tbl-inclusion-criteria
#| tbl-cap: Study inclusion and exclusion scoping criteria {#tbl-inclusion-criteria} #| tbl-cap: Study inclusion and exclusion scoping criteria {#tbl-inclusion-criteria}
@ -447,41 +482,6 @@ with small decreases between 2001 and 2008, as well as more significant ones in
as can be seen in @fig-publications-per-year. as can be seen in @fig-publications-per-year.
Keeping in mind that these results are not yet screened for their full relevance to the topic at hand, so far only being *potentially* relevant in falling into the requirements of the search pattern, an increased results output does not necessarily mean a clearly rising amount of relevant literature. Keeping in mind that these results are not yet screened for their full relevance to the topic at hand, so far only being *potentially* relevant in falling into the requirements of the search pattern, an increased results output does not necessarily mean a clearly rising amount of relevant literature.
```{python}
# load relevant studies
from src import data

bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")

# load zotero-based metadata: one row per entry, with None for any field
# the entry does not carry
reformatted = [
    [
        fields.get(key, Field(key=key, value=None)).value
        for key in ("doi", "times-cited", "usage-count-since-2013", "keywords")
    ]
    for fields in (entry.fields_dict for entry in sample_relevant)
]
zot_df = pd.DataFrame(reformatted, columns=["doi", "cited", "usage", "keywords"])

# DOI-indexed view of the metadata, built once and shared by all lookups
zot_by_doi = zot_df.drop_duplicates("doi").set_index("doi")

# the DOI is the part of the study's uri after the doi.org host
bib_df["doi"] = bib_df["uri"].str.extract(
    r"https?://(?:dx\.)?doi\.org/(.*)", expand=False
)
for column in ("cited", "usage", "keywords"):
    bib_df[f"zot_{column}"] = bib_df["doi"].map(zot_by_doi[column])

bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
bib_df["year"] = bib_df["date"].dt.year

# only keep newer entries
bib_df = bib_df.loc[bib_df["year"] >= 2000]

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
groups_by_economy = df_country_groups.set_index("Economy")
bib_df["income group"] = bib_df["country"].map(groups_by_economy["Income group"])
bib_df["region"] = bib_df["country"].map(groups_by_economy["Region"])
```
<!-- TODO Should this be sub-divided by region or subdivision later per-section? --> <!-- TODO Should this be sub-divided by region or subdivision later per-section? -->
```{python} ```{python}
#| label: fig-publications-per-year #| label: fig-publications-per-year