chore(script): Refactor dataframe loading code
Improved the readability of the dataframe loading code by using method chaining and a list comprehension, which makes it much less messy.
This commit is contained in:
parent
3f05283f6d
commit
8e7f99b20d
1 changed files with 36 additions and 36 deletions
|
@ -55,6 +55,42 @@ for partial_bib in WORKING_DATA.glob("**/*.bib"):
|
|||
bib_sample = bibtexparser.parse_string(bib_string)
|
||||
```
|
||||
|
||||
```{python}
|
||||
# load relevant studies
|
||||
from src import data
|
||||
|
||||
# Load Zotero-based metadata (citation count, usage count, keywords) into a
# DOI-indexed lookup table. Each row holds the values of the listed BibTeX
# fields for one entry; a field that is absent from an entry becomes None.
zot_df = (
    pd.DataFrame(
        [
            [
                fields[key].value if key in fields else None
                # "usage-count-since-2013" is the field the pre-refactor
                # loader read for the "usage" column; a bare "usage" key
                # would leave that column empty.
                for key in ("doi", "times-cited", "usage-count-since-2013", "keywords")
            ]
            # Evaluate fields_dict once per entry and reuse it for all keys.
            for fields in (entry.fields_dict for entry in bib_sample.entries)
        ],
        columns=["doi", "cited", "usage", "keywords"],
    )
    # An entry without a DOI can never be matched by DOI, so keep it out of
    # the index instead of letting a missing value act as a lookup key.
    .dropna(subset=["doi"])
    .drop_duplicates("doi")
    .set_index("doi")
)
|
||||
|
||||
# World Bank country groupings (income group, world region): read the
# spreadsheet, then index it by the "Economy" column for later lookups.
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
df_country_groups = df_country_groups.set_index("Economy")
|
||||
|
||||
# Load the relevant studies, then enrich them one step at a time.
bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")

# Pull the DOI out of each study's (dx.)doi.org URI.
bib_df = bib_df.assign(
    doi=bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)
)

# Map the Zotero metadata onto the studies by DOI and parse the publication
# year into a datetime column.
bib_df = bib_df.assign(
    zot_cited=bib_df["doi"].map(zot_df["cited"]),
    zot_usage=bib_df["doi"].map(zot_df["usage"]),
    zot_keywords=bib_df["doi"].map(zot_df["keywords"]),
    date=pd.to_datetime(bib_df["year"], format="%Y"),
)

# Re-derive the year from the parsed date and map the World Bank groupings
# onto the studies by country.
# NOTE(review): the column is named "income_group" here; confirm that later
# chunks do not still expect a column called "income group".
bib_df = bib_df.assign(
    year=bib_df["date"].dt.year,
    region=bib_df["country"].map(df_country_groups["Region"]),
    income_group=bib_df["country"].map(df_country_groups["Income group"]),
)

# Keep only studies from the year 2000 onwards.
bib_df = bib_df.query("year >= 2000")

# Drop the references to the lookup tables; they are not used below this point
# in this chunk.
zot_df = None
df_country_groups = None
|
||||
```
|
||||
|
||||
# Introduction
|
||||
|
||||
This section will introduce the reader to the concern of inequality in the World of Work (WoW),
|
||||
|
@ -363,7 +399,6 @@ It restricts studies to those that comprise primary research published after 200
|
|||
with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
|
||||
|
||||
```{python}
|
||||
#| echo: false
|
||||
#| label: tbl-inclusion-criteria
|
||||
#| tbl-cap: Study inclusion and exclusion scoping criteria {#tbl-inclusion-criteria}
|
||||
|
||||
|
@ -447,41 +482,6 @@ with small decreases between 2001 and 2008, as well as more significant ones in
|
|||
as can be seen in @fig-publications-per-year.
|
||||
Keep in mind that these results have not yet been screened for their full relevance to the topic at hand; so far they are only *potentially* relevant in that they match the search pattern. An increased number of results therefore does not necessarily mean a clearly rising amount of relevant literature.
|
||||
|
||||
```{python}
|
||||
# load relevant studies
|
||||
from src import data
|
||||
bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant")

# Load Zotero-based metadata: one row per entry holding the values of the
# listed BibTeX fields, with None wherever an entry lacks the field.
reformatted = [
    [
        fields[key].value if key in fields else None
        for key in ("doi", "times-cited", "usage-count-since-2013", "keywords")
    ]
    for fields in (entry.fields_dict for entry in sample_relevant)
]
zot_df = pd.DataFrame(reformatted, columns=["doi", "cited", "usage", "keywords"])
|
||||
|
||||
# Pull the DOI out of each study's (dx.)doi.org URI.
bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False)

# Build the DOI-indexed lookup once instead of de-duplicating and re-indexing
# zot_df again for every mapped column.
zot_by_doi = zot_df.drop_duplicates("doi").set_index("doi")
for column in ("cited", "usage", "keywords"):
    bib_df[f"zot_{column}"] = bib_df["doi"].map(zot_by_doi[column])

# Parse the publication year into a datetime and re-derive the year from it.
bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y")
bib_df["year"] = bib_df["date"].dt.year

# only keep newer entries
# .copy() gives an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
bib_df = bib_df[bib_df["year"] >= 2000].copy()

# Add WB country grouping definitions (income group, world region)
WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE)
# Index by economy once and reuse the result for both lookups.
groups_by_economy = df_country_groups.set_index("Economy")
bib_df["income group"] = bib_df["country"].map(groups_by_economy["Income group"])
bib_df["region"] = bib_df["country"].map(groups_by_economy["Region"])
|
||||
```
|
||||
|
||||
<!-- TODO Should this be sub-divided by region or subdivision later per-section? -->
|
||||
```{python}
|
||||
#| label: fig-publications-per-year
|
||||
|
|
Loading…
Reference in a new issue