From 8e7f99b20d17500fea243bcf4fb8aa55ce825f0f Mon Sep 17 00:00:00 2001
From: Marty Oehme
Date: Sat, 9 Dec 2023 23:38:08 +0100
Subject: [PATCH] chore(script): Refactor dataframe loading code

Improved readability of dataframe loading, used improved chaining and
some list comprehension to make it much less messy.
---
 scoping_review.qmd | 72 +++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/scoping_review.qmd b/scoping_review.qmd
index 8f27091..e2618f1 100644
--- a/scoping_review.qmd
+++ b/scoping_review.qmd
@@ -55,6 +55,42 @@ for partial_bib in WORKING_DATA.glob("**/*.bib"):
 bib_sample = bibtexparser.parse_string(bib_string)
 ```
 
+```{python}
+# load relevant studies
+from src import data
+
+# load zotero-based metadata: citations and uses (bib field "usage-count-since-2013"), one row per DOI
+zot_df = pd.DataFrame([
+    [
+        entry["doi"] if "doi" in entry.fields_dict else None,
+        entry["times-cited"] if "times-cited" in entry.fields_dict else None,
+        entry["usage-count-since-2013"] if "usage-count-since-2013" in entry.fields_dict else None,
+        entry["keywords"] if "keywords" in entry.fields_dict else None,
+    ]
+    for entry in bib_sample.entries
+], columns = ["doi", "cited", "usage", "keywords"]).drop_duplicates("doi").set_index("doi")
+
+# Add WB country grouping definitions (income group, world region), indexed by economy name
+WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve()
+df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE).set_index("Economy")
+
+bib_df = (data.from_yml(f"{PROCESSED_DATA}/relevant")
+    .assign(
+        doi=lambda _df: _df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False),
+        zot_cited=lambda _df: _df["doi"].map(zot_df["cited"]),
+        zot_usage=lambda _df: _df["doi"].map(zot_df["usage"]),
+        zot_keywords=lambda _df: _df["doi"].map(zot_df["keywords"]),
+        date = lambda _df: pd.to_datetime(_df["year"], format="%Y"),
+        year = lambda _df: _df["date"].dt.year,
+        region = lambda _df: _df["country"].map(df_country_groups["Region"]),
+        income_group = lambda _df: _df["country"].map(df_country_groups["Income group"]),
+    )
+    .query("year >= 2000")  # only keep newer entries
+)
+zot_df = None  # drop references to the lookup frames used above
+df_country_groups = None
+```
+
 # Introduction
 
 This section will introduce the reader to the concern of inequality in the World of Work (WoW),
@@ -363,7 +399,6 @@ It restricts studies to those that comprise primary research published after 200
 with a focus on the narrowing criteria specified in @tbl-inclusion-criteria.
 
 ```{python}
-#| echo: false
 #| label: tbl-inclusion-criteria
 #| tbl-cap: Study inclusion and exclusion scoping criteria {#tbl-inclusion-criteria}
 
@@ -447,41 +482,6 @@ with small decreases between 2001 and 2008, as well as more significant ones in
 as can be seen in @fig-publications-per-year.
 Keeping in mind that these results are not yet screened for their full relevance to the topic at hand, so far only being *potentially* relevant in falling into the requirements of the search pattern, an increased results output does not necessarily mean a clearly rising amount of relevant literature.
 
-```{python} -# load relevant studies -from src import data -bib_df = data.from_yml(f"{PROCESSED_DATA}/relevant") - -# load zotero-based metadata -reformatted = [] -for e in sample_relevant: - ed = e.fields_dict - reformatted.append([ - ed.get("doi", Field(key="doi", value=None)).value, - ed.get("times-cited", Field(key="times-cited", value=None)).value, - ed.get("usage-count-since-2013", Field(key="usage-count-since-2013", value=None)).value, - ed.get("keywords", Field(key="keywords", value=None)).value, - ]) -zot_df = pd.DataFrame(reformatted, columns = ["doi", "cited", "usage", "keywords"]) - -bib_df["doi"] = bib_df["uri"].str.extract(r"https?://(?:dx\.)?doi\.org/(.*)", expand=False) -bib_df["zot_cited"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["cited"]) -bib_df["zot_usage"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["usage"]) -bib_df["zot_keywords"] = bib_df["doi"].map(zot_df.drop_duplicates("doi").set_index("doi")["keywords"]) - -bib_df["date"] = pd.to_datetime(bib_df["year"], format="%Y") -bib_df["year"] = bib_df["date"].dt.year - -# only keep newer entries -bib_df = bib_df[bib_df["year"] >= 2000] - -# Add WB country grouping definitions (income group, world region) -WB_COUNTRY_GROUPS_FILE = Path(f"{SUPPLEMENTARY_DATA}/wb-country-groupings.xlsx").resolve() -df_country_groups = pd.read_excel(WB_COUNTRY_GROUPS_FILE) -bib_df["income group"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Income group"]) -bib_df["region"] = bib_df["country"].map(df_country_groups.set_index("Economy")["Region"]) -``` - ```{python} #| label: fig-publications-per-year