#!/usr/bin/env python3 """ Goes to the sci-hub page for the current article, based on DOI. Based on the work in https://github.com/cadadr/configuration/blob/4b6a241d04d113f322b960890a0d0a0ab783a7b3/dotfiles/qutebrowser/userscripts/doi with much gratitude. The program can be invoked with DOI on a page selected, through the hinting mode when selecting a DOI link or on a publisher page (any page where doi meta-tags are set) - works on ScienceDirect, Taylor&Francis, Springer, etc. That means you can give it a doi through a link or on the current page, for example with the following mappings: ```python config.bind('"p', "spawn --userscript doi2scihub") config.bind(';p', "hint links userscript doi2scihub") ``` You can also pass the doi as the (only) argument to the userscript: ``` :spawn --userscript doi2scihub https://doi.org/10.37394/23207.2021.18.68 ` Updates its sci-hub link based on the one listed on sci-hub wiki page. """ import os import re import sys import html.parser import requests mode = os.getenv("QUTE_MODE") text = None class DoiTagParser(html.parser.HTMLParser): doi = None def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: if self.doi == None and tag == "meta": if ( ("name", "citation_doi") in attrs or ("name", "dc.identifier") in attrs or ("scheme", "doi") in attrs ): for att in attrs: if att[0] == "content": self.doi = att[1] break class SciHubLinkParser(html.parser.HTMLParser): current = None link_patt = re.compile(r"^(?Phttps?://sci-hub\..+)/about$") def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: if self.current == None and tag == "a": for att in attrs: if att[0] == "href" and self.link_patt.match(att[1] or ""): match = self.link_patt.match(att[1] or "") self.current = match["url"] if match and match["url"] else None def get_scihub_url(wiki_page: str = "https://wikiless.org/wiki/Sci-Hub"): resp = requests.get(wiki_page) parser = SciHubLinkParser() parser.feed(resp.text) return parser.current or "https://sci-hub.ru" # use doi argument if we got one if len(sys.argv) > 1: text = sys.argv[1] # use the hinted url elif mode == "hints": text = os.getenv("QUTE_URL", "").strip() # use the current selection elif mode == "command" and os.getenv("QUTE_SELECTED_TEXT"): text = os.getenv("QUTE_SELECTED_TEXT", "").strip() # just try to find a doi on current page elif os.getenv("QUTE_HTML"): with open(os.getenv("QUTE_HTML", ""), "r") as source: parser = DoiTagParser() parser.feed(source.read()) text = parser.doi with open(os.getenv("QUTE_FIFO", ""), "w") as fifo: if not text: fifo.write(f'message-warning "Could not find a valid DOI"') sys.exit() # DOI syntax: https://www.doi.org/doi_handbook/2_Numbering.html#2.2. # # Note that this probably matches a subset of possible DOIs, as it # seems that there’s no practical limitation on neither the length nor # the contents of the DOI. But IMHO this is a healthy subset. doi_re = re.compile( # match possible URI prefix r"(?P((https?)?://)?doi\.org/)?" # match actual DOI r"(?P[a-zA-Z0-9\./\-_]+)" ) match = doi_re.match(text) if match is None or match["meat"] is None: fifo.write( f"message-warning \"'{text}' is probably not a DOI, or update regexp\"" ) else: url = get_scihub_url() doi = match["meat"] fifo.write(f"open -t {url}/{doi}")