dotfiles/qutebrowser/data/userscripts/doi2scihub

#!/usr/bin/env python3
"""
Goes to the sci-hub page for the current article, based on DOI.


Based on the work in
https://github.com/cadadr/configuration/blob/4b6a241d04d113f322b960890a0d0a0ab783a7b3/dotfiles/qutebrowser/userscripts/doi
with much gratitude.

The script can be invoked with a DOI selected on the page, through hinting mode on a DOI link, or on a publisher page (any page where DOI meta tags are set) - works on ScienceDirect, Taylor&Francis, Springer, etc.
That means you can give it a DOI through a selection, a link or the current page, for example with the following mappings:

```python
config.bind('"p', "spawn --userscript doi2scihub")
config.bind(';p', "hint links userscript doi2scihub")
```

You can also pass the doi as the (only) argument to the userscript:

```
:spawn --userscript doi2scihub https://doi.org/10.37394/23207.2021.18.68
```

Updates its sci-hub link based on the one listed on sci-hub wiki page.
"""


import html.parser
import os
import re
import sys
import urllib.parse

import requests


# Value of the QUTE_MODE environment variable; None when it is not set.
mode = os.environ.get("QUTE_MODE")

# Text in which a DOI will be looked for; None until a source provides one.
text = None

class DoiTagParser(html.parser.HTMLParser):
    doi = None

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self.doi == None and tag == "meta":
            if (
                ("name", "citation_doi") in attrs
                or ("name", "dc.identifier") in attrs
                or ("scheme", "doi") in attrs
            ):
                for att in attrs:
                    if att[0] == "content":
                        self.doi = att[1]
                        break


class SciHubLinkParser(html.parser.HTMLParser):
    current = None
    link_patt = re.compile(r"^(?P<url>https?://sci-hub\..+)/about$")

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self.current == None and tag == "a":
            for att in attrs:
                if att[0] == "href" and self.link_patt.match(att[1] or ""):
                    match = self.link_patt.match(att[1] or "")
                    self.current = match["url"] if match and match["url"] else None


def get_scihub_url(
    wiki_page: str = "https://en.wikipedia.org/wiki/Sci-Hub",
    timeout: float = 10.0,
) -> str:
    """Return the sci-hub base URL listed on *wiki_page*.

    The page is downloaded and scanned with ``SciHubLinkParser``.

    Args:
        wiki_page: URL of the page to scan for a sci-hub link.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The base URL found on the page, or ``"https://sci-hub.ru"`` when the
        page cannot be fetched or contains no matching link.
    """
    fallback = "https://sci-hub.ru"
    try:
        # Without a timeout requests waits indefinitely, which would leave
        # the userscript hanging on a bad connection.
        resp = requests.get(wiki_page, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        # Being offline or blocked should not prevent opening the article;
        # fall back to the hard-coded mirror instead of crashing.
        return fallback

    parser = SciHubLinkParser()
    parser.feed(resp.text)
    return parser.current or fallback


# use doi argument if we got one
if len(sys.argv) > 1:
    text = sys.argv[1]
# use the hinted url
elif mode == "hints":
    text = os.getenv("QUTE_URL", "").strip()
# use the current selection
elif mode == "command" and os.getenv("QUTE_SELECTED_TEXT"):
    text = os.getenv("QUTE_SELECTED_TEXT", "").strip()
# just try to find a doi on current page
elif os.getenv("QUTE_HTML"):
    with open(os.getenv("QUTE_HTML", ""), "r") as source:
        parser = DoiTagParser()
        parser.feed(source.read())
        text = parser.doi

with open(os.getenv("QUTE_FIFO", ""), "w") as fifo:
    if not text:
        fifo.write(f'message-warning "Could not find a valid DOI"')
        sys.exit()

    # DOI syntax: https://www.doi.org/doi_handbook/2_Numbering.html#2.2.
    #
    # Note that this probably matches a subset of possible DOIs, as it
    # seems that there’s no practical limitation on neither the length nor
    # the contents of the DOI.  But IMHO this is a healthy subset.
    doi_re = re.compile(
        # match possible URI prefix
        r"(?P<blah>((https?)?://)?doi\.org/)?"
        # match actual DOI
        r"(?P<meat>[a-zA-Z0-9\./\-_]+)"
    )

    match = doi_re.match(text)
    if match is None or match["meat"] is None:
        fifo.write(
            f"message-warning \"'{text}' is probably not a DOI, or update regexp\""
        )

    else:
        url = get_scihub_url()
        doi = match["meat"]
        fifo.write(f"open -t {url}/{doi}")