dotfiles/qutebrowser/data/userscripts/doi2scihub
Marty Oehme ab4a0154c6
qutebrowser: Fix scihub script to use wikipedia
When looking for the most up-to-date link to grab a scihub url
we used to use wikiless but now simply directly scrape from
wikipedia.
Functionality is not changed but should work much faster and
more reliably.
2024-04-19 11:30:12 +02:00

116 lines
3.7 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Goes to the sci-hub page for the current article, based on DOI.
Based on the work in
https://github.com/cadadr/configuration/blob/4b6a241d04d113f322b960890a0d0a0ab783a7b3/dotfiles/qutebrowser/userscripts/doi
with much gratitude.
The program can be invoked with DOI on a page selected, through the hinting mode when selecting a DOI link or on a publisher page (any page where doi meta-tags are set) - works on ScienceDirect, Taylor&Francis, Springer, etc.
That means you can give it a doi through a link or on the current page, for example with the following mappings:
```python
config.bind('"p', "spawn --userscript doi2scihub")
config.bind(';p', "hint links userscript doi2scihub")
```
You can also pass the doi as the (only) argument to the userscript:
```
:spawn --userscript doi2scihub https://doi.org/10.37394/23207.2021.18.68
```
The sci-hub address is looked up on every run from the link listed on the Sci-Hub Wikipedia page, falling back to https://sci-hub.ru if no link is found.
"""
import os
import re
import sys
import html.parser
import requests
mode = os.getenv("QUTE_MODE")
text = None
class DoiTagParser(html.parser.HTMLParser):
doi = None
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if self.doi == None and tag == "meta":
if (
("name", "citation_doi") in attrs
or ("name", "dc.identifier") in attrs
or ("scheme", "doi") in attrs
):
for att in attrs:
if att[0] == "content":
self.doi = att[1]
break
class SciHubLinkParser(html.parser.HTMLParser):
current = None
link_patt = re.compile(r"^(?P<url>https?://sci-hub\..+)/about$")
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if self.current == None and tag == "a":
for att in attrs:
if att[0] == "href" and self.link_patt.match(att[1] or ""):
match = self.link_patt.match(att[1] or "")
self.current = match["url"] if match and match["url"] else None
def get_scihub_url(
    wiki_page: str = "https://en.wikipedia.org/wiki/Sci-Hub",
    timeout: float = 10.0,
) -> str:
    """Return the base URL of a sci-hub mirror.

    The mirror is scraped from the first ``sci-hub.*`` link found on
    *wiki_page*.  If the page cannot be fetched (network error, timeout,
    HTTP error status) or contains no such link, ``https://sci-hub.ru`` is
    returned instead.

    Args:
        wiki_page: URL of the page to scrape for the mirror link.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The mirror's base URL without a trailing slash.
    """
    fallback = "https://sci-hub.ru"

    try:
        # Without a timeout a stalled connection would block indefinitely.
        resp = requests.get(wiki_page, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        # Best effort: a failed lookup must not prevent opening the article.
        return fallback

    parser = SciHubLinkParser()
    parser.feed(resp.text)
    return parser.current or fallback
# use doi argument if we got one
if len(sys.argv) > 1:
text = sys.argv[1]
# use the hinted url
elif mode == "hints":
text = os.getenv("QUTE_URL", "").strip()
# use the current selection
elif mode == "command" and os.getenv("QUTE_SELECTED_TEXT"):
text = os.getenv("QUTE_SELECTED_TEXT", "").strip()
# just try to find a doi on current page
elif os.getenv("QUTE_HTML"):
with open(os.getenv("QUTE_HTML", ""), "r") as source:
parser = DoiTagParser()
parser.feed(source.read())
text = parser.doi
with open(os.getenv("QUTE_FIFO", ""), "w") as fifo:
if not text:
fifo.write(f'message-warning "Could not find a valid DOI"')
sys.exit()
# DOI syntax: https://www.doi.org/doi_handbook/2_Numbering.html#2.2.
#
# Note that this probably matches a subset of possible DOIs, as it
# seems that theres no practical limitation on neither the length nor
# the contents of the DOI. But IMHO this is a healthy subset.
doi_re = re.compile(
# match possible URI prefix
r"(?P<blah>((https?)?://)?doi\.org/)?"
# match actual DOI
r"(?P<meat>[a-zA-Z0-9\./\-_]+)"
)
match = doi_re.match(text)
if match is None or match["meat"] is None:
fifo.write(
f"message-warning \"'{text}' is probably not a DOI, or update regexp\""
)
else:
url = get_scihub_url()
doi = match["meat"]
fifo.write(f"open -t {url}/{doi}")