Marty Oehme
ab4a0154c6
When looking up the most current Sci-Hub URL we used to go through Wikiless, but now we scrape it directly from Wikipedia. Functionality is unchanged, but it should work much faster and more reliably.
#!/usr/bin/env python3
"""
Goes to the sci-hub page for the current article, based on its DOI.

Based on the work in
https://github.com/cadadr/configuration/blob/4b6a241d04d113f322b960890a0d0a0ab783a7b3/dotfiles/qutebrowser/userscripts/doi
with much gratitude.

The program can be invoked with a DOI selected on a page, through hinting mode
when selecting a DOI link, or on a publisher page (any page where DOI meta
tags are set) - this works on ScienceDirect, Taylor&Francis, Springer, etc.
That means you can give it a DOI through a link or on the current page, for
example with the following mappings:

```python
config.bind('"p', "spawn --userscript doi2scihub")
config.bind(';p', "hint links userscript doi2scihub")
```

You can also pass the DOI as the (only) argument to the userscript:

```
:spawn --userscript doi2scihub https://doi.org/10.37394/23207.2021.18.68
```

Updates its sci-hub link based on the one listed on the Sci-Hub Wikipedia
page.
"""


import os
import re
import sys
import html.parser

import requests

mode = os.getenv("QUTE_MODE")

text = None


class DoiTagParser(html.parser.HTMLParser):
    doi = None

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self.doi is None and tag == "meta":
            if (
                ("name", "citation_doi") in attrs
                or ("name", "dc.identifier") in attrs
                or ("scheme", "doi") in attrs
            ):
                for att in attrs:
                    if att[0] == "content":
                        self.doi = att[1]
                        break


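# A minimal illustration of the parser above (hypothetical DOI value, not
# executed by this script): feeding a publisher page's <head> with a
# citation meta tag surfaces the DOI.
#
#   parser = DoiTagParser()
#   parser.feed('<meta name="citation_doi" content="10.1000/example.123">')
#   parser.doi  # -> "10.1000/example.123"

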
class SciHubLinkParser(html.parser.HTMLParser):
    current = None
    link_patt = re.compile(r"^(?P<url>https?://sci-hub\..+)/about$")

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self.current is None and tag == "a":
            for att in attrs:
                if att[0] == "href":
                    match = self.link_patt.match(att[1] or "")
                    if match and match["url"]:
                        self.current = match["url"]


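# A minimal illustration of the parser above (hypothetical mirror domain, not
# executed by this script): any anchor pointing at an /about page on a
# sci-hub domain yields the base url.
#
#   parser = SciHubLinkParser()
#   parser.feed('<a href="https://sci-hub.example/about">about</a>')
#   parser.current  # -> "https://sci-hub.example"

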
def get_scihub_url(wiki_page: str = "https://en.wikipedia.org/wiki/Sci-Hub"):
    resp = requests.get(wiki_page)
    parser = SciHubLinkParser()
    parser.feed(resp.text)
    return parser.current or "https://sci-hub.ru"


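# Example call (the result depends on the live Wikipedia article; the domain
# shown is illustrative):
#
#   get_scihub_url()  # -> e.g. "https://sci-hub.example", falling back to
#                     #    "https://sci-hub.ru" when no link is found

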
# use doi argument if we got one
if len(sys.argv) > 1:
    text = sys.argv[1]
# use the hinted url
elif mode == "hints":
    text = os.getenv("QUTE_URL", "").strip()
# use the current selection
elif mode == "command" and os.getenv("QUTE_SELECTED_TEXT"):
    text = os.getenv("QUTE_SELECTED_TEXT", "").strip()
# just try to find a doi on current page
elif os.getenv("QUTE_HTML"):
    with open(os.getenv("QUTE_HTML", ""), "r") as source:
        parser = DoiTagParser()
        parser.feed(source.read())
        text = parser.doi

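# qutebrowser executes any command written to the QUTE_FIFO pipe, so the
# script reports its result back through it below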
with open(os.getenv("QUTE_FIFO", ""), "w") as fifo:
    if not text:
        fifo.write('message-warning "Could not find a valid DOI"')
        sys.exit()

    # DOI syntax: https://www.doi.org/doi_handbook/2_Numbering.html#2.2.
    #
    # Note that this probably matches a subset of possible DOIs, as it
    # seems that there is no practical limitation on either the length or
    # the contents of the DOI. But IMHO this is a healthy subset.
    doi_re = re.compile(
        # match possible URI prefix
        r"(?P<blah>((https?)?://)?doi\.org/)?"
        # match actual DOI
        r"(?P<meat>[a-zA-Z0-9\./\-_]+)"
    )

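    # Illustrative matches (the first value is the docstring example, the
    # second is hypothetical):
    #
    #   "https://doi.org/10.37394/23207.2021.18.68" -> meat: "10.37394/23207.2021.18.68"
    #   "10.1000/example.123"                       -> meat: "10.1000/example.123"
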
    match = doi_re.match(text)
    if match is None or match["meat"] is None:
        fifo.write(
            f"message-warning \"'{text}' is probably not a DOI, or update regexp\""
        )
    else:
        url = get_scihub_url()
        doi = match["meat"]
        fifo.write(f"open -t {url}/{doi}")