Marty Oehme
8681d34946
Since we now use dotter we can simplify the dir structure for qutebrowser a lot. Everything dot-filed earlier can now reside in simple directories called config (for ~/.config/qutebrowser), data (for ~/.local/share/qutebrowser), and scripts (for ~/.local/bin) files.
116 lines
3.7 KiB
Python
Executable file
116 lines
3.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Goes to the sci-hub page for the current article, based on DOI.
|
||
|
||
|
||
Based on the work in
|
||
https://github.com/cadadr/configuration/blob/4b6a241d04d113f322b960890a0d0a0ab783a7b3/dotfiles/qutebrowser/userscripts/doi
|
||
with much gratitude.
|
||
|
||
The program can be invoked with DOI on a page selected, through the hinting mode when selecting a DOI link or on a publisher page (any page where doi meta-tags are set) - works on ScienceDirect, Taylor&Francis, Springer, etc.
|
||
That means you can give it a doi through a link or on the current page, for example with the following mappings:
|
||
|
||
```python
|
||
config.bind('"p', "spawn --userscript doi2scihub")
|
||
config.bind(';p', "hint links userscript doi2scihub")
|
||
```
|
||
|
||
You can also pass the doi as the (only) argument to the userscript:
|
||
|
||
```
|
||
:spawn --userscript doi2scihub https://doi.org/10.37394/23207.2021.18.68
|
||
```
|
||
|
||
Updates its Sci-Hub link based on the one listed on the Sci-Hub wiki page.
|
||
"""
|
||
|
||
|
||
import os
|
||
import re
|
||
import sys
|
||
import html.parser
|
||
import requests
|
||
|
||
|
||
mode = os.getenv("QUTE_MODE")
|
||
|
||
text = None
|
||
|
||
class DoiTagParser(html.parser.HTMLParser):
|
||
doi = None
|
||
|
||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||
if self.doi == None and tag == "meta":
|
||
if (
|
||
("name", "citation_doi") in attrs
|
||
or ("name", "dc.identifier") in attrs
|
||
or ("scheme", "doi") in attrs
|
||
):
|
||
for att in attrs:
|
||
if att[0] == "content":
|
||
self.doi = att[1]
|
||
break
|
||
|
||
|
||
class SciHubLinkParser(html.parser.HTMLParser):
|
||
current = None
|
||
link_patt = re.compile(r"^(?P<url>https?://sci-hub\..+)/about$")
|
||
|
||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||
if self.current == None and tag == "a":
|
||
for att in attrs:
|
||
if att[0] == "href" and self.link_patt.match(att[1] or ""):
|
||
match = self.link_patt.match(att[1] or "")
|
||
self.current = match["url"] if match and match["url"] else None
|
||
|
||
|
||
def get_scihub_url(
    wiki_page: str = "https://wikiless.org/wiki/Sci-Hub",
    timeout: float = 10.0,
) -> str:
    """Return the base URL of a Sci-Hub mirror.

    Downloads ``wiki_page`` and scans it with ``SciHubLinkParser``. When the
    page cannot be fetched or no mirror link is found in it, a hard-coded
    default mirror is returned instead.

    Args:
        wiki_page: URL of the page that lists the current Sci-Hub address.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The mirror base URL without a trailing slash.
    """
    fallback = "https://sci-hub.ru"
    try:
        # Without a timeout a stalled server would block the script forever.
        resp = requests.get(wiki_page, timeout=timeout)
    except requests.RequestException:
        # The lookup is best-effort; an unreachable page must not abort the
        # script, so fall back to the default mirror.
        return fallback

    parser = SciHubLinkParser()
    parser.feed(resp.text)
    return parser.current or fallback
|
||
|
||
|
||
# Decide where the DOI text comes from; the first source that applies wins.
if len(sys.argv) > 1:
    # an explicit argument to the userscript
    text = sys.argv[1].strip()
elif mode == "hints":
    # the url of the hinted link
    text = os.getenv("QUTE_URL", "").strip()
elif mode == "command" and os.getenv("QUTE_SELECTED_TEXT"):
    # the current selection
    text = os.getenv("QUTE_SELECTED_TEXT", "").strip()
elif os.getenv("QUTE_HTML"):
    # otherwise look for a doi meta-tag in the dump of the current page
    # NOTE(review): the dump is presumably UTF-8 - confirm against the
    # qutebrowser docs. Undecodable bytes are replaced rather than raising,
    # because only the meta tags matter here.
    with open(
        os.getenv("QUTE_HTML", ""), "r", encoding="utf-8", errors="replace"
    ) as source:
        parser = DoiTagParser()
        parser.feed(source.read())
        text = parser.doi
|
||
|
||
# DOI syntax: https://www.doi.org/doi_handbook/2_Numbering.html#2.2.
#
# The pattern looks for a DOI-shaped token ("10.", a 4-9 digit registrant
# code, "/", then a suffix) anywhere in the input. Searching rather than
# matching from the start means "https://doi.org/10.x/y",
# "https://dx.doi.org/10.x/y" and "doi:10.x/y" all yield the bare DOI, while
# text without such a token is rejected. This covers a subset of all possible
# DOIs, since the suffix has no practical limit on length or content.
doi_re = re.compile(r"(?P<meat>10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)")

with open(os.getenv("QUTE_FIFO", ""), "w") as fifo:
    if not text:
        fifo.write('message-warning "Could not find a valid DOI"')
        sys.exit()

    match = doi_re.search(text)
    # Punctuation directly after a DOI in running text is not part of it.
    doi = match["meat"].rstrip(".,;:") if match else ""

    if not doi:
        # The text may be an arbitrary page selection. Collapse whitespace
        # (including newlines) and neutralise double quotes so it stays
        # inside the single quoted message argument written to the FIFO.
        shown = " ".join(text.split()).replace('"', "'")
        fifo.write(
            f"message-warning \"'{shown}' is probably not a DOI, or update regexp\""
        )
    else:
        url = get_scihub_url()
        fifo.write(f"open -t {url}/{doi}")
|