chore: Remove python-magic dependency
python-magic relies on the libmagic system library, which is not necessarily installed everywhere. Most of the functionality we need can be recreated with lighter-weight methods.
This commit is contained in:
parent
17c6fefd89
commit
1a4b5e3a70
5 changed files with 33 additions and 12 deletions
|
|
@ -1,10 +1,10 @@
|
||||||
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
|
||||||
|
import mimetypes
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import NamedTuple, cast
|
from typing import NamedTuple, cast
|
||||||
|
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
import magic
|
|
||||||
import papis.config
|
import papis.config
|
||||||
import papis.logging
|
import papis.logging
|
||||||
import pymupdf as mu
|
import pymupdf as mu
|
||||||
|
|
@ -77,7 +77,7 @@ class PdfExtractor:
|
||||||
|
|
||||||
def _is_pdf(self, fname: Path) -> bool:
|
def _is_pdf(self, fname: Path) -> bool:
|
||||||
"""Check if file is a pdf, using mime type."""
|
"""Check if file is a pdf, using mime type."""
|
||||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
return mimetypes.guess_type(fname)[0] == "application/pdf"
|
||||||
|
|
||||||
def _get_annotation_content(
|
def _get_annotation_content(
|
||||||
self, page: mu.Page, annotation: mu.Annot
|
self, page: mu.Page, annotation: mu.Annot
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
# pyright: strict, reportUnknownMemberType=false
|
# pyright: strict, reportUnknownMemberType=false
|
||||||
|
import mimetypes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import magic
|
|
||||||
import papis.logging
|
import papis.logging
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
@ -12,7 +12,7 @@ logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
class PocketBookExtractor:
|
class PocketBookExtractor:
|
||||||
def can_process(self, filename: Path) -> bool:
|
def can_process(self, filename: Path) -> bool:
|
||||||
if magic.from_file(filename, mime=True) != "text/xml":
|
if not self._is_html(filename):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
content = self._read_file(filename)
|
content = self._read_file(filename)
|
||||||
|
|
@ -28,6 +28,9 @@ class PocketBookExtractor:
|
||||||
logger.debug(f"Found processable annotation file: {filename}")
|
logger.debug(f"Found processable annotation file: {filename}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _is_html(self, filename: Path) -> bool:
|
||||||
|
return mimetypes.guess_type(filename)[0] == "text/html"
|
||||||
|
|
||||||
def run(self, filename: Path) -> list[Annotation]:
|
def run(self, filename: Path) -> list[Annotation]:
|
||||||
"""Extract annotations from pocketbook html file.
|
"""Extract annotations from pocketbook html file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
# pyright: strict, reportUnknownMemberType=false
|
# pyright: strict, reportUnknownMemberType=false
|
||||||
|
import mimetypes
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import magic
|
|
||||||
import papis.logging
|
import papis.logging
|
||||||
|
|
||||||
from papis_extract.annotation import Annotation
|
from papis_extract.annotation import Annotation
|
||||||
|
|
@ -17,7 +17,7 @@ class ReadEraExtractor:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def can_process(self, filename: Path) -> bool:
|
def can_process(self, filename: Path) -> bool:
|
||||||
if magic.from_file(filename, mime=True) != "text/plain":
|
if not self._is_txt(filename):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
content = self._read_file(filename)
|
content = self._read_file(filename)
|
||||||
|
|
@ -36,11 +36,12 @@ class ReadEraExtractor:
|
||||||
if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)):
|
if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(f"Found processable annotation file: {filename}")
|
||||||
f"Found processable annotation file: {filename}"
|
|
||||||
)
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _is_txt(self, filename: Path) -> bool:
|
||||||
|
return mimetypes.guess_type(filename)[0] == "text/plain"
|
||||||
|
|
||||||
def run(self, filename: Path) -> list[Annotation]:
|
def run(self, filename: Path) -> list[Annotation]:
|
||||||
"""Extract annotations from readera txt file.
|
"""Extract annotations from readera txt file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,13 +2,15 @@
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import magic
|
|
||||||
import papis.logging
|
import papis.logging
|
||||||
|
|
||||||
from papis_extract.annotation import Annotation
|
from papis_extract.annotation import Annotation
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
ACCEPTED_EXTENSIONS = [".txt", ".md", ".qmd", ".rmd"]
|
||||||
|
TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
|
||||||
|
|
||||||
|
|
||||||
class ReadestExtractor:
|
class ReadestExtractor:
|
||||||
"""Extracts exported annotations from the FOSS Readest book reading app.
|
"""Extracts exported annotations from the FOSS Readest book reading app.
|
||||||
|
|
@ -17,7 +19,7 @@ class ReadestExtractor:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def can_process(self, filename: Path) -> bool:
|
def can_process(self, filename: Path) -> bool:
|
||||||
if magic.from_file(filename, mime=True) != "text/plain":
|
if not self._is_readable_text(filename):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
content = self._read_file(filename)
|
content = self._read_file(filename)
|
||||||
|
|
@ -33,6 +35,22 @@ class ReadestExtractor:
|
||||||
logger.debug(f"Found processable annotation file: {filename}")
|
logger.debug(f"Found processable annotation file: {filename}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _is_readable_text(self, filename: Path) -> bool:
    """Check whether a file has an accepted text extension and looks like text.

    A file is considered readable text if its extension (compared
    case-insensitively) is in :data:`ACCEPTED_EXTENSIONS` and the first
    1024 bytes consist only of byte values listed in :data:`TEXTCHARS`.
    Only that leading chunk is inspected, so binary data further into the
    file is not detected. An empty file passes the check.

    :returns: A boolean indicating whether the file is a valid text file.
        ``False`` is also returned when the file cannot be opened or read.
    """
    # Lower-case the suffix so that e.g. ``NOTES.TXT`` is not rejected.
    if filename.suffix.lower() not in ACCEPTED_EXTENSIONS:
        return False

    try:
        with filename.open("rb") as handle:
            head = handle.read(1024)
    except OSError:
        # Covers missing files and permission problems as before, and also
        # directories or other unreadable paths, so this probe never raises.
        return False

    # Deleting every known text byte leaves only suspicious bytes behind;
    # anything left over means the chunk is treated as binary.
    return not head.translate(None, TEXTCHARS)
|
||||||
|
|
||||||
def run(self, filename: Path) -> list[Annotation]:
|
def run(self, filename: Path) -> list[Annotation]:
|
||||||
"""Extract annotations from readest txt file.
|
"""Extract annotations from readest txt file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,6 @@ keywords = [
|
||||||
"bibliography",
|
"bibliography",
|
||||||
"reference manager",
|
"reference manager",
|
||||||
"research",
|
"research",
|
||||||
"science",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue