chore: Remove python-magic dependency

python-magic relies on the libmagic library, which is not necessarily
installed everywhere. Most of the functionality that we need for our
purposes can be recreated with lighter-weight methods.
This commit is contained in:
Marty Oehme 2025-09-12 09:47:41 +02:00
parent 17c6fefd89
commit 1a4b5e3a70
Signed by: Marty
GPG key ID: 4E535BC19C61886E
5 changed files with 33 additions and 12 deletions

View file

@ -1,10 +1,10 @@
# pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
import mimetypes
from collections.abc import Generator from collections.abc import Generator
from pathlib import Path from pathlib import Path
from typing import NamedTuple, cast from typing import NamedTuple, cast
import Levenshtein import Levenshtein
import magic
import papis.config import papis.config
import papis.logging import papis.logging
import pymupdf as mu import pymupdf as mu
@ -77,7 +77,7 @@ class PdfExtractor:
def _is_pdf(self, fname: Path) -> bool: def _is_pdf(self, fname: Path) -> bool:
"""Check if file is a pdf, using mime type.""" """Check if file is a pdf, using mime type."""
return magic.from_file(fname, mime=True) == "application/pdf" return mimetypes.guess_type(fname)[0] == "application/pdf"
def _get_annotation_content( def _get_annotation_content(
self, page: mu.Page, annotation: mu.Annot self, page: mu.Page, annotation: mu.Annot

View file

@ -1,7 +1,7 @@
# pyright: strict, reportUnknownMemberType=false # pyright: strict, reportUnknownMemberType=false
import mimetypes
from pathlib import Path from pathlib import Path
import magic
import papis.logging import papis.logging
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -12,7 +12,7 @@ logger = papis.logging.get_logger(__name__)
class PocketBookExtractor: class PocketBookExtractor:
def can_process(self, filename: Path) -> bool: def can_process(self, filename: Path) -> bool:
if magic.from_file(filename, mime=True) != "text/xml": if not self._is_html(filename):
return False return False
content = self._read_file(filename) content = self._read_file(filename)
@ -28,6 +28,9 @@ class PocketBookExtractor:
logger.debug(f"Found processable annotation file: {filename}") logger.debug(f"Found processable annotation file: {filename}")
return True return True
def _is_html(self, filename: Path) -> bool:
return mimetypes.guess_type(filename)[0] == "text/html"
def run(self, filename: Path) -> list[Annotation]: def run(self, filename: Path) -> list[Annotation]:
"""Extract annotations from pocketbook html file. """Extract annotations from pocketbook html file.

View file

@ -1,8 +1,8 @@
# pyright: strict, reportUnknownMemberType=false # pyright: strict, reportUnknownMemberType=false
import mimetypes
import re import re
from pathlib import Path from pathlib import Path
import magic
import papis.logging import papis.logging
from papis_extract.annotation import Annotation from papis_extract.annotation import Annotation
@ -17,7 +17,7 @@ class ReadEraExtractor:
""" """
def can_process(self, filename: Path) -> bool: def can_process(self, filename: Path) -> bool:
if magic.from_file(filename, mime=True) != "text/plain": if not self._is_txt(filename):
return False return False
content = self._read_file(filename) content = self._read_file(filename)
@ -36,11 +36,12 @@ class ReadEraExtractor:
if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)): if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)):
return False return False
logger.debug( logger.debug(f"Found processable annotation file: {filename}")
f"Found processable annotation file: {filename}"
)
return True return True
def _is_txt(self, filename: Path) -> bool:
return mimetypes.guess_type(filename)[0] == "text/plain"
def run(self, filename: Path) -> list[Annotation]: def run(self, filename: Path) -> list[Annotation]:
"""Extract annotations from readera txt file. """Extract annotations from readera txt file.

View file

@ -2,13 +2,15 @@
import re import re
from pathlib import Path from pathlib import Path
import magic
import papis.logging import papis.logging
from papis_extract.annotation import Annotation from papis_extract.annotation import Annotation
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
# File suffixes (including the leading dot) that are accepted as text files.
ACCEPTED_EXTENSIONS = [".txt", ".md", ".qmd", ".rmd"]
# Byte values treated as text: the control bytes 7 (BEL), 8 (BS), 9 (TAB),
# 10 (LF), 12 (FF), 13 (CR) and 27 (ESC), plus every byte in 0x20-0xFF except
# 0x7F (DEL). ``-`` binds tighter than ``|``, so 0x7F is removed from the
# range before the union with the control bytes is taken.
TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
class ReadestExtractor: class ReadestExtractor:
"""Extracts exported annotations from the FOSS Readest book reading app. """Extracts exported annotations from the FOSS Readest book reading app.
@ -17,7 +19,7 @@ class ReadestExtractor:
""" """
def can_process(self, filename: Path) -> bool: def can_process(self, filename: Path) -> bool:
if magic.from_file(filename, mime=True) != "text/plain": if not self._is_readable_text(filename):
return False return False
content = self._read_file(filename) content = self._read_file(filename)
@ -33,6 +35,22 @@ class ReadestExtractor:
logger.debug(f"Found processable annotation file: {filename}") logger.debug(f"Found processable annotation file: {filename}")
return True return True
def _is_readable_text(self, filename: Path) -> bool:
    """Check whether a file has an accepted text extension and is not binary.

    A file is considered a valid text file if its extension (compared
    case-insensitively) is in :data:`ACCEPTED_EXTENSIONS` and its first
    1024 bytes consist solely of bytes listed in :data:`TEXTCHARS`. An
    empty file therefore counts as text.

    :param filename: Path of the file to inspect.
    :returns: ``True`` if the file looks like readable text; ``False`` if
        the extension is not accepted, a non-text byte is found, or the
        file cannot be opened or read.
    """
    # Lower-case the suffix so that e.g. ``NOTES.TXT`` is accepted as well.
    if filename.suffix.lower() not in ACCEPTED_EXTENSIONS:
        return False

    try:
        with filename.open("rb") as handle:
            head = handle.read(1024)
    except OSError:
        # Covers missing files and permission problems as before, and also
        # directories and other I/O failures: none of them are processable,
        # so report that instead of letting the error escape.
        return False

    # Deleting every known text byte leaves only the non-text ones behind.
    return not head.translate(None, TEXTCHARS)
def run(self, filename: Path) -> list[Annotation]: def run(self, filename: Path) -> list[Annotation]:
"""Extract annotations from readest txt file. """Extract annotations from readest txt file.

View file

@ -22,7 +22,6 @@ keywords = [
"bibliography", "bibliography",
"reference manager", "reference manager",
"research", "research",
"science",
] ]
[project.urls] [project.urls]