chore: Remove python-magic dependency

python-magic relies on the libmagic system library, which is not necessarily installed everywhere. Most of the functionality that we need for our purposes can be recreated with lighter-weight methods.
2025-09-12 09:47:41 +02:00 · 2025-09-12 09:47:41 +02:00 · 1a4b5e3a70
commit 1a4b5e3a70
parent 17c6fefd89
5 changed files with 33 additions and 12 deletions
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -1,10 +1,10 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
 import mimetypes
 from collections.abc import Generator
 from pathlib import Path
 from typing import NamedTuple, cast
 import Levenshtein
 import magic
 import papis.config
 import papis.logging
 import pymupdf as mu
@ -77,7 +77,7 @@ class PdfExtractor:
    def _is_pdf(self, fname: Path) -> bool:
        """Check if file is a pdf, using mime type."""
-        return magic.from_file(fname, mime=True) == "application/pdf"
+        return mimetypes.guess_type(fname)[0] == "application/pdf"
    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
--- a/papis_extract/extractors/pocketbook.py
+++ b/papis_extract/extractors/pocketbook.py
@ -1,7 +1,7 @@
 # pyright: strict, reportUnknownMemberType=false
 import mimetypes
 from pathlib import Path
 import magic
 import papis.logging
 from bs4 import BeautifulSoup
@ -12,7 +12,7 @@ logger = papis.logging.get_logger(__name__)
 class PocketBookExtractor:
    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/xml":
+        if not self._is_html(filename):
            return False
        content = self._read_file(filename)
@ -28,6 +28,9 @@ class PocketBookExtractor:
        logger.debug(f"Found processable annotation file: {filename}")
        return True
    def _is_html(self, filename: Path) -> bool:
        return mimetypes.guess_type(filename)[0] == "text/html"
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from pocketbook html file.
--- a/papis_extract/extractors/readera.py
+++ b/papis_extract/extractors/readera.py
@ -1,8 +1,8 @@
 # pyright: strict, reportUnknownMemberType=false
 import mimetypes
 import re
 from pathlib import Path
 import magic
 import papis.logging
 from papis_extract.annotation import Annotation
@ -17,7 +17,7 @@ class ReadEraExtractor:
    """
    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/plain":
+        if not self._is_txt(filename):
            return False
        content = self._read_file(filename)
@ -36,11 +36,12 @@ class ReadEraExtractor:
        if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)):
            return False
-        logger.debug(
-            f"Found processable annotation file: {filename}"
-        )
+        logger.debug(f"Found processable annotation file: {filename}")
        return True
    def _is_txt(self, filename: Path) -> bool:
        return mimetypes.guess_type(filename)[0] == "text/plain"
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from readera txt file.
--- a/papis_extract/extractors/readest.py
+++ b/papis_extract/extractors/readest.py
@ -2,13 +2,15 @@
 import re
 from pathlib import Path
 import magic
 import papis.logging
 from papis_extract.annotation import Annotation
 logger = papis.logging.get_logger(__name__)
 ACCEPTED_EXTENSIONS = [".txt", ".md", ".qmd", ".rmd"]
 TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
 class ReadestExtractor:
    """Extracts exported annotations from the FOSS Readest book reading app.
@ -17,7 +19,7 @@ class ReadestExtractor:
    """
    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/plain":
+        if not self._is_readable_text(filename):
            return False
        content = self._read_file(filename)
@ -33,6 +35,22 @@ class ReadestExtractor:
        logger.debug(f"Found processable annotation file: {filename}")
        return True
    def _is_readable_text(self, filename: Path) -> bool:
        """Check whether a file has an accepted text extension and is not binary.

        A file is considered readable text if its extension (compared
        case-insensitively, so ``.Rmd`` matches ``.rmd``) is in
        :data:`ACCEPTED_EXTENSIONS` and its first 1024 bytes contain no byte
        outside :data:`TEXTCHARS`.

        :param filename: Path of the file to inspect.
        :returns: ``True`` if the file looks like readable text, ``False``
            otherwise, including when the file cannot be opened or read.
        """
        if filename.suffix.lower() not in ACCEPTED_EXTENSIONS:
            return False

        try:
            with filename.open("rb") as handle:
                head = handle.read(1024)
        except OSError:
            # Missing file, no permission, a directory with a text-like name,
            # ...: a predicate should answer "no" rather than raise.
            return False

        # Deleting every known text byte leaves only the non-text bytes.
        return not head.translate(None, TEXTCHARS)
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from readest txt file.
--- a/pyproject.toml
+++ b/pyproject.toml
@ -22,7 +22,6 @@ keywords = [
    "bibliography",
    "reference manager",
    "research",
    "science",
 ]
 [project.urls]