chore: Remove python-magic dependency

It relies on the libmagic C library, which is not necessarily installed everywhere. Most of the functionality we need can be recreated with lighter-weight methods from the standard library.
2025-09-12 09:47:41 +02:00 · 2025-09-12 09:47:41 +02:00 · 7459fbeb0b
commit 7459fbeb0b
parent 30bc8452fa
5 changed files with 33 additions and 12 deletions
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@ -1,10 +1,10 @@
 # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false
+import mimetypes
 from collections.abc import Generator
 from pathlib import Path
 from typing import NamedTuple, cast

 import Levenshtein
-import magic
 import papis.config
 import papis.logging
 import pymupdf as mu
@ -77,7 +77,7 @@ class PdfExtractor:

    def _is_pdf(self, fname: Path) -> bool:
        """Check if file is a pdf, using mime type."""
-        return magic.from_file(fname, mime=True) == "application/pdf"
+        return mimetypes.guess_type(fname)[0] == "application/pdf"

    def _get_annotation_content(
        self, page: mu.Page, annotation: mu.Annot
--- a/papis_extract/extractors/pocketbook.py
+++ b/papis_extract/extractors/pocketbook.py
@ -1,7 +1,7 @@
 # pyright: strict, reportUnknownMemberType=false
+import mimetypes
 from pathlib import Path

-import magic
 import papis.logging
 from bs4 import BeautifulSoup

@ -12,7 +12,7 @@ logger = papis.logging.get_logger(__name__)

 class PocketBookExtractor:
    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/xml":
+        if not self._is_html(filename):
            return False

        content = self._read_file(filename)
@ -28,6 +28,9 @@ class PocketBookExtractor:
        logger.debug(f"Found processable annotation file: {filename}")
        return True

+    def _is_html(self, filename: Path) -> bool:
+        return mimetypes.guess_type(filename)[0] == "text/html"
+
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from pocketbook html file.

--- a/papis_extract/extractors/readera.py
+++ b/papis_extract/extractors/readera.py
@ -1,8 +1,8 @@
 # pyright: strict, reportUnknownMemberType=false
+import mimetypes
 import re
 from pathlib import Path

-import magic
 import papis.logging

 from papis_extract.annotation import Annotation
@ -17,7 +17,7 @@ class ReadEraExtractor:
    """

    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/plain":
+        if not self._is_txt(filename):
            return False

        content = self._read_file(filename)
@ -36,11 +36,12 @@ class ReadEraExtractor:
        if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)):
            return False

-        logger.debug(
-            f"Found processable annotation file: {filename}"
-        )
+        logger.debug(f"Found processable annotation file: {filename}")
        return True

+    def _is_txt(self, filename: Path) -> bool:
+        return mimetypes.guess_type(filename)[0] == "text/plain"
+
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from readera txt file.

--- a/papis_extract/extractors/readest.py
+++ b/papis_extract/extractors/readest.py
@ -2,13 +2,15 @@
 import re
 from pathlib import Path

-import magic
 import papis.logging

 from papis_extract.annotation import Annotation

 logger = papis.logging.get_logger(__name__)

+ACCEPTED_EXTENSIONS = [".txt", ".md", ".qmd", ".rmd"]
+TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
+

 class ReadestExtractor:
    """Extracts exported annotations from the FOSS Readest book reading app.
@ -17,7 +19,7 @@ class ReadestExtractor:
    """

    def can_process(self, filename: Path) -> bool:
-        if magic.from_file(filename, mime=True) != "text/plain":
+        if not self._is_readable_text(filename):
            return False

        content = self._read_file(filename)
@ -33,6 +35,22 @@ class ReadestExtractor:
        logger.debug(f"Found processable annotation file: {filename}")
        return True

+    def _is_readable_text(self, filename: Path) -> bool:
+        """Checks whether a file has a valid text extension and is not a binary file.
+
+        A file is considered a valid text file if its extension is in
+        :data:`ACCEPTED_EXTENSIONS` and its first 1024 bytes contain no non-text bytes.
+
+        :returns: A boolean indicating whether the file is a valid text file.
+        """
+        if filename.suffix not in ACCEPTED_EXTENSIONS:
+            return False
+        try:
+            with filename.open("rb") as rb:
+                return not bool(rb.read(1024).translate(None, TEXTCHARS))
+        except (FileNotFoundError, PermissionError):
+            return False
+
    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from readest txt file.

--- a/pyproject.toml
+++ b/pyproject.toml
@ -22,7 +22,6 @@ keywords = [
    "bibliography",
    "reference manager",
    "research",
-    "science",
 ]

 [project.urls]