diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py index 3ba9873..d161229 100644 --- a/papis_extract/extractors/pdf.py +++ b/papis_extract/extractors/pdf.py @@ -1,10 +1,10 @@ # pyright: strict, reportMissingTypeStubs=false, reportUnknownMemberType=false +import mimetypes from collections.abc import Generator from pathlib import Path from typing import NamedTuple, cast import Levenshtein -import magic import papis.config import papis.logging import pymupdf as mu @@ -77,7 +77,7 @@ class PdfExtractor: def _is_pdf(self, fname: Path) -> bool: """Check if file is a pdf, using mime type.""" - return magic.from_file(fname, mime=True) == "application/pdf" + return mimetypes.guess_type(fname)[0] == "application/pdf" def _get_annotation_content( self, page: mu.Page, annotation: mu.Annot diff --git a/papis_extract/extractors/pocketbook.py b/papis_extract/extractors/pocketbook.py index 5fc0585..928aad1 100644 --- a/papis_extract/extractors/pocketbook.py +++ b/papis_extract/extractors/pocketbook.py @@ -1,7 +1,7 @@ # pyright: strict, reportUnknownMemberType=false +import mimetypes from pathlib import Path -import magic import papis.logging from bs4 import BeautifulSoup @@ -12,7 +12,7 @@ logger = papis.logging.get_logger(__name__) class PocketBookExtractor: def can_process(self, filename: Path) -> bool: - if magic.from_file(filename, mime=True) != "text/xml": + if not self._is_html(filename): return False content = self._read_file(filename) @@ -28,6 +28,9 @@ class PocketBookExtractor: logger.debug(f"Found processable annotation file: {filename}") return True + def _is_html(self, filename: Path) -> bool: + return mimetypes.guess_type(filename)[0] == "text/html" + def run(self, filename: Path) -> list[Annotation]: """Extract annotations from pocketbook html file. diff --git a/papis_extract/extractors/readera.py b/papis_extract/extractors/readera.py index d992512..c8905fe 100644 --- a/papis_extract/extractors/readera.py +++ b/papis_extract/extractors/readera.py @@ -1,8 +1,8 @@ # pyright: strict, reportUnknownMemberType=false +import mimetypes import re from pathlib import Path -import magic import papis.logging from papis_extract.annotation import Annotation @@ -17,7 +17,7 @@ class ReadEraExtractor: """ def can_process(self, filename: Path) -> bool: - if magic.from_file(filename, mime=True) != "text/plain": + if not self._is_txt(filename): return False content = self._read_file(filename) @@ -36,11 +36,12 @@ class ReadEraExtractor: if not re.search(r"\n\*\*\*\*\*\n\n$", "".join(content)): return False - logger.debug( - f"Found processable annotation file: {filename}" - ) + logger.debug(f"Found processable annotation file: {filename}") return True + def _is_txt(self, filename: Path) -> bool: + return mimetypes.guess_type(filename)[0] == "text/plain" + def run(self, filename: Path) -> list[Annotation]: """Extract annotations from readera txt file. diff --git a/papis_extract/extractors/readest.py b/papis_extract/extractors/readest.py index 1d0df08..f387721 100644 --- a/papis_extract/extractors/readest.py +++ b/papis_extract/extractors/readest.py @@ -2,13 +2,15 @@ import re from pathlib import Path -import magic import papis.logging from papis_extract.annotation import Annotation logger = papis.logging.get_logger(__name__) +ACCEPTED_EXTENSIONS = [".txt", ".md", ".qmd", ".rmd"] +TEXTCHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}) + class ReadestExtractor: """Extracts exported annotations from the FOSS Readest book reading app. @@ -17,7 +19,7 @@ class ReadestExtractor: """ def can_process(self, filename: Path) -> bool: - if magic.from_file(filename, mime=True) != "text/plain": + if not self._is_readable_text(filename): return False content = self._read_file(filename) @@ -33,6 +35,22 @@ class ReadestExtractor: logger.debug(f"Found processable annotation file: {filename}") return True + def _is_readable_text(self, filename: Path) -> bool: + """Checks whether a file has a valid text extension and is not a binary file. + + A file is considered a valid text file if its extension is in + :data:`ACCEPTED_EXTENSIONS` and does not contain any non-text characters. + + :returns: A boolean indicating whether the file is a valid text file. + """ + if filename.suffix not in ACCEPTED_EXTENSIONS: + return False + try: + with filename.open("rb") as rb: + return not bool(rb.read(1024).translate(None, TEXTCHARS)) + except (FileNotFoundError, PermissionError): + return False + def run(self, filename: Path) -> list[Annotation]: """Extract annotations from readest txt file. diff --git a/pyproject.toml b/pyproject.toml index f9fc3c1..bf39ddf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ keywords = [ "bibliography", "reference manager", "research", - "science", ] [project.urls]