Move all extraction logic into extractor module

The publicly accessible default interface now contains only the command-line interface and a single run function.
2023-08-29 12:40:36 +02:00 · 2023-08-29 12:40:36 +02:00 · e325b89c9b
commit e325b89c9b
parent b564ab4792
2 changed files with 64 additions and 52 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@@ -1,19 +1,14 @@
-from pathlib import Path
-import re
-
 import click
-import fitz_new as fitz
-import magic
 import papis.cli
 import papis.config
 import papis.document
-from papis.document import Document
 import papis.logging
 import papis.notes
 import papis.strings
+from papis.document import Document

 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import Annotation, AnnotatedDocument
+from papis_extract.annotation_data import AnnotatedDocument

 logger = papis.logging.get_logger(__name__)

@@ -76,50 +71,19 @@ def main(
        logger.warning(papis.strings.no_documents_retrieved_message)
        return

-    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
+    run(documents, edit=manual, write=write, git=git)
+
+
+def run(
+    documents: list[Document],
+    edit: bool = False,
+    write: bool = False,
+    git: bool = False,
+) -> None:
+
+    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)

    if write:
-        exporter.to_notes(doc_annotations, edit=manual, git=git)
+        exporter.to_notes(doc_annotations, edit=edit, git=git)
    else:
        exporter.to_stdout(doc_annotations)
-
-    # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
-
-
-def is_pdf(fname: Path) -> bool:
-    return magic.from_file(fname, mime=True) == "application/pdf"
-
-
-def _get_annotations_for_documents(
-    documents: list[Document],
-) -> list[AnnotatedDocument]:
-    output: list[AnnotatedDocument] = []
-    for doc in documents:
-        annotations: list[Annotation] = []
-        found_pdf: bool = False
-        for file in doc.get_files():
-            fname = Path(file)
-            if not _is_file_processable(fname):
-                break
-            found_pdf = True
-
-            try:
-                annotations.extend(extractor.start(fname))
-            except fitz.FileDataError as e:
-                print(f"File structure errors for {file}.\n{e}")
-
-        if not found_pdf:
-            # have to remove curlys or papis logger gets upset
-            desc = re.sub("[{}]", "", papis.document.describe(doc))
-            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
-        output.append(AnnotatedDocument(doc, annotations))
-    return output
-
-
-def _is_file_processable(fname: Path) -> bool:
-    if not fname.is_file():
-        logger.error(f"File {str(fname)} not readable.")
-        return False
-    if not is_pdf(fname):
-        return False
-    return True
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@@ -1,17 +1,51 @@
+import re
 from pathlib import Path
 from typing import Any, Optional

 import Levenshtein
+import magic
 import fitz_new as fitz
 import papis.logging
 import papis.config
+import papis.document
+from papis.document import Document

-from papis_extract.annotation_data import Annotation
+from papis_extract.annotation_data import Annotation, AnnotatedDocument

 logger = papis.logging.get_logger(__name__)

+def start(
+    documents: list[Document],
+) -> list[AnnotatedDocument]:
+    """Extract all annotations from passed documents.

-def start(filename: Path) -> list[Annotation]:
+    Returns all annotations contained in the papis 
+    documents passed in.
+    """
+
+    output: list[AnnotatedDocument] = []
+    for doc in documents:
+        annotations: list[Annotation] = []
+        found_pdf: bool = False
+        for file in doc.get_files():
+            fname = Path(file)
+            if not _is_file_processable(fname):
+                break
+            found_pdf = True
+
+            try:
+                annotations.extend(extract(fname))
+            except fitz.FileDataError as e:
+                print(f"File structure errors for {file}.\n{e}")
+
+        if not found_pdf:
+            # have to remove curlys or papis logger gets upset
+            desc = re.sub("[{}]", "", papis.document.describe(doc))
+            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
+        output.append(AnnotatedDocument(doc, annotations))
+    return output
+
+def extract(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.

    Returns all readable annotations contained in the file
@@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
    return annotations


+def is_pdf(fname: Path) -> bool:
+    return magic.from_file(fname, mime=True) == "application/pdf"
+
+
+
+
+def _is_file_processable(fname: Path) -> bool:
+    if not fname.is_file():
+        logger.error(f"File {str(fname)} not readable.")
+        return False
+    if not is_pdf(fname):
+        return False
+    return True
+
 def _tag_from_colorname(colorname: str) -> str:
    color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
    if not color_mapping: