Move all extraction logic into extractor module

The publicly accessible default interface contains only the command-line command interface and a single run function.
2023-08-29 12:40:36 +02:00 · 2023-08-29 12:40:36 +02:00 · e325b89c9b
commit e325b89c9b
parent b564ab4792
2 changed files with 64 additions and 52 deletions
--- a/papis_extract/init.py
+++ b/papis_extract/init.py
@ -1,19 +1,14 @@
 from pathlib import Path
 import re
 import click
 import fitz_new as fitz
 import magic
 import papis.cli
 import papis.config
 import papis.document
 from papis.document import Document
 import papis.logging
 import papis.notes
 import papis.strings
 from papis.document import Document
 from papis_extract import extractor, exporter
-from papis_extract.annotation_data import Annotation, AnnotatedDocument
+from papis_extract.annotation_data import AnnotatedDocument
 logger = papis.logging.get_logger(__name__)
@ -76,50 +71,19 @@ def main(
        logger.warning(papis.strings.no_documents_retrieved_message)
        return
-    doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
+    run(documents, edit=manual, write=write, git=git)
 def run(
    documents: list[Document],
    edit: bool = False,
    write: bool = False,
    git: bool = False,
 ) -> None:
    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
    if write:
-        exporter.to_notes(doc_annotations, edit=manual, git=git)
+        exporter.to_notes(doc_annotations, edit=edit, git=git)
    else:
        exporter.to_stdout(doc_annotations)
    # note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
 def is_pdf(fname: Path) -> bool:
    return magic.from_file(fname, mime=True) == "application/pdf"
 def _get_annotations_for_documents(
    documents: list[Document],
 ) -> list[AnnotatedDocument]:
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                break
            found_pdf = True
            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
 def _is_file_processable(fname: Path) -> bool:
    if not fname.is_file():
        logger.error(f"File {str(fname)} not readable.")
        return False
    if not is_pdf(fname):
        return False
    return True
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@ -1,17 +1,51 @@
 import re
 from pathlib import Path
 from typing import Any, Optional
 import Levenshtein
 import magic
 import fitz_new as fitz
 import papis.logging
 import papis.config
 import papis.document
 from papis.document import Document
-from papis_extract.annotation_data import Annotation
+from papis_extract.annotation_data import Annotation, AnnotatedDocument
 logger = papis.logging.get_logger(__name__)
 def start(
    documents: list[Document],
 ) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.
-def start(filename: Path) -> list[Annotation]:
+    Returns all annotations contained in the papis 
    documents passed in.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                break
            found_pdf = True
            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")
        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
 def extract(filename: Path) -> list[Annotation]:
    """Extract annotations from a file.
    Returns all readable annotations contained in the file
@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
    return annotations
 def is_pdf(fname: Path) -> bool:
    return magic.from_file(fname, mime=True) == "application/pdf"
 def _is_file_processable(fname: Path) -> bool:
    if not fname.is_file():
        logger.error(f"File {str(fname)} not readable.")
        return False
    if not is_pdf(fname):
        return False
    return True
 def _tag_from_colorname(colorname: str) -> str:
    color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
    if not color_mapping: