Move all extraction logic into extractor module

The publicly accessible default interface only contains
the command line command interface and a single run function.
This commit is contained in:
Marty Oehme 2023-08-29 12:40:36 +02:00
parent b564ab4792
commit e325b89c9b
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 64 additions and 52 deletions

View file

@ -1,19 +1,14 @@
from pathlib import Path
import re
import click import click
import fitz_new as fitz
import magic
import papis.cli import papis.cli
import papis.config import papis.config
import papis.document import papis.document
from papis.document import Document
import papis.logging import papis.logging
import papis.notes import papis.notes
import papis.strings import papis.strings
from papis.document import Document
from papis_extract import extractor, exporter from papis_extract import extractor, exporter
from papis_extract.annotation_data import Annotation, AnnotatedDocument from papis_extract.annotation_data import AnnotatedDocument
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
@ -76,50 +71,19 @@ def main(
logger.warning(papis.strings.no_documents_retrieved_message) logger.warning(papis.strings.no_documents_retrieved_message)
return return
doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents) run(documents, edit=manual, write=write, git=git)
def run(
    documents: list[Document],
    edit: bool = False,
    write: bool = False,
    git: bool = False,
) -> None:
    """Extract annotations from the documents and export them.

    The annotations are gathered through ``extractor.start``. When
    ``write`` is set they are handed to ``exporter.to_notes`` together
    with the ``edit`` and ``git`` flags; otherwise they are passed to
    ``exporter.to_stdout``.
    """
    annotated: list[AnnotatedDocument] = extractor.start(documents)

    if not write:
        exporter.to_stdout(annotated)
        return

    exporter.to_notes(annotated, edit=edit, git=git)
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
def is_pdf(fname: Path) -> bool:
    """Return True if ``magic`` reports the MIME type ``application/pdf``."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every processable file of each document.

    For each document, all of its files are inspected. Files rejected by
    ``_is_file_processable`` are skipped; the remaining ones are passed
    to ``extractor.start``. A warning is logged for documents without
    any processable file.

    Returns one ``AnnotatedDocument`` per input document, in input
    order; its annotation list is empty if nothing could be extracted.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip just this file: a later file of the same document
                # may still be a usable PDF, so do not abort the loop.
                continue
            found_pdf = True

            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that ``is_pdf`` accepts.

    A path that is not a file is reported through the module logger; a
    file that is not a PDF is rejected without a message.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False

View file

@ -1,17 +1,51 @@
import re
from pathlib import Path from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import Levenshtein import Levenshtein
import magic
import fitz_new as fitz import fitz_new as fitz
import papis.logging import papis.logging
import papis.config import papis.config
import papis.document
from papis.document import Document
from papis_extract.annotation_data import Annotation from papis_extract.annotation_data import Annotation, AnnotatedDocument
logger = papis.logging.get_logger(__name__) logger = papis.logging.get_logger(__name__)
def start(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.

    Returns all annotations contained in the papis
    documents passed in.

    Every file of a document is inspected. Files rejected by
    ``_is_file_processable`` are skipped; the remaining ones are passed
    to ``extract``. A warning is logged for documents without any
    processable file. The result holds one ``AnnotatedDocument`` per
    input document, in input order.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip just this file: a later file of the same document
                # may still be a usable PDF, so do not abort the loop.
                continue
            found_pdf = True

            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")
        output.append(AnnotatedDocument(doc, annotations))
    return output
def extract(filename: Path) -> list[Annotation]:
"""Extract annotations from a file. """Extract annotations from a file.
Returns all readable annotations contained in the file Returns all readable annotations contained in the file
@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
return annotations return annotations
def is_pdf(fname: Path) -> bool:
    """Return True if ``magic`` reports the MIME type ``application/pdf``."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that ``is_pdf`` accepts.

    A path that is not a file is reported through the module logger; a
    file that is not a PDF is rejected without a message.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False
def _tag_from_colorname(colorname: str) -> str: def _tag_from_colorname(colorname: str) -> str:
color_mapping: dict[str, str] = getdict("tags", "plugins.extract") color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
if not color_mapping: if not color_mapping: