Move all extraction logic into extractor module

The publicly accessible default interface only contains
the command line command interface and a single run function.
This commit is contained in:
Marty Oehme 2023-08-29 12:40:36 +02:00
parent b564ab4792
commit e325b89c9b
Signed by: Marty
GPG key ID: EDBF2ED917B2EF6A
2 changed files with 64 additions and 52 deletions

View file

@ -1,19 +1,14 @@
from pathlib import Path
import re
import click
import fitz_new as fitz
import magic
import papis.cli
import papis.config
import papis.document
from papis.document import Document
import papis.logging
import papis.notes
import papis.strings
from papis.document import Document
from papis_extract import extractor, exporter
from papis_extract.annotation_data import Annotation, AnnotatedDocument
from papis_extract.annotation_data import AnnotatedDocument
logger = papis.logging.get_logger(__name__)
@ -76,50 +71,19 @@ def main(
logger.warning(papis.strings.no_documents_retrieved_message)
return
doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
run(documents, edit=manual, write=write, git=git)
def run(
    documents: list[Document],
    edit: bool = False,
    write: bool = False,
    git: bool = False,
) -> None:
    """Extract annotations from the given documents and export them.

    Annotations are gathered with ``extractor.start``. When ``write`` is
    true they are handed to ``exporter.to_notes`` (with ``edit`` and ``git``
    forwarded unchanged); otherwise they are handed to
    ``exporter.to_stdout``.
    """
    doc_annotations: list[AnnotatedDocument] = extractor.start(documents)

    if write:
        # Only the ``edit`` parameter exists in this scope; the former
        # ``manual`` name belongs to the command line entry point.
        exporter.to_notes(doc_annotations, edit=edit, git=git)
    else:
        exporter.to_stdout(doc_annotations)
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
def is_pdf(fname: Path) -> bool:
    """Return whether the MIME type detected for *fname* is a PDF."""
    detected_mime = magic.from_file(fname, mime=True)
    return detected_mime == "application/pdf"
def _get_annotations_for_documents(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Collect the annotations of every passed document.

    Each file attached to a document is checked; files that are not
    processable are skipped and the annotations of all remaining files are
    gathered. A document without any processable file still produces an
    AnnotatedDocument (with no annotations) and a warning is logged.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip only this file: a later attachment of the same
                # document may still be a valid PDF.
                continue
            found_pdf = True

            try:
                annotations.extend(extractor.start(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")

        output.append(AnnotatedDocument(doc, annotations))
    return output
def _is_file_processable(fname: Path) -> bool:
    """Return True only for an existing regular file that is a PDF.

    A path that is not a regular file is reported through the logger.
    """
    if fname.is_file():
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False

View file

@ -1,17 +1,51 @@
import re
from pathlib import Path
from typing import Any, Optional
import Levenshtein
import magic
import fitz_new as fitz
import papis.logging
import papis.config
import papis.document
from papis.document import Document
from papis_extract.annotation_data import Annotation
from papis_extract.annotation_data import Annotation, AnnotatedDocument
logger = papis.logging.get_logger(__name__)
def start(
    documents: list[Document],
) -> list[AnnotatedDocument]:
    """Extract all annotations from passed documents.

    Every file attached to a document is checked; files that are not
    processable are skipped and the annotations of all remaining files are
    collected. A document without any processable file still yields an
    AnnotatedDocument (with no annotations) and a warning is logged.
    ``fitz.FileDataError`` raised during extraction is caught and reported
    on standard output.

    Returns all annotations contained in the papis
    documents passed in.
    """
    output: list[AnnotatedDocument] = []
    for doc in documents:
        annotations: list[Annotation] = []
        found_pdf: bool = False
        for file in doc.get_files():
            fname = Path(file)
            if not _is_file_processable(fname):
                # Skip only this file: a later attachment of the same
                # document may still be a valid PDF.
                continue
            found_pdf = True

            try:
                annotations.extend(extract(fname))
            except fitz.FileDataError as e:
                print(f"File structure errors for {file}.\n{e}")

        if not found_pdf:
            # have to remove curlys or papis logger gets upset
            desc = re.sub("[{}]", "", papis.document.describe(doc))
            logger.warning("Did not find suitable PDF file for document: " f"{desc}")

        output.append(AnnotatedDocument(doc, annotations))
    return output
def extract(filename: Path) -> list[Annotation]:
"""Extract annotations from a file.
Returns all readable annotations contained in the file
@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
return annotations
def is_pdf(fname: Path) -> bool:
    """Tell whether the file's detected MIME type is ``application/pdf``."""
    mime_type = magic.from_file(fname, mime=True)
    return mime_type == "application/pdf"
def _is_file_processable(fname: Path) -> bool:
    """Check that *fname* is a regular file and that it is a PDF.

    Logs an error when the path is not a regular file.
    """
    if fname.is_file():
        # The file exists, so the outcome depends only on the PDF check.
        return is_pdf(fname)

    logger.error(f"File {str(fname)} not readable.")
    return False
def _tag_from_colorname(colorname: str) -> str:
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
if not color_mapping: