Move all extraction logic into extractor module
The publicly accessible default interface contains only the command-line interface and a single run function.
This commit is contained in:
parent
b564ab4792
commit
e325b89c9b
2 changed files with 64 additions and 52 deletions
|
@ -1,19 +1,14 @@
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import fitz_new as fitz
|
|
||||||
import magic
|
|
||||||
import papis.cli
|
import papis.cli
|
||||||
import papis.config
|
import papis.config
|
||||||
import papis.document
|
import papis.document
|
||||||
from papis.document import Document
|
|
||||||
import papis.logging
|
import papis.logging
|
||||||
import papis.notes
|
import papis.notes
|
||||||
import papis.strings
|
import papis.strings
|
||||||
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract import extractor, exporter
|
from papis_extract import extractor, exporter
|
||||||
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
from papis_extract.annotation_data import AnnotatedDocument
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
@ -76,50 +71,19 @@ def main(
|
||||||
logger.warning(papis.strings.no_documents_retrieved_message)
|
logger.warning(papis.strings.no_documents_retrieved_message)
|
||||||
return
|
return
|
||||||
|
|
||||||
doc_annotations: list[AnnotatedDocument] = _get_annotations_for_documents(documents)
|
run(documents, edit=manual, write=write, git=git)
|
||||||
|
|
||||||
|
|
||||||
|
def run(
|
||||||
|
documents: list[Document],
|
||||||
|
edit: bool = False,
|
||||||
|
write: bool = False,
|
||||||
|
git: bool = False,
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
doc_annotations: list[AnnotatedDocument] = extractor.start(documents)
|
||||||
|
|
||||||
if write:
|
if write:
|
||||||
exporter.to_notes(doc_annotations, edit=manual, git=git)
|
exporter.to_notes(doc_annotations, edit=edit, git=git)
|
||||||
else:
|
else:
|
||||||
exporter.to_stdout(doc_annotations)
|
exporter.to_stdout(doc_annotations)
|
||||||
|
|
||||||
# note_file: Path = Path(papis.notes.notes_path_ensured(documents[0]))
|
|
||||||
|
|
||||||
|
|
||||||
def is_pdf(fname: Path) -> bool:
|
|
||||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
|
||||||
|
|
||||||
|
|
||||||
def _get_annotations_for_documents(
|
|
||||||
documents: list[Document],
|
|
||||||
) -> list[AnnotatedDocument]:
|
|
||||||
output: list[AnnotatedDocument] = []
|
|
||||||
for doc in documents:
|
|
||||||
annotations: list[Annotation] = []
|
|
||||||
found_pdf: bool = False
|
|
||||||
for file in doc.get_files():
|
|
||||||
fname = Path(file)
|
|
||||||
if not _is_file_processable(fname):
|
|
||||||
break
|
|
||||||
found_pdf = True
|
|
||||||
|
|
||||||
try:
|
|
||||||
annotations.extend(extractor.start(fname))
|
|
||||||
except fitz.FileDataError as e:
|
|
||||||
print(f"File structure errors for {file}.\n{e}")
|
|
||||||
|
|
||||||
if not found_pdf:
|
|
||||||
# have to remove curlys or papis logger gets upset
|
|
||||||
desc = re.sub("[{}]", "", papis.document.describe(doc))
|
|
||||||
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
|
|
||||||
output.append(AnnotatedDocument(doc, annotations))
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def _is_file_processable(fname: Path) -> bool:
|
|
||||||
if not fname.is_file():
|
|
||||||
logger.error(f"File {str(fname)} not readable.")
|
|
||||||
return False
|
|
||||||
if not is_pdf(fname):
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
|
@ -1,17 +1,51 @@
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
|
import magic
|
||||||
import fitz_new as fitz
|
import fitz_new as fitz
|
||||||
import papis.logging
|
import papis.logging
|
||||||
import papis.config
|
import papis.config
|
||||||
|
import papis.document
|
||||||
|
from papis.document import Document
|
||||||
|
|
||||||
from papis_extract.annotation_data import Annotation
|
from papis_extract.annotation_data import Annotation, AnnotatedDocument
|
||||||
|
|
||||||
logger = papis.logging.get_logger(__name__)
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
def start(
|
||||||
|
documents: list[Document],
|
||||||
|
) -> list[AnnotatedDocument]:
|
||||||
|
"""Extract all annotations from passed documents.
|
||||||
|
|
||||||
def start(filename: Path) -> list[Annotation]:
|
Returns all annotations contained in the papis
|
||||||
|
documents passed in.
|
||||||
|
"""
|
||||||
|
|
||||||
|
output: list[AnnotatedDocument] = []
|
||||||
|
for doc in documents:
|
||||||
|
annotations: list[Annotation] = []
|
||||||
|
found_pdf: bool = False
|
||||||
|
for file in doc.get_files():
|
||||||
|
fname = Path(file)
|
||||||
|
if not _is_file_processable(fname):
|
||||||
|
break
|
||||||
|
found_pdf = True
|
||||||
|
|
||||||
|
try:
|
||||||
|
annotations.extend(extract(fname))
|
||||||
|
except fitz.FileDataError as e:
|
||||||
|
print(f"File structure errors for {file}.\n{e}")
|
||||||
|
|
||||||
|
if not found_pdf:
|
||||||
|
# have to remove curlys or papis logger gets upset
|
||||||
|
desc = re.sub("[{}]", "", papis.document.describe(doc))
|
||||||
|
logger.warning("Did not find suitable PDF file for document: " f"{desc}")
|
||||||
|
output.append(AnnotatedDocument(doc, annotations))
|
||||||
|
return output
|
||||||
|
|
||||||
|
def extract(filename: Path) -> list[Annotation]:
|
||||||
"""Extract annotations from a file.
|
"""Extract annotations from a file.
|
||||||
|
|
||||||
Returns all readable annotations contained in the file
|
Returns all readable annotations contained in the file
|
||||||
|
@ -41,6 +75,20 @@ def start(filename: Path) -> list[Annotation]:
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
|
|
||||||
|
def is_pdf(fname: Path) -> bool:
|
||||||
|
return magic.from_file(fname, mime=True) == "application/pdf"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _is_file_processable(fname: Path) -> bool:
|
||||||
|
if not fname.is_file():
|
||||||
|
logger.error(f"File {str(fname)} not readable.")
|
||||||
|
return False
|
||||||
|
if not is_pdf(fname):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
def _tag_from_colorname(colorname: str) -> str:
|
def _tag_from_colorname(colorname: str) -> str:
|
||||||
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
|
color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
|
||||||
if not color_mapping:
|
if not color_mapping:
|
||||||
|
|
Loading…
Reference in a new issue