From 3bdc37b7297dc79496497379980a9021b5e80542 Mon Sep 17 00:00:00 2001
From: Marty Oehme <marty.oehme@gmail.com>
Date: Fri, 14 Jun 2024 20:02:52 +0200
Subject: [PATCH] fix: Only inform if no extractor finds valid files

Until now, whenever an extractor could not find any valid files for a
document, it would inform the user. However, this is not very useful:
if you have both a pdf and an epub extractor running, you would be
informed about every document that has only one of the two formats, as
well as about those that actually have no valid files for *any* of the
running extractors.

This commit changes the behavior to inform the user only when none of
the running extractors finds a valid file, since that is the case a
user actually wants to be informed about.
---
 papis_extract/__init__.py   | 11 ++++++++++-
 papis_extract/extraction.py | 12 ++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/papis_extract/__init__.py b/papis_extract/__init__.py
index 3e4792d..043d3b5 100644
--- a/papis_extract/__init__.py
+++ b/papis_extract/__init__.py
@@ -1,3 +1,4 @@
+import re
 import click
 import papis.cli
 import papis.config
@@ -145,10 +146,18 @@ def run(
     doc_annots: list[tuple[Document, list[Annotation]]] = []
     for doc in documents:
         annotations: list[Annotation] = []
+        valid_files: int = 0
         for ext in extractors:
             if not ext:
                 continue
-            annotations.extend(extraction.start(ext, doc))
+            added = extraction.start(ext, doc)
+            if added is not None:
+                valid_files += 1
+                annotations.extend(added)
         doc_annots.append((doc, annotations))
 
+        if valid_files == 0:
+            # have to remove curlys or papis logger gets upset
+            desc = re.sub("[{}]", "", papis.document.describe(doc))
+            logger.info(f"Document {desc} has no valid extractors for any of its files.")
     exporter.run(doc_annots)
diff --git a/papis_extract/extraction.py b/papis_extract/extraction.py
index 2054323..0fa8b31 100644
--- a/papis_extract/extraction.py
+++ b/papis_extract/extraction.py
@@ -22,11 +22,13 @@ class Extractor(Protocol):
 def start(
     extractor: Extractor,
     document: Document,
-) -> list[Annotation]:
+) -> list[Annotation] | None:
     """Extract all annotations from passed documents.
 
     Returns all annotations contained in the papis
-    documents passed in.
+    documents passed in (empty list if no annotations).
+    If there are no files that the extractor can process,
+    returns None instead.
     """
     annotations: list[Annotation] = []
     file_available: bool = False
@@ -40,11 +42,9 @@ def start(
         try:
             annotations.extend(extractor.run(fname))
         except ExtractionError as e:
-            print(f"File extraction errors for {file}.\n{e}")
+            logger.error(f"File extraction errors for {file}. File may be damaged.\n{e}")
 
     if not file_available:
-        # have to remove curlys or papis logger gets upset
-        desc = re.sub("[{}]", "", papis.document.describe(document))
-        logger.info(f"No {type(extractor)} file for document: {desc}")
+        return None
 
     return annotations