diff --git a/papis_extract/annotation.py b/papis_extract/annotation.py
index 26bec9d..0955e47 100644
--- a/papis_extract/annotation.py
+++ b/papis_extract/annotation.py
@@ -9,15 +9,19 @@ from papis.document import Document
 TEXT_SIMILARITY_MINIMUM = 0.75
 COLOR_SIMILARITY_MINIMUM = 0.833
 
-COLORS = {
-    "red": (1, 0, 0),
-    "green": (0, 1, 0),
+COLORS: dict[str, tuple[float, float, float]] = {
     "blue": (0, 0, 1),
+    "green": (0, 1, 0),
+    "red": (1, 0, 0),
+    "cyan": (0, 1, 1),
     "yellow": (1, 1, 0),
+    "magenta": (1, 0, 1),
     "purple": (0.5, 0, 0.5),
+    "pink": (1, 0.75, 0.8),
     "orange": (1, 0.65, 0),
 }
 
+
 @dataclass
 class Annotation:
     """A PDF annotation object.
diff --git a/papis_extract/exporter.py b/papis_extract/exporter.py
index 862a068..11cb666 100644
--- a/papis_extract/exporter.py
+++ b/papis_extract/exporter.py
@@ -98,7 +98,7 @@ def _add_annots_to_note(
             f.write("\n")
         logger.info(
             f"Wrote {len(new_annotations)} "
-            f"{'annotation' if len(new_annotations) == 1 else 'annotations'} "
+            f"{'line' if len(new_annotations) == 1 else 'lines'} "
             f"to {papis.document.describe(document)}"
         )
 
diff --git a/papis_extract/extraction.py b/papis_extract/extraction.py
index bf7391a..79c6d6a 100644
--- a/papis_extract/extraction.py
+++ b/papis_extract/extraction.py
@@ -47,6 +47,6 @@ def start(
     if not file_available:
         # have to remove curlys or papis logger gets upset
         desc = re.sub("[{}]", "", papis.document.describe(document))
-        logger.warning("Did not find suitable file for document: " f"{desc}")
+        logger.info(f"No {type(extractor).__name__} file for document: {desc}")
 
     return annotations
diff --git a/papis_extract/extractors/__init__.py b/papis_extract/extractors/__init__.py
index 174b556..ae9205a 100644
--- a/papis_extract/extractors/__init__.py
+++ b/papis_extract/extractors/__init__.py
@@ -1,6 +1,15 @@
 from papis_extract.extraction import Extractor
 from papis_extract.extractors import pdf
 
 all_extractors: dict[str, Extractor] = {
     "pdf": pdf.PdfExtractor(),
 }
+
+# The pocketbook extractor depends on the optional `beautifulsoup4` extra;
+# register it only when that dependency is importable.
+try:
+    from papis_extract.extractors.pocketbook import PocketBookExtractor
+except ImportError:
+    pass
+else:
+    all_extractors["pocketbook"] = PocketBookExtractor()
diff --git a/papis_extract/extractors/pdf.py b/papis_extract/extractors/pdf.py
index b38ed50..a5f8de4 100644
--- a/papis_extract/extractors/pdf.py
+++ b/papis_extract/extractors/pdf.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Any, Optional
 
 import fitz
 import Levenshtein
@@ -43,7 +42,7 @@ class PdfExtractor:
                             file=str(filename),
                             content=quote or "",
                             note=note or "",
-                            colors=col,
+                            color=col,
                             type=annot.type[1],
                             page=(page.number or 0) + 1,
                         )
@@ -54,14 +53,12 @@ class PdfExtractor:
             )
         return annotations
 
-
     def _is_pdf(self, fname: Path) -> bool:
         """Check if file is a pdf, using mime type."""
         return magic.from_file(fname, mime=True) == "application/pdf"
 
-
-    def _retrieve_annotation_content(self,
-            page: fitz.Page, annotation: fitz.Annot
+    def _retrieve_annotation_content(
+        self, page: fitz.Page, annotation: fitz.Annot
     ) -> tuple[str | None, str | None]:
         """Gets the text content of an annotation.
 
@@ -77,7 +74,8 @@ class PdfExtractor:
 
         # highlight with selection in note
         minimum_similarity = (
-            papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
+            papis.config.getfloat("minimum_similarity_content", "plugins.extract")
+            or 1.0
         )
         if Levenshtein.ratio(content, written) > minimum_similarity:
             return (content, None)
@@ -92,5 +90,3 @@ class PdfExtractor:
             return (written, None)
         # just a highlight without any text
         return (None, None)
-
-
diff --git a/papis_extract/extractors/pocketbook.py b/papis_extract/extractors/pocketbook.py
new file mode 100644
index 0000000..c7f2a9c
--- /dev/null
+++ b/papis_extract/extractors/pocketbook.py
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+import magic
+import papis.config
+import papis.logging
+from bs4 import BeautifulSoup
+
+from papis_extract.annotation import COLORS, Annotation
+
+logger = papis.logging.get_logger(__name__)
+
+
+class PocketBookExtractor:
+    """Extract annotations from PocketBook reader exports."""
+
+    def can_process(self, filename: Path) -> bool:
+        """Check if file is an xml file, using mime type."""
+        return magic.from_file(filename, mime=True) == "text/xml"
+
+    def run(self, filename: Path) -> list[Annotation]:
+        """Extract annotations from a PocketBook html export.
+
+        Export the annotations from the PocketBook app and add the
+        exported html file to a papis document to make use of it.
+
+        Returns all readable annotations contained in the file
+        passed in, with highlights, notes and pages if available.
+        An empty list is returned if the file cannot be read.
+        """
+        annotations: list[Annotation] = []
+        try:
+            # Read bytes so the parser can honour the encoding declared in
+            # the document instead of relying on the locale default.
+            # NOTE(review): features="xml" requires lxml to be installed;
+            # confirm it is declared as a dependency.
+            with open(filename, "rb") as f:
+                html = BeautifulSoup(f.read(), features="xml")
+        except OSError:
+            logger.error(f"Could not open file {filename} for extraction.")
+            return []
+
+        for bm in html.select("div.bookmark"):
+            content = (bm.select_one("div.bm-text>p") or html.new_string("")).text
+            note = (bm.select_one("div.bm-note>p") or html.new_string("")).text
+            page = (bm.select_one("p.bm-page") or html.new_string("")).text
+
+            color = (0, 0, 0)
+            for c in bm.attrs.get("class", "").split():
+                if c.startswith("bm-color-"):
+                    color = COLORS.get(c.removeprefix("bm-color-"), (0, 0, 0))
+                    break
+
+            try:
+                page_number = int(page)
+            except ValueError:
+                # Bookmark without a (numeric) page entry: fall back to 0
+                # instead of aborting the whole extraction.
+                page_number = 0
+
+            a = Annotation(
+                file=str(filename),
+                content=content or "",
+                note=note or "",
+                color=color,
+                type="Highlight",
+                page=page_number,
+            )
+            annotations.append(a)
+
+        logger.debug(
+            f"Found {len(annotations)} "
+            f"{'annotation' if len(annotations) == 1 else 'annotations'} "
+            f"for {filename}."
+        )
+        return annotations
diff --git a/pyproject.toml b/pyproject.toml
index 526732c..0c793cd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,9 +19,11 @@ click = "^8.1.7"
 whoosh = { version = "^2.7.4", optional = true }
 python-magic = "^0.4.27"
 chevron = "^0.14.0"
+beautifulsoup4 = { version = "^4.12.3", optional = true }
 
 [tool.poetry.extras]
 whoosh = ["whoosh"]
+pocketbook = ["beautifulsoup4"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"