feat: Add pocketbook extraction
This commit is contained in:
parent
ddb34fca7b
commit
c53cd563b7
7 changed files with 78 additions and 14 deletions
|
@ -9,15 +9,19 @@ from papis.document import Document
|
|||
TEXT_SIMILARITY_MINIMUM = 0.75
|
||||
COLOR_SIMILARITY_MINIMUM = 0.833
|
||||
|
||||
COLORS = {
|
||||
"red": (1, 0, 0),
|
||||
"green": (0, 1, 0),
|
||||
COLORS: dict[str, tuple[float, float, float]] = {
|
||||
"blue": (0, 0, 1),
|
||||
"green": (0, 1, 0),
|
||||
"red": (1, 0, 0),
|
||||
"cyan": (0, 1, 1),
|
||||
"yellow": (1, 1, 0),
|
||||
"magenta": (1, 0, 1),
|
||||
"purple": (0.5, 0, 0.5),
|
||||
"pink": (1, 0.75, 0.8),
|
||||
"orange": (1, 0.65, 0),
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Annotation:
|
||||
"""A PDF annotation object.
|
||||
|
|
|
@ -98,7 +98,7 @@ def _add_annots_to_note(
|
|||
f.write("\n")
|
||||
logger.info(
|
||||
f"Wrote {len(new_annotations)} "
|
||||
f"{'annotation' if len(new_annotations) == 1 else 'annotations'} "
|
||||
f"{'line' if len(new_annotations) == 1 else 'lines'} "
|
||||
f"to {papis.document.describe(document)}"
|
||||
)
|
||||
|
||||
|
|
|
@ -47,6 +47,6 @@ def start(
|
|||
if not file_available:
|
||||
# have to remove curly braces or the papis logger gets upset
|
||||
desc = re.sub("[{}]", "", papis.document.describe(document))
|
||||
logger.warning("Did not find suitable file for document: " f"{desc}")
|
||||
logger.info(f"No {type(extractor)} file for document: {desc}")
|
||||
|
||||
return annotations
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from papis_extract.extraction import Extractor
|
||||
from papis_extract.extractors import pdf
|
||||
from papis_extract.extractors.pocketbook import PocketBookExtractor
|
||||
|
||||
all_extractors: dict[str, Extractor] = {
|
||||
"pdf": pdf.PdfExtractor(),
|
||||
"pocketbook": PocketBookExtractor(),
|
||||
}
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import fitz
|
||||
import Levenshtein
|
||||
|
@ -43,7 +42,7 @@ class PdfExtractor:
|
|||
file=str(filename),
|
||||
content=quote or "",
|
||||
note=note or "",
|
||||
colors=col,
|
||||
color=col,
|
||||
type=annot.type[1],
|
||||
page=(page.number or 0) + 1,
|
||||
)
|
||||
|
@ -54,14 +53,12 @@ class PdfExtractor:
|
|||
)
|
||||
return annotations
|
||||
|
||||
|
||||
def _is_pdf(self, fname: Path) -> bool:
|
||||
"""Check if file is a pdf, using mime type."""
|
||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
||||
|
||||
|
||||
def _retrieve_annotation_content(self,
|
||||
page: fitz.Page, annotation: fitz.Annot
|
||||
def _retrieve_annotation_content(
|
||||
self, page: fitz.Page, annotation: fitz.Annot
|
||||
) -> tuple[str | None, str | None]:
|
||||
"""Gets the text content of an annotation.
|
||||
|
||||
|
@ -77,7 +74,8 @@ class PdfExtractor:
|
|||
|
||||
# highlight with selection in note
|
||||
minimum_similarity = (
|
||||
papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
|
||||
papis.config.getfloat("minimum_similarity_content", "plugins.extract")
|
||||
or 1.0
|
||||
)
|
||||
if Levenshtein.ratio(content, written) > minimum_similarity:
|
||||
return (content, None)
|
||||
|
@ -92,5 +90,3 @@ class PdfExtractor:
|
|||
return (written, None)
|
||||
# just a highlight without any text
|
||||
return (None, None)
|
||||
|
||||
|
||||
|
|
60
papis_extract/extractors/pocketbook.py
Normal file
60
papis_extract/extractors/pocketbook.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
from pathlib import Path
|
||||
|
||||
import magic
|
||||
import papis.config
|
||||
import papis.logging
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from papis_extract.annotation import COLORS, Annotation
|
||||
|
||||
logger = papis.logging.get_logger(__name__)
|
||||
|
||||
|
||||
class PocketBookExtractor:
    """Extract annotations from a PocketBook annotation export file.

    The export is parsed as XML. Every ``div.bookmark`` element becomes one
    ``Annotation``; its highlighted text, note, page and colour are read from
    the ``div.bm-text``, ``div.bm-note``, ``p.bm-page`` children and the
    ``bm-color-<name>`` class of the bookmark element respectively.
    """

    def can_process(self, filename: Path) -> bool:
        """Return True if libmagic reports ``filename`` as ``text/xml``."""
        return magic.from_file(filename, mime=True) == "text/xml"

    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from a PocketBook export file.

        Export the annotations from the PocketBook app and add the exported
        file to a papis document.

        Args:
            filename: Path of the exported annotation file.

        Returns:
            All readable annotations contained in the file, with highlights,
            notes and pages if available. An empty list is returned (and an
            error is logged) when the file cannot be opened.
        """
        try:
            # Read raw bytes so the parser can honour the encoding declared in
            # the document instead of relying on the platform default.
            with open(filename, "rb") as f:
                html = BeautifulSoup(f.read(), features="xml")
        except OSError:
            # Covers missing files as well as permission/directory errors.
            logger.error(f"Could not open file {filename} for extraction.")
            return []

        annotations: list[Annotation] = []
        for bm in html.select("div.bookmark"):
            content = self._select_text(bm, "div.bm-text>p")
            note = self._select_text(bm, "div.bm-note>p")
            page = self._parse_page(self._select_text(bm, "p.bm-page"))
            color = self._color_from_classes(bm.attrs.get("class", ""))

            annotations.append(
                Annotation(
                    file=str(filename),
                    content=content or "",
                    note=note or "",
                    color=color,
                    type="Highlight",
                    page=page,
                )
            )

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    @staticmethod
    def _select_text(parent, selector: str) -> str:
        """Return the text of the first match of ``selector``, or ``""``."""
        # ``parent`` is a BeautifulSoup element; compare against None because
        # the absence of a match is signalled by ``select_one`` returning None.
        element = parent.select_one(selector)
        return element.text if element is not None else ""

    @staticmethod
    def _parse_page(text: str) -> int:
        """Convert the page text to an int, falling back to 0 if unreadable.

        A bookmark without a page element yields an empty string, which must
        not abort the whole extraction.
        NOTE(review): 0 is used as the "unknown page" value -- confirm this
        matches what consumers of ``Annotation.page`` expect.
        """
        try:
            return int(text)
        except ValueError:
            return 0

    @staticmethod
    def _color_from_classes(class_attr) -> tuple[float, float, float]:
        """Map the first ``bm-color-<name>`` class to an RGB tuple.

        Unknown colour names and bookmarks without a colour class map to
        black ``(0, 0, 0)``.
        """
        # The XML parser yields the class attribute as one string, other
        # parsers yield a list of class names; accept both.
        if isinstance(class_attr, str):
            classes = class_attr.split()
        else:
            classes = list(class_attr)

        prefix = "bm-color-"
        for name in classes:
            if name.startswith(prefix):
                return COLORS.get(name.removeprefix(prefix), (0, 0, 0))
        return (0, 0, 0)
|
|
@ -19,9 +19,11 @@ click = "^8.1.7"
|
|||
whoosh = { version = "^2.7.4", optional = true }
|
||||
python-magic = "^0.4.27"
|
||||
chevron = "^0.14.0"
|
||||
beautifulsoup4 = { version = "^4.12.3", optional = true }
|
||||
|
||||
[tool.poetry.extras]
|
||||
whoosh = ["whoosh"]
|
||||
pocketbook = ["beautifulsoup4"]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.4.0"
|
||||
|
|
Loading…
Reference in a new issue