feat: Add pocketbook extraction
This commit is contained in:
parent
ddb34fca7b
commit
c53cd563b7
7 changed files with 78 additions and 14 deletions
|
@ -9,15 +9,19 @@ from papis.document import Document
|
||||||
TEXT_SIMILARITY_MINIMUM = 0.75
|
TEXT_SIMILARITY_MINIMUM = 0.75
|
||||||
COLOR_SIMILARITY_MINIMUM = 0.833
|
COLOR_SIMILARITY_MINIMUM = 0.833
|
||||||
|
|
||||||
COLORS = {
|
COLORS: dict[str, tuple[float, float, float]] = {
|
||||||
"red": (1, 0, 0),
|
|
||||||
"green": (0, 1, 0),
|
|
||||||
"blue": (0, 0, 1),
|
"blue": (0, 0, 1),
|
||||||
|
"green": (0, 1, 0),
|
||||||
|
"red": (1, 0, 0),
|
||||||
|
"cyan": (0, 1, 1),
|
||||||
"yellow": (1, 1, 0),
|
"yellow": (1, 1, 0),
|
||||||
|
"magenta": (1, 0, 1),
|
||||||
"purple": (0.5, 0, 0.5),
|
"purple": (0.5, 0, 0.5),
|
||||||
|
"pink": (1, 0.75, 0.8),
|
||||||
"orange": (1, 0.65, 0),
|
"orange": (1, 0.65, 0),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Annotation:
|
class Annotation:
|
||||||
"""A PDF annotation object.
|
"""A PDF annotation object.
|
||||||
|
|
|
@ -98,7 +98,7 @@ def _add_annots_to_note(
|
||||||
f.write("\n")
|
f.write("\n")
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Wrote {len(new_annotations)} "
|
f"Wrote {len(new_annotations)} "
|
||||||
f"{'annotation' if len(new_annotations) == 1 else 'annotations'} "
|
f"{'line' if len(new_annotations) == 1 else 'lines'} "
|
||||||
f"to {papis.document.describe(document)}"
|
f"to {papis.document.describe(document)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,6 @@ def start(
|
||||||
if not file_available:
|
if not file_available:
|
||||||
# have to remove curlys or papis logger gets upset
|
# have to remove curlys or papis logger gets upset
|
||||||
desc = re.sub("[{}]", "", papis.document.describe(document))
|
desc = re.sub("[{}]", "", papis.document.describe(document))
|
||||||
logger.warning("Did not find suitable file for document: " f"{desc}")
|
logger.info(f"No {type(extractor)} file for document: {desc}")
|
||||||
|
|
||||||
return annotations
|
return annotations
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
from papis_extract.extraction import Extractor
|
from papis_extract.extraction import Extractor
|
||||||
from papis_extract.extractors import pdf
|
from papis_extract.extractors import pdf
|
||||||
|
from papis_extract.extractors.pocketbook import PocketBookExtractor
|
||||||
|
|
||||||
all_extractors: dict[str, Extractor] = {
|
all_extractors: dict[str, Extractor] = {
|
||||||
"pdf": pdf.PdfExtractor(),
|
"pdf": pdf.PdfExtractor(),
|
||||||
|
"pocketbook": PocketBookExtractor(),
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
import Levenshtein
|
import Levenshtein
|
||||||
|
@ -43,7 +42,7 @@ class PdfExtractor:
|
||||||
file=str(filename),
|
file=str(filename),
|
||||||
content=quote or "",
|
content=quote or "",
|
||||||
note=note or "",
|
note=note or "",
|
||||||
colors=col,
|
color=col,
|
||||||
type=annot.type[1],
|
type=annot.type[1],
|
||||||
page=(page.number or 0) + 1,
|
page=(page.number or 0) + 1,
|
||||||
)
|
)
|
||||||
|
@ -54,14 +53,12 @@ class PdfExtractor:
|
||||||
)
|
)
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
|
|
||||||
def _is_pdf(self, fname: Path) -> bool:
|
def _is_pdf(self, fname: Path) -> bool:
|
||||||
"""Check if file is a pdf, using mime type."""
|
"""Check if file is a pdf, using mime type."""
|
||||||
return magic.from_file(fname, mime=True) == "application/pdf"
|
return magic.from_file(fname, mime=True) == "application/pdf"
|
||||||
|
|
||||||
|
def _retrieve_annotation_content(
|
||||||
def _retrieve_annotation_content(self,
|
self, page: fitz.Page, annotation: fitz.Annot
|
||||||
page: fitz.Page, annotation: fitz.Annot
|
|
||||||
) -> tuple[str | None, str | None]:
|
) -> tuple[str | None, str | None]:
|
||||||
"""Gets the text content of an annotation.
|
"""Gets the text content of an annotation.
|
||||||
|
|
||||||
|
@ -77,7 +74,8 @@ class PdfExtractor:
|
||||||
|
|
||||||
# highlight with selection in note
|
# highlight with selection in note
|
||||||
minimum_similarity = (
|
minimum_similarity = (
|
||||||
papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
|
papis.config.getfloat("minimum_similarity_content", "plugins.extract")
|
||||||
|
or 1.0
|
||||||
)
|
)
|
||||||
if Levenshtein.ratio(content, written) > minimum_similarity:
|
if Levenshtein.ratio(content, written) > minimum_similarity:
|
||||||
return (content, None)
|
return (content, None)
|
||||||
|
@ -92,5 +90,3 @@ class PdfExtractor:
|
||||||
return (written, None)
|
return (written, None)
|
||||||
# just a highlight without any text
|
# just a highlight without any text
|
||||||
return (None, None)
|
return (None, None)
|
||||||
|
|
||||||
|
|
||||||
|
|
60
papis_extract/extractors/pocketbook.py
Normal file
60
papis_extract/extractors/pocketbook.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import magic
|
||||||
|
import papis.config
|
||||||
|
import papis.logging
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from papis_extract.annotation import COLORS, Annotation
|
||||||
|
|
||||||
|
logger = papis.logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PocketBookExtractor:
    """Extractor for annotation exports written by the PocketBook reader app.

    The export is parsed as XML; each ``div.bookmark`` element in it becomes
    one ``Annotation``.
    """

    # Class-name prefix that carries the highlight colour of a bookmark,
    # e.g. ``bm-color-yellow``.
    _COLOR_CLASS_PREFIX = "bm-color-"

    def can_process(self, filename: Path) -> bool:
        """Return True if libmagic reports the file's mime type as ``text/xml``."""
        return magic.from_file(filename, mime=True) == "text/xml"

    def run(self, filename: Path) -> "list[Annotation]":
        """Extract annotations from pocketbook html file.

        Export annotations from the pocketbook app and add the exported
        html file to a papis document.

        Returns all readable annotations contained in the file
        passed in, with highlights, notes and pages if available.
        A bookmark without a readable page number gets page 0; an
        unreadable file yields an empty list.
        """
        try:
            # Hand raw bytes to BeautifulSoup so it can determine the
            # encoding from the document itself instead of depending on
            # the platform's default text encoding.
            with open(filename, "rb") as f:
                html = BeautifulSoup(f.read(), features="xml")
        except OSError:
            # Covers a missing file as well as permission errors,
            # directories and similar read failures.
            logger.error(f"Could not open file {filename} for extraction.")
            return []

        annotations: list[Annotation] = []
        for bm in html.select("div.bookmark"):
            annotations.append(
                Annotation(
                    file=str(filename),
                    content=self._text_of(bm, "div.bm-text>p"),
                    note=self._text_of(bm, "div.bm-note>p"),
                    color=self._color_of(bm),
                    type="Highlight",
                    page=self._parse_page(self._text_of(bm, "p.bm-page")),
                )
            )

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    @staticmethod
    def _text_of(bookmark, selector: str) -> str:
        """Return the text of the first element matching selector, or ''."""
        element = bookmark.select_one(selector)
        return element.text if element is not None else ""

    @classmethod
    def _color_of(cls, bookmark) -> tuple[float, float, float]:
        """Map the bookmark's ``bm-color-*`` class to an RGB tuple.

        Falls back to black when no colour class is present or the colour
        name is not a key of ``COLORS``.
        """
        classes = bookmark.attrs.get("class", "")
        # Depending on the parser the attribute is either one
        # whitespace-separated string or an already split list.
        if isinstance(classes, str):
            classes = classes.split()
        for name in classes:
            if name.startswith(cls._COLOR_CLASS_PREFIX):
                return COLORS.get(
                    name.removeprefix(cls._COLOR_CLASS_PREFIX), (0, 0, 0)
                )
        return (0, 0, 0)

    @staticmethod
    def _parse_page(text: str) -> int:
        """Convert page text to an int, using 0 when it is empty or not a number."""
        try:
            return int(text.strip())
        except ValueError:
            return 0
|
|
@ -19,9 +19,11 @@ click = "^8.1.7"
|
||||||
whoosh = { version = "^2.7.4", optional = true }
|
whoosh = { version = "^2.7.4", optional = true }
|
||||||
python-magic = "^0.4.27"
|
python-magic = "^0.4.27"
|
||||||
chevron = "^0.14.0"
|
chevron = "^0.14.0"
|
||||||
|
beautifulsoup4 = { version = "^4.12.3", optional = true }
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
whoosh = ["whoosh"]
|
whoosh = ["whoosh"]
|
||||||
|
pocketbook = ["beautifulsoup4"]
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
pytest = "^7.4.0"
|
pytest = "^7.4.0"
|
||||||
|
|
Loading…
Reference in a new issue