Marty Oehme
163fd63038
Previously, the complete read routine ran before checking that the file has an XML mimetype. This meant it tried to read any file into memory first, including PDFs and even binaries, which crashed the program.
78 lines
2.5 KiB
Python
78 lines
2.5 KiB
Python
from pathlib import Path
|
|
|
|
import magic
|
|
import papis.config
|
|
import papis.logging
|
|
from bs4 import BeautifulSoup
|
|
|
|
from papis_extract.annotation import COLORS, Annotation
|
|
|
|
# Module-level logger, obtained through papis' logging helper and named after this module.
logger = papis.logging.get_logger(__name__)
|
|
|
|
|
|
class PocketBookExtractor:
    """Extract annotations from PocketBook bookmark export files.

    An export is recognised by its ``generator`` meta tag and is parsed
    with BeautifulSoup's xml feature set. Every ``div.bookmark`` element
    in the export becomes one ``Annotation``.
    """

    # Prefix of the CSS class that carries a bookmark's highlight colour.
    _COLOR_CLASS_PREFIX = "bm-color-"
    # Colour used when a bookmark has no (known) colour class.
    _DEFAULT_COLOR = (0, 0, 0)

    def can_process(self, filename: Path) -> bool:
        """Return whether ``filename`` looks like a PocketBook export.

        The mime type is checked before the file is read, so PDFs and
        other binary files are rejected without being loaded into memory.
        A file that cannot be probed or read is reported as not
        processable instead of raising.
        """
        try:
            mimetype = magic.from_file(filename, mime=True)
        except OSError:
            # Missing or unreadable file: treat it like a failed read.
            logger.error(f"Could not open file {filename} for extraction.")
            return False
        if mimetype != "text/xml":
            return False

        content = self._read_file(filename)
        if not content:
            return False

        html = BeautifulSoup(content, features="xml")
        generator = html.find(
            "meta", {"name": "generator", "content": "PocketBook Bookmarks Export"}
        )
        return generator is not None

    def run(self, filename: Path) -> list[Annotation]:
        """Extract annotations from a PocketBook export file.

        Returns one annotation per bookmark found in the file, carrying
        the highlighted text, the note, the colour and the page where
        available. Returns an empty list if the file cannot be read or
        is empty.
        """
        content = self._read_file(filename)
        if not content:
            return []
        html = BeautifulSoup(content, features="xml")

        annotations: list[Annotation] = []
        for bookmark in html.select("div.bookmark"):
            annotations.append(
                Annotation(
                    file=str(filename),
                    content=self._text_of(bookmark, "div.bm-text>p"),
                    note=self._text_of(bookmark, "div.bm-note>p"),
                    color=self._color_of(bookmark),
                    type="Highlight",
                    page=self._parse_page(self._text_of(bookmark, "p.bm-page")),
                )
            )

        logger.debug(
            f"Found {len(annotations)} "
            f"{'annotation' if len(annotations) == 1 else 'annotations'} for {filename}."
        )
        return annotations

    @staticmethod
    def _text_of(element, selector: str) -> str:
        """Return the text of the first match of ``selector``, or ``""``."""
        match = element.select_one(selector)
        return match.text if match is not None else ""

    @staticmethod
    def _class_names(raw) -> list[str]:
        """Normalise a ``class`` attribute value to a list of class names.

        The value is a plain string when the parser does not treat
        ``class`` as multi-valued (the case for the xml feature set used
        here) and a list of names otherwise; both forms are accepted.
        """
        if raw is None:
            return []
        if isinstance(raw, str):
            return raw.split()
        return [str(name) for name in raw]

    @staticmethod
    def _parse_page(text: str) -> int:
        """Parse a page number, returning 0 when none can be read.

        Bookmarks without a page element yield an empty string, which
        ``int`` rejects; one such bookmark must not abort the whole
        extraction.
        """
        try:
            return int(text)
        except ValueError:
            # NOTE(review): 0 is used as "page unknown" - confirm this
            # matches what Annotation consumers expect.
            return 0

    def _color_of(self, bookmark):
        """Return the colour for ``bookmark`` based on its colour class.

        Only the first class starting with the colour prefix is
        considered; unknown colour names and bookmarks without a colour
        class get the default colour.
        """
        for name in self._class_names(bookmark.attrs.get("class")):
            if name.startswith(self._COLOR_CLASS_PREFIX):
                return COLORS.get(
                    name.removeprefix(self._COLOR_CLASS_PREFIX),
                    self._DEFAULT_COLOR,
                )
        return self._DEFAULT_COLOR

    def _read_file(self, filename: Path) -> str:
        """Return the text content of ``filename``, or ``""`` on failure.

        Besides missing files this also covers other OS-level errors
        (permissions, directories) and content that is not valid UTF-8,
        all of which are logged and reported as an empty string.
        """
        try:
            with open(filename, encoding="utf-8") as f:
                return f.read()
        except (OSError, UnicodeDecodeError):
            logger.error(f"Could not open file {filename} for extraction.")
            return ""