Add extraction for no-content and note highlights

This commit is contained in:
Marty Oehme 2022-12-22 22:06:41 +01:00
parent d14a95e18b
commit c9f286fc33
Signed by: Marty
GPG key ID: 73BA40D5AFAF49C9
2 changed files with 23 additions and 3 deletions

View file

@@ -129,13 +129,20 @@ class ExtractPlugin(PapersPlugin):
with fitz.Document(filename) as doc:
for page in doc:
for annot in page.annots():
content = annot.get_text() or annot.info["content"].replace(
"\n", ""
)
content = self._retrieve_annotation_content(page, annot)
if content:
annotations.append(f"[{(page.number or 0) + 1}] {content}")
return annotations
def _retrieve_annotation_content(self, page, annotation):
content = annotation.info["content"].replace("\n", " ")
written = page.get_textbox(annotation.rect).replace("\n", " ")
if written in content:
return content
elif content:
return f"{written}\nNote: {content}"
return written
def _to_stdout(self, annotated_papers):
"""Write annotations to stdout.