Compare commits

...

9 commits

2 changed files with 82 additions and 14 deletions

View file

@ -20,12 +20,39 @@ active = extract
[[extract]] [[extract]]
on_import = False on_import = False
formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}"
minimum_similarity = 0.75 minimum_similarity = 0.75
``` ```
If `on_import` is `True` extraction is automatically run whenever a new document is added to the library, If `on_import` is `True` extraction is automatically run whenever a new document is added to the library,
if false extraction has to be handled manually. if false extraction has to be handled manually.
`formatting` takes a string with a variety of template options. You can use any of the following:
- `{page}`: The page number the annotation was found on.
- `{quote}`: The actual quoted string (i.e. highlighted).
- `{note}`: The annotation note (i.e. the text added by the reader).
- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after.
- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after.
For example, the default formatting string will result in this output:
```markdown
[4]
> came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
Note: Often illegally connected to network, revolution of the thirsty
```
The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}`
will result in the following output:
```markdown
Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4]
Note: Often illegally connected to network, revolution of the thirsty
```
Note, however, that in this example the page number is *only* displayed if the annotation contains a quote.
`minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed `minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form: as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:

View file

@ -1,4 +1,5 @@
import os import os
import re
import argparse import argparse
import fitz import fitz
@ -39,8 +40,18 @@ class ExtractPlugin(PapersPlugin):
# e.g. `> [{page}] {annotation}` # e.g. `> [{page}] {annotation}`
# or `:: {annotation} :: {page} ::` # or `:: {annotation} :: {page} ::`
# and so on # and so on
self.onimport = conf["plugins"].get("extract", {}).get("onimport", False) self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)) self.minimum_similarity = float(
conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
)
self.formatting = (
conf["plugins"]
.get("extract", {})
.get(
"formatting",
"[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}",
)
)
def update_parser(self, subparsers, conf): def update_parser(self, subparsers, conf):
"""Allow the usage of the pubs extract subcommand""" """Allow the usage of the pubs extract subcommand"""
@ -131,12 +142,37 @@ class ExtractPlugin(PapersPlugin):
with fitz.Document(filename) as doc: with fitz.Document(filename) as doc:
for page in doc: for page in doc:
for annot in page.annots(): for annot in page.annots():
content = self._retrieve_annotation_content(page, annot) quote, note = self._retrieve_annotation_content(page, annot)
if content: annotations.append(
annotations.append(f"[{(page.number or 0) + 1}] {content}") self._format_annotation(quote, note, page.number or 0)
)
return annotations return annotations
def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "): def _format_annotation(self, quote, note, pagenumber=0):
output = self.formatting
replacements = {
"{quote}": quote,
"{note}": note,
"{page}": str(pagenumber),
"{newline}": "\n",
}
if note == "":
output = re.sub(r"{note_begin}.*{note_end}", "", output)
if quote == "":
output = re.sub(r"{quote_begin}.*{quote_end}", "", output)
output = re.sub(r"{note_begin}", "", output)
output = re.sub(r"{note_end}", "", output)
output = re.sub(r"{quote_begin}", "", output)
output = re.sub(r"{quote_end}", "", output)
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
return pattern.sub(lambda x: replacements[x.group(0)], output)
def _retrieve_annotation_content(self, page, annotation):
"""Gets the text content of an annotation. """Gets the text content of an annotation.
Returns the actual content of an annotation. Sometimes Returns the actual content of an annotation. Sometimes
@ -148,11 +184,17 @@ class ExtractPlugin(PapersPlugin):
content = annotation.info["content"].replace("\n", " ") content = annotation.info["content"].replace("\n", " ")
written = page.get_textbox(annotation.rect).replace("\n", " ") written = page.get_textbox(annotation.rect).replace("\n", " ")
# highlight with selection in note
if Levenshtein.ratio(content, written) > self.minimum_similarity: if Levenshtein.ratio(content, written) > self.minimum_similarity:
return content return (content, "")
# an independent note, not a highlight
elif content and not written:
return ("", content)
# both a highlight and a note
elif content: elif content:
return f"{written}{connector}{content}" return (written, content)
return written # highlight with selection not in note
return (written, "")
def _to_stdout(self, annotated_papers): def _to_stdout(self, annotated_papers):
"""Write annotations to stdout. """Write annotations to stdout.
@ -165,9 +207,9 @@ class ExtractPlugin(PapersPlugin):
paper = contents[0] paper = contents[0]
annotations = contents[1] annotations = contents[1]
if annotations: if annotations:
output += f"{paper.citekey}\n" output += f"------ {paper.citekey} ------\n\n"
for annot in annotations: for annot in annotations:
output += f'> "{annot}"\n' output += f"{annot}\n\n"
output += "\n" output += "\n"
print(output) print(output)
@ -226,8 +268,7 @@ class ExtractPlugin(PapersPlugin):
def modify_event(event): def modify_event(event):
if ExtractPlugin.is_loaded(): if ExtractPlugin.is_loaded():
plg = ExtractPlugin.get_instance() plg = ExtractPlugin.get_instance()
if plg.onimport: if plg.on_import:
all_annotations = plg.extract([event.citekey]) all_annotations = plg.extract([event.citekey])
if all_annotations[0][1]: if all_annotations[0][1]:
plg._to_notes(all_annotations, plg.note_extension) plg._to_notes(all_annotations, plg.note_extension)
plg.ui.info(f"Imported {event.citekey} annotations.")