Compare commits

..

9 commits

2 changed files with 82 additions and 14 deletions

View file

@ -20,12 +20,39 @@ active = extract
[[extract]]
on_import = False
formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}"
minimum_similarity = 0.75
```
If `on_import` is `True`, extraction is automatically run whenever a new document is added to the library;
if it is `False`, extraction has to be handled manually.
`formatting` takes a string with a variety of template options. You can use any of the following:
- `{page}`: The page number the annotation was found on.
- `{quote}`: The actual quoted string (i.e. highlighted).
- `{note}`: The annotation note (i.e. the text added by the reader).
- `{quote_begin}`: Mark the area that begins a quotation. Useful to get rid of prefixes or suffixes if no quotation exists. Requires a corresponding `{quote_end}` to be set after.
- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after.
For example, the default formatting string will result in this output:
```markdown
[4]
> came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
Note: Often illegally connected to network, revolution of the thirsty
```
The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}`
will result in the following output:
```markdown
Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4]
Note: Often illegally connected to network, revolution of the thirsty
```
Note, however, that in this example the page number is *only* displayed if the annotation contains a quote.
`minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:

View file

@ -1,4 +1,5 @@
import os
import re
import argparse
import fitz
@ -39,8 +40,18 @@ class ExtractPlugin(PapersPlugin):
# e.g. `> [{page}] {annotation}`
# or `:: {annotation} :: {page} ::`
# and so on
self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75))
self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
self.minimum_similarity = float(
conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
)
self.formatting = (
conf["plugins"]
.get("extract", {})
.get(
"formatting",
"[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}",
)
)
def update_parser(self, subparsers, conf):
"""Allow the usage of the pubs extract subcommand"""
@ -131,12 +142,37 @@ class ExtractPlugin(PapersPlugin):
with fitz.Document(filename) as doc:
for page in doc:
for annot in page.annots():
content = self._retrieve_annotation_content(page, annot)
if content:
annotations.append(f"[{(page.number or 0) + 1}] {content}")
quote, note = self._retrieve_annotation_content(page, annot)
annotations.append(
self._format_annotation(quote, note, page.number or 0)
)
return annotations
def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
def _format_annotation(self, quote, note, pagenumber=0):
output = self.formatting
replacements = {
"{quote}": quote,
"{note}": note,
"{page}": str(pagenumber),
"{newline}": "\n",
}
if note == "":
output = re.sub(r"{note_begin}.*{note_end}", "", output)
if quote == "":
output = re.sub(r"{quote_begin}.*{quote_end}", "", output)
output = re.sub(r"{note_begin}", "", output)
output = re.sub(r"{note_end}", "", output)
output = re.sub(r"{quote_begin}", "", output)
output = re.sub(r"{quote_end}", "", output)
pattern = re.compile(
"|".join(
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
),
flags=re.DOTALL,
)
return pattern.sub(lambda x: replacements[x.group(0)], output)
def _retrieve_annotation_content(self, page, annotation):
"""Gets the text content of an annotation.
Returns the actual content of an annotation. Sometimes
@ -148,11 +184,17 @@ class ExtractPlugin(PapersPlugin):
content = annotation.info["content"].replace("\n", " ")
written = page.get_textbox(annotation.rect).replace("\n", " ")
if Levenshtein.ratio(content,written) > self.minimum_similarity:
return content
# highlight with selection in note
if Levenshtein.ratio(content, written) > self.minimum_similarity:
return (content, "")
# an independent note, not a highlight
elif content and not written:
return ("", content)
# both a highlight and a note
elif content:
return f"{written}{connector}{content}"
return written
return (written, content)
# highlight with selection not in note
return (written, "")
def _to_stdout(self, annotated_papers):
"""Write annotations to stdout.
@ -165,9 +207,9 @@ class ExtractPlugin(PapersPlugin):
paper = contents[0]
annotations = contents[1]
if annotations:
output += f"{paper.citekey}\n"
output += f"------ {paper.citekey} ------\n\n"
for annot in annotations:
output += f'> "{annot}"\n'
output += f"{annot}\n\n"
output += "\n"
print(output)
@ -226,8 +268,7 @@ class ExtractPlugin(PapersPlugin):
def modify_event(event):
if ExtractPlugin.is_loaded():
plg = ExtractPlugin.get_instance()
if plg.onimport:
if plg.on_import:
all_annotations = plg.extract([event.citekey])
if all_annotations[0][1]:
plg._to_notes(all_annotations, plg.note_extension)
plg.ui.info(f"Imported {event.citekey} annotations.")