Compare commits
9 commits
91525827b8
...
59ff74475d
Author | SHA1 | Date | |
---|---|---|---|
59ff74475d | |||
a3e2d8693d | |||
e3aacc4b15 | |||
8f01b93de2 | |||
f103cc51b5 | |||
e255405d96 | |||
b65c2a3be6 | |||
82b99e8420 | |||
4281dba9c2 |
2 changed files with 82 additions and 14 deletions
27
README.md
27
README.md
|
@ -20,12 +20,39 @@ active = extract
|
||||||
|
|
||||||
[[extract]]
|
[[extract]]
|
||||||
on_import = False
|
on_import = False
|
||||||
|
formatting = "[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}"
|
||||||
minimum_similarity = 0.75
|
minimum_similarity = 0.75
|
||||||
```
|
```
|
||||||
|
|
||||||
If `on_import` is `True`, extraction is run automatically whenever a new document is added to the library;
|
If `on_import` is `True`, extraction is run automatically whenever a new document is added to the library;
|
||||||
if it is `False`, extraction has to be run manually.
|
if it is `False`, extraction has to be run manually.
|
||||||
|
|
||||||
|
`formatting` takes a string with a variety of template options. You can use any of the following:
|
||||||
|
|
||||||
|
- `{page}`: The page number the annotation was found on.
|
||||||
|
- `{quote}`: The actual quoted string (i.e. highlighted).
|
||||||
|
- `{note}`: The annotation note (i.e. the text added by the reader).
|
||||||
|
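- `{quote_begin}`: Marks the start of the quotation area. Everything between this marker and `{quote_end}` is removed if the annotation has no quotation, which is useful for dropping prefixes or suffixes that only make sense alongside a quote. Requires a corresponding `{quote_end}` to be set after it.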
|
||||||
|
- `{note_begin}`: Same idea for the note area. Requires a corresponding `{note_end}` to be set after.
|
||||||
|
|
||||||
|
For example, the default formatting string will result in this output:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
[4]
|
||||||
|
> came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop
|
||||||
|
Note: Often illegally connected to network, ‘revolution of the thirsty’
|
||||||
|
```
|
||||||
|
|
||||||
|
The formatting string `{quote_begin}Quote: {quote}{page}{newline}{quote_end}{note_begin}Note: {note}{note_end}`
|
||||||
|
will result in the following output:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
Quote: came “urban rights” caused collective outrage. Thus, through- out 2007 and 2008, protestors in towns and villages across the Nile Delta poured into the streets to rally over cuts in water flow. Deployment of massive riot police could not stop [4]
|
||||||
|
Note: Often illegally connected to network, ‘revolution of the thirsty’
|
||||||
|
```
|
||||||
|
|
||||||
|
Note, however, that in this example the page number is *only* displayed if the annotation contains a quote.
|
||||||
|
|
||||||
`minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
|
`minimum_similarity` sets the required similarity of an annotation's note and written words to be viewed
|
||||||
as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:
|
as one. Any annotation that has both and is *under* the minimum similarity will be added in the following form:
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
@ -39,8 +40,18 @@ class ExtractPlugin(PapersPlugin):
|
||||||
# e.g. `> [{page}] {annotation}`
|
# e.g. `> [{page}] {annotation}`
|
||||||
# or `:: {annotation} :: {page} ::`
|
# or `:: {annotation} :: {page} ::`
|
||||||
# and so on
|
# and so on
|
||||||
self.onimport = conf["plugins"].get("extract", {}).get("onimport", False)
|
self.on_import = conf["plugins"].get("extract", {}).get("on_import", False)
|
||||||
self.minimum_similarity = float(conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75))
|
self.minimum_similarity = float(
|
||||||
|
conf["plugins"].get("extract", {}).get("minimum_similarity", 0.75)
|
||||||
|
)
|
||||||
|
self.formatting = (
|
||||||
|
conf["plugins"]
|
||||||
|
.get("extract", {})
|
||||||
|
.get(
|
||||||
|
"formatting",
|
||||||
|
"[{page}]{newline}{quote_begin}> {quote}{newline}{quote_end}{note_begin}Note: {note}{note_end}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def update_parser(self, subparsers, conf):
|
def update_parser(self, subparsers, conf):
|
||||||
"""Allow the usage of the pubs extract subcommand"""
|
"""Allow the usage of the pubs extract subcommand"""
|
||||||
|
@ -131,12 +142,37 @@ class ExtractPlugin(PapersPlugin):
|
||||||
with fitz.Document(filename) as doc:
|
with fitz.Document(filename) as doc:
|
||||||
for page in doc:
|
for page in doc:
|
||||||
for annot in page.annots():
|
for annot in page.annots():
|
||||||
content = self._retrieve_annotation_content(page, annot)
|
quote, note = self._retrieve_annotation_content(page, annot)
|
||||||
if content:
|
annotations.append(
|
||||||
annotations.append(f"[{(page.number or 0) + 1}] {content}")
|
self._format_annotation(quote, note, page.number or 0)
|
||||||
|
)
|
||||||
return annotations
|
return annotations
|
||||||
|
|
||||||
def _retrieve_annotation_content(self, page, annotation, connector = "\nNote: "):
|
def _format_annotation(self, quote, note, pagenumber=0):
|
||||||
|
output = self.formatting
|
||||||
|
replacements = {
|
||||||
|
"{quote}": quote,
|
||||||
|
"{note}": note,
|
||||||
|
"{page}": str(pagenumber),
|
||||||
|
"{newline}": "\n",
|
||||||
|
}
|
||||||
|
if note == "":
|
||||||
|
output = re.sub(r"{note_begin}.*{note_end}", "", output)
|
||||||
|
if quote == "":
|
||||||
|
output = re.sub(r"{quote_begin}.*{quote_end}", "", output)
|
||||||
|
output = re.sub(r"{note_begin}", "", output)
|
||||||
|
output = re.sub(r"{note_end}", "", output)
|
||||||
|
output = re.sub(r"{quote_begin}", "", output)
|
||||||
|
output = re.sub(r"{quote_end}", "", output)
|
||||||
|
pattern = re.compile(
|
||||||
|
"|".join(
|
||||||
|
[re.escape(k) for k in sorted(replacements, key=len, reverse=True)]
|
||||||
|
),
|
||||||
|
flags=re.DOTALL,
|
||||||
|
)
|
||||||
|
return pattern.sub(lambda x: replacements[x.group(0)], output)
|
||||||
|
|
||||||
|
def _retrieve_annotation_content(self, page, annotation):
|
||||||
"""Gets the text content of an annotation.
|
"""Gets the text content of an annotation.
|
||||||
|
|
||||||
Returns the actual content of an annotation. Sometimes
|
Returns the actual content of an annotation. Sometimes
|
||||||
|
@ -148,11 +184,17 @@ class ExtractPlugin(PapersPlugin):
|
||||||
content = annotation.info["content"].replace("\n", " ")
|
content = annotation.info["content"].replace("\n", " ")
|
||||||
written = page.get_textbox(annotation.rect).replace("\n", " ")
|
written = page.get_textbox(annotation.rect).replace("\n", " ")
|
||||||
|
|
||||||
if Levenshtein.ratio(content,written) > self.minimum_similarity:
|
# highlight with selection in note
|
||||||
return content
|
if Levenshtein.ratio(content, written) > self.minimum_similarity:
|
||||||
|
return (content, "")
|
||||||
|
# an independent note, not a highlight
|
||||||
|
elif content and not written:
|
||||||
|
return ("", content)
|
||||||
|
# both a highlight and a note
|
||||||
elif content:
|
elif content:
|
||||||
return f"{written}{connector}{content}"
|
return (written, content)
|
||||||
return written
|
# highlight with selection not in note
|
||||||
|
return (written, "")
|
||||||
|
|
||||||
def _to_stdout(self, annotated_papers):
|
def _to_stdout(self, annotated_papers):
|
||||||
"""Write annotations to stdout.
|
"""Write annotations to stdout.
|
||||||
|
@ -165,9 +207,9 @@ class ExtractPlugin(PapersPlugin):
|
||||||
paper = contents[0]
|
paper = contents[0]
|
||||||
annotations = contents[1]
|
annotations = contents[1]
|
||||||
if annotations:
|
if annotations:
|
||||||
output += f"{paper.citekey}\n"
|
output += f"------ {paper.citekey} ------\n\n"
|
||||||
for annot in annotations:
|
for annot in annotations:
|
||||||
output += f'> "{annot}"\n'
|
output += f"{annot}\n\n"
|
||||||
output += "\n"
|
output += "\n"
|
||||||
print(output)
|
print(output)
|
||||||
|
|
||||||
|
@ -226,8 +268,7 @@ class ExtractPlugin(PapersPlugin):
|
||||||
def modify_event(event):
|
def modify_event(event):
|
||||||
if ExtractPlugin.is_loaded():
|
if ExtractPlugin.is_loaded():
|
||||||
plg = ExtractPlugin.get_instance()
|
plg = ExtractPlugin.get_instance()
|
||||||
if plg.onimport:
|
if plg.on_import:
|
||||||
all_annotations = plg.extract([event.citekey])
|
all_annotations = plg.extract([event.citekey])
|
||||||
if all_annotations[0][1]:
|
if all_annotations[0][1]:
|
||||||
plg._to_notes(all_annotations, plg.note_extension)
|
plg._to_notes(all_annotations, plg.note_extension)
|
||||||
plg.ui.info(f"Imported {event.citekey} annotations.")
|
|
||||||
|
|
Loading…
Reference in a new issue