diff --git a/papis_extract/extractor.py b/papis_extract/extractor.py
index 2ca475c..9467231 100644
--- a/papis_extract/extractor.py
+++ b/papis_extract/extractor.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Any, Optional
 
 import Levenshtein
 import fitz_new as fitz
@@ -7,8 +8,6 @@ import papis.config
 
 from papis_extract.annotation_data import Annotation
 
-COLOR_MAPPING = {}
-
 logger = papis.logging.get_logger(__name__)
 
 
@@ -23,15 +22,17 @@ def start(filename: Path) -> list[Annotation]:
     for page in doc:
         for annot in page.annots():
             quote, note = _retrieve_annotation_content(page, annot)
+            if not quote and not note:
+                continue
             a = Annotation(
                 file=str(filename),
-                text=quote,
-                content=note,
+                text=quote or "",
+                content=note or "",
                 colors=annot.colors,
                 type=annot.type[1],
                 page=(page.number or 0) + 1,
             )
-            a.tag = _tag_from_colorname(a.colorname)
+            a.tag = _tag_from_colorname(a.colorname or "")
             annotations.append(a)
     logger.debug(
         f"Found {len(annotations)} "
@@ -40,11 +41,18 @@ def start(filename: Path) -> list[Annotation]:
     return annotations
 
 
-def _tag_from_colorname(colorname):
-    return COLOR_MAPPING.get(colorname, "")
+def _tag_from_colorname(colorname: str) -> str:
+    """Return the tag configured for ``colorname``, or an empty string."""
+    color_mapping: dict[str, str] = getdict("tags", "plugins.extract")
+    if not color_mapping:
+        return ""
+
+    return color_mapping.get(colorname, "")
 
 
-def _retrieve_annotation_content(page, annotation):
+def _retrieve_annotation_content(
+    page: fitz.Page, annotation: fitz.Annot
+) -> tuple[str | None, str | None]:
     """Gets the text content of an annotation.
 
     Returns the actual content of an annotation. Sometimes
@@ -62,12 +70,49 @@ def _retrieve_annotation_content(page, annotation):
         papis.config.getfloat("minimum_similarity_content", "plugins.extract") or 1.0
     )
     if Levenshtein.ratio(content, written) > minimum_similarity:
-        return (content, "")
-    # an independent note, not a highlight
-    elif content and not written:
-        return ("", content)
+        return (content, None)
     # both a highlight and a note
-    elif content:
+    elif content and written:
         return (written, content)
+    # an independent note, not a highlight
+    elif content:
+        return (None, content)
     # highlight with selection not in note
-    return (written, "")
+    elif written:
+        return (written, None)
+    # just a highlight without any text
+    return (None, None)
+
+
+# mimics the functions in papis.config.{getlist,getint,getfloat} etc.
+def getdict(key: str, section: Optional[str] = None) -> dict[str, str]:
+    """Read a configuration value and return it as a dict.
+
+    The raw value is fetched with ``papis.config.general_get``. A value that
+    is already a dict is returned unchanged; anything else is evaluated as a
+    Python expression and must produce a dict.
+
+    :param key: Name of the configuration key to read.
+    :param section: Configuration section passed on to ``general_get``.
+    :returns: A python dict
+    :raises SyntaxError: Whenever the parsed syntax is either not a valid
+        python object or a valid python dict.
+    """
+    rawvalue: Any = papis.config.general_get(key, section=section)
+    if isinstance(rawvalue, dict):
+        return rawvalue
+    try:
+        # NOTE(review): eval() executes arbitrary code from the configuration
+        # value; consider ast.literal_eval if plain dict literals suffice.
+        rawvalue = eval(rawvalue)
+    except Exception as err:
+        raise SyntaxError(
+            "The key '{}' must be a valid Python object: {}"
+            .format(key, rawvalue)) from err
+
+    if not isinstance(rawvalue, dict):
+        raise SyntaxError(
+            "The key '{}' must be a valid Python dict. Got: {} (type {!r})"
+            .format(key, rawvalue, type(rawvalue).__name__))
+
+    return rawvalue