Extract images from summary in Original dataclasses

This commit is contained in:
Marty Oehme 2025-06-05 22:33:48 +02:00
parent 7a11e45d67
commit 742845a329
Signed by: Marty
GPG key ID: 4E535BC19C61886E

View file

@ -2,6 +2,7 @@ import hashlib
import json import json
import os import os
import pickle import pickle
import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@ -28,11 +29,29 @@ class Original: # BadJoke: Sting
summary: str summary: str
link: str link: str
date: datetime date: datetime
image_link: str | None = None
id: str = field(init=False) id: str = field(init=False)
def _extract_img(self, s: str) -> tuple[str, str]: # [img_link, rest of string]
img: str
m = re.match(r'<img src="(?P<img>.+?)"', s)
try:
img = m.group("img")
except (IndexError, NameError):
return ("", s)
if img:
rest = re.sub(r"<img src=.+?>", "", s)
return (img, rest)
def __post_init__(self): def __post_init__(self):
self.id = hashlib.sha256(self.link.encode()).hexdigest() self.id = hashlib.sha256(self.link.encode()).hexdigest()
extracted = self._extract_img(self.summary)
if extracted[0]:
self.image_link = extracted[0]
self.summary = extracted[1]
@dataclass @dataclass
class Improvement: # GoodJoke: Queen class Improvement: # GoodJoke: Queen