diff --git a/pyproject.toml b/pyproject.toml index 75daa03..bc886ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,15 +9,3 @@ dependencies = [ "python-tgpt>=0.8.1", "yt-dlp>=2025.1.15", ] - -[project.scripts] -summarize = "summarize:cli" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = [ - "summarize.py", -] diff --git a/summarize.py b/summarize.py deleted file mode 100755 index 88f8621..0000000 --- a/summarize.py +++ /dev/null @@ -1,139 +0,0 @@ -import subprocess -import tempfile -from pathlib import Path -from typing import cast - -import click -import pytgpt.phind as phind - -debug = False - - -def summarize_text_file(content: str, prompt: str | None) -> str: - prompt = prompt or "Please summarize the following transcript:" - bot = phind.PHIND() - res = bot.chat(f"{prompt} {content}") - return res - - -def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str: - jq_command = "jq '.events.[].segs | select(. != null).[].utf8'" - result = subprocess.run( - jq_command, shell=True, capture_output=True, text=True, input=content - ).stdout - # Replace newlines with spaces - result = result.replace("\n", "").split() - - # Join lines back together with newlines - processed = ( - " ".join(result).replace('"', "").replace("\\n", "\n" if keep_newlines else " ") - ) - - return processed - - -def grab_subtitles(url: str | Path) -> Path: - import yt_dlp - - temp_dir = get_temp_dir() - ydl_opts = { - "outtmpl": f"{temp_dir}/subs", - "writeautomaticsub": True, - "subtitlesformat": "json3", - "skip_download": True, - } - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(url, download=True) - filename = ydl.prepare_filename(info) - dprint(f"Subtitle file saved as: {filename}") - - for root, _, files in Path(temp_dir).walk(): - for file in files: - if file.endswith(".json3"): - return Path(root).joinpath(file) - - raise ValueError("No correct json3 transcript object found.") - - -INPUT_FORMATS = ["json3", "yt", "txt"] - - -@click.command() -# TODO: Can I set it so it checks existence *only* when no youtube flag exists? -# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?) -@click.argument("file_path", type=click.Path(exists=False)) -@click.option( - "--from", - "-f", - "get_from", - type=click.Choice(INPUT_FORMATS), - default="txt", - help="Choose format to process between json transcript, yt link or txt file.", -) -@click.option( - "--prompt", - "-p", - default="Please summarize the following transcript:", - type=str, - help="Use custom prompt.", - -) -@click.option( - "--log-level", - "-l", - default=0, - help="Set log level to 1 for debug.", -) -def cli( - file_path: Path | str, - prompt: str, - log_level: int, - get_from: str, -): - """Provide summary for a file at the specified path or a youtube video at the specified url.""" - - if log_level: - global debug - debug = True - - content = "" - - # youtube link, dl transcript - if get_from == "yt": - file_path = grab_subtitles(file_path) - - file_path = cast(Path, file_path) - dprint(f"file path = {file_path}") - # load local file - with Path(file_path).open() as f: - content = f.read() - - if get_from == "json3" or get_from == "yt": - content = extract_transcript_contents(content) - - dprint(f"content = {content}") - if not content: - print("Please provide a file with valid content.") - - print(summarize_text_file(content, prompt)) - - -cached_dir: Path | None = None - - -def get_temp_dir() -> Path: - global cached_dir - if cached_dir is None: - cached_dir = Path(tempfile.mkdtemp()) - dprint(f"Created and cached temp dir {cached_dir}") - return cached_dir - - -def dprint(content: str | None): - if debug: - print(f"[DEBUG] {content}") - - -if __name__ == "__main__": - cli() diff --git a/tgpt_summary.py b/tgpt_summary.py new file mode 100755 index 0000000..8ce7869 --- /dev/null +++ b/tgpt_summary.py @@ -0,0 +1,121 @@ +import subprocess +import tempfile +from pathlib import Path +from typing import cast + +import click +import pytgpt.phind as phind + + +def summarize_text_file(content: str, prompt: str | None) -> str: + prompt = prompt or "Please summarize the following transcript:" + bot = phind.PHIND() + return bot.chat(f"{prompt} {content}") + + +def extract_transcript_contents(content: str) -> str: + jq_command = "jq '.events.[].segs | select(. != null).[].utf8'" + result = subprocess.run( + jq_command, shell=True, capture_output=True, text=True, input=content + ).stdout + # Replace newlines with spaces + result = result.replace("\n", "").split() + + # Join lines back together with newlines + processed = " ".join(result).replace('"', "").replace("\\n", "\n") + + return processed + + +def grab_subtitles(url: str | Path) -> Path: + import yt_dlp + + temp_dir = get_temp_dir() + try: + ydl_opts = { + "outtmpl": f"{temp_dir}/subs", + "writeautomaticsub": True, + "subtitlesformat": "json3", + "skip_download": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=True) + filename = ydl.prepare_filename(info) + print(f"Subtitle file saved as: {filename}") + + for root, _, files in Path(temp_dir).walk(): + for file in files: + if file.endswith(".json3"): + return Path(root).joinpath(file) + + raise ValueError("No correct json3 transcript object found.") + + except ValueError as e: + print(e) + + +def get_temp_dir() -> str: + # Create a temporary directory + temp_dir = tempfile.mkdtemp() + print(f"Creating temp dir {temp_dir}") + return temp_dir + + +def rm_dir(dir: Path | str) -> None: + # Remove the temporary directory + import shutil + + if Path(dir).is_dir(): + shutil.rmtree(dir) + + +@click.command() +# TODO: Can I set it so it checks existence *only* when no youtube flag exists? +@click.argument("file_path", type=click.Path(exists=False)) +@click.option( + "--json-transcript/--no-json-transcript", + "-j", + default=False, + help="Use downloaded json3 transcript.", +) +@click.option( + "--youtube/--no-youtube", + "-t", + default=False, + help="Get (english) transcript from youtube link.", +) +@click.option( + "--prompt", + "-p", + default="Please provide a detailed but concise summary for the following transcript:", + type=str, + help="Use custom prompt.", +) +def process_file( + file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str +): + """Provide summary for a file at the specified path or a youtube video at the specified url.""" + + content = "" + + # youtube link, dl transcript + if youtube: + file_path = grab_subtitles(file_path) + + file_path = cast(Path, file_path) + print(f"DEBUG: file path = {file_path}") + # load local file + with Path(file_path).open() as f: + content = f.read() + + if json_transcript or youtube: + content = extract_transcript_contents(content) + + if not content: + print("Please provide a file with valid content.") + + print(summarize_text_file(content, prompt)) + +if __name__ == "__main__": + process_file() diff --git a/uv.lock b/uv.lock index 2d7073d..cfef1c2 100644 --- a/uv.lock +++ b/uv.lock @@ -485,7 +485,7 @@ wheels = [ [[package]] name = "pytgpt-summarize" version = "0.1.0" -source = { editable = "." } +source = { virtual = "." } dependencies = [ { name = "click" }, { name = "python-tgpt" },