From d96a9bb616f676d931c9b0faafb05f4f45348a24 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 18 Jan 2025 10:27:33 +0100 Subject: [PATCH 1/7] Rename script to summarize.py --- tgpt_summary.py => summarize.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tgpt_summary.py => summarize.py (100%) diff --git a/tgpt_summary.py b/summarize.py similarity index 100% rename from tgpt_summary.py rename to summarize.py From 5be71fc7da8707274e9c243876508c3972434099 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 18 Jan 2025 10:39:56 +0100 Subject: [PATCH 2/7] Use hatch build system to add cli entrypoint --- pyproject.toml | 12 ++++++++++++ summarize.py | 4 ++-- uv.lock | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bc886ae..75daa03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,3 +9,15 @@ dependencies = [ "python-tgpt>=0.8.1", "yt-dlp>=2025.1.15", ] + +[project.scripts] +summarize = "summarize:cli" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = [ + "summarize.py", +] diff --git a/summarize.py b/summarize.py index 8ce7869..11e7748 100755 --- a/summarize.py +++ b/summarize.py @@ -92,7 +92,7 @@ def rm_dir(dir: Path | str) -> None: type=str, help="Use custom prompt.", ) -def process_file( +def cli( file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str ): """Provide summary for a file at the specified path or a youtube video at the specified url.""" @@ -118,4 +118,4 @@ def process_file( print(summarize_text_file(content, prompt)) if __name__ == "__main__": - process_file() + cli() diff --git a/uv.lock b/uv.lock index cfef1c2..2d7073d 100644 --- a/uv.lock +++ b/uv.lock @@ -485,7 +485,7 @@ wheels = [ [[package]] name = "pytgpt-summarize" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "click" }, { name = "python-tgpt" }, From 4df67bd475122ef7cc67d7f55fd796edd0e26f42 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 18 Jan 2025 14:34:02 +0100 Subject: [PATCH 3/7] Default to single line transcript --- summarize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/summarize.py b/summarize.py index 11e7748..45d5072 100755 --- a/summarize.py +++ b/summarize.py @@ -13,7 +13,7 @@ def summarize_text_file(content: str, prompt: str | None) -> str: return bot.chat(f"{prompt} {content}") -def extract_transcript_contents(content: str) -> str: +def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str: jq_command = "jq '.events.[].segs | select(. != null).[].utf8'" result = subprocess.run( jq_command, shell=True, capture_output=True, text=True, input=content @@ -22,7 +22,9 @@ def extract_transcript_contents(content: str) -> str: result = result.replace("\n", "").split() # Join lines back together with newlines - processed = " ".join(result).replace('"', "").replace("\\n", "\n") + processed = ( + " ".join(result).replace('"', "").replace("\\n", "\n" if keep_newlines else " ") + ) return processed From d6bc9e6728916cb0afe3b376bd0078242f201acc Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 18 Jan 2025 14:34:21 +0100 Subject: [PATCH 4/7] Do not except any raised error --- summarize.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/summarize.py b/summarize.py index 45d5072..d3b6528 100755 --- a/summarize.py +++ b/summarize.py @@ -33,28 +33,24 @@ def grab_subtitles(url: str | Path) -> Path: import yt_dlp temp_dir = get_temp_dir() - try: - ydl_opts = { - "outtmpl": f"{temp_dir}/subs", - "writeautomaticsub": True, - "subtitlesformat": "json3", - "skip_download": True, - } + ydl_opts = { + "outtmpl": f"{temp_dir}/subs", + "writeautomaticsub": True, + "subtitlesformat": "json3", + "skip_download": True, + } - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(url, download=True) - filename = ydl.prepare_filename(info) - print(f"Subtitle file saved as: {filename}") + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=True) + filename = ydl.prepare_filename(info) + print(f"Subtitle file saved as: {filename}") - for root, _, files in Path(temp_dir).walk(): - for file in files: - if file.endswith(".json3"): - return Path(root).joinpath(file) + for root, _, files in Path(temp_dir).walk(): + for file in files: + if file.endswith(".json3"): + return Path(root).joinpath(file) - raise ValueError("No correct json3 transcript object found.") - - except ValueError as e: - print(e) + raise ValueError("No correct json3 transcript object found.") def get_temp_dir() -> str: From bf3c29803461e8853beb520b70a3c090b737fc0e Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Sat, 18 Jan 2025 14:34:26 +0100 Subject: [PATCH 5/7] Format --- summarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/summarize.py b/summarize.py index d3b6528..73f1432 100755 --- a/summarize.py +++ b/summarize.py @@ -90,9 +90,7 @@ def rm_dir(dir: Path | str) -> None: type=str, help="Use custom prompt.", ) -def cli( - file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str -): +def cli(file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str): """Provide summary for a file at the specified path or a youtube video at the specified url.""" content = "" @@ -110,10 +108,12 @@ def cli( if json_transcript or youtube: content = extract_transcript_contents(content) + print(f"DEBUG: content = {content}") if not content: print("Please provide a file with valid content.") print(summarize_text_file(content, prompt)) + if __name__ == "__main__": cli() From 260c10b5cd41930a7950291187a381553b55e7e0 Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 28 Jan 2025 08:47:41 +0100 Subject: [PATCH 6/7] Add debug mode --- summarize.py | 62 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/summarize.py b/summarize.py index 73f1432..9ae3a45 100755 --- a/summarize.py +++ b/summarize.py @@ -6,11 +6,14 @@ from typing import cast import click import pytgpt.phind as phind +debug = False + def summarize_text_file(content: str, prompt: str | None) -> str: prompt = prompt or "Please summarize the following transcript:" bot = phind.PHIND() - return bot.chat(f"{prompt} {content}") + res = bot.chat(f"{prompt} {content}") + return res def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str: @@ -43,7 +46,7 @@ def grab_subtitles(url: str | Path) -> Path: with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, download=True) filename = ydl.prepare_filename(info) - print(f"Subtitle file saved as: {filename}") + dprint(f"Subtitle file saved as: {filename}") for root, _, files in Path(temp_dir).walk(): for file in files: @@ -53,21 +56,6 @@ def grab_subtitles(url: str | Path) -> Path: raise ValueError("No correct json3 transcript object found.") -def get_temp_dir() -> str: - # Create a temporary directory - temp_dir = tempfile.mkdtemp() - print(f"Creating temp dir {temp_dir}") - return temp_dir - - -def rm_dir(dir: Path | str) -> None: - # Remove the temporary directory - import shutil - - if Path(dir).is_dir(): - shutil.rmtree(dir) - - @click.command() # TODO: Can I set it so it checks existence *only* when no youtube flag exists? @click.argument("file_path", type=click.Path(exists=False)) @@ -86,13 +74,29 @@ def rm_dir(dir: Path | str) -> None: @click.option( "--prompt", "-p", - default="Please provide a detailed but concise summary for the following transcript:", + default="Please summarize the following transcript:", type=str, help="Use custom prompt.", ) -def cli(file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str): +@click.option( + "--log-level", + "-l", + default=0, + help="Set log level to 1 for debug.", +) +def cli( + file_path: Path | str, + json_transcript: bool, + youtube: bool, + prompt: str, + log_level: int, +): """Provide summary for a file at the specified path or a youtube video at the specified url.""" + if log_level: + global debug + debug = True + content = "" # youtube link, dl transcript @@ -100,7 +104,7 @@ def cli(file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str file_path = grab_subtitles(file_path) file_path = cast(Path, file_path) - print(f"DEBUG: file path = {file_path}") + dprint(f"file path = {file_path}") # load local file with Path(file_path).open() as f: content = f.read() @@ -108,12 +112,28 @@ def cli(file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str if json_transcript or youtube: content = extract_transcript_contents(content) - print(f"DEBUG: content = {content}") + dprint(f"content = {content}") if not content: print("Please provide a file with valid content.") print(summarize_text_file(content, prompt)) +cached_dir: Path | None = None + + +def get_temp_dir() -> Path: + global cached_dir + if cached_dir is None: + cached_dir = Path(tempfile.mkdtemp()) + dprint(f"Created and cached temp dir {cached_dir}") + return cached_dir + + +def dprint(content: str | None): + if debug: + print(f"[DEBUG] {content}") + + if __name__ == "__main__": cli() From 423b83abb9d32073d7862efd8bb480309c562b1d Mon Sep 17 00:00:00 2001 From: Marty Oehme Date: Tue, 28 Jan 2025 08:59:59 +0100 Subject: [PATCH 7/7] Simplify input format options Simplified the competing options json and youtube into --from which can then select format. --- summarize.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/summarize.py b/summarize.py index 9ae3a45..88f8621 100755 --- a/summarize.py +++ b/summarize.py @@ -56,20 +56,20 @@ def grab_subtitles(url: str | Path) -> Path: raise ValueError("No correct json3 transcript object found.") +INPUT_FORMATS = ["json3", "yt", "txt"] + + @click.command() # TODO: Can I set it so it checks existence *only* when no youtube flag exists? +# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?) @click.argument("file_path", type=click.Path(exists=False)) @click.option( - "--json-transcript/--no-json-transcript", - "-j", - default=False, - help="Use downloaded json3 transcript.", -) -@click.option( - "--youtube/--no-youtube", - "-t", - default=False, - help="Get (english) transcript from youtube link.", + "--from", + "-f", + "get_from", + type=click.Choice(INPUT_FORMATS), + default="txt", + help="Choose format to process between json transcript, yt link or txt file.", ) @click.option( "--prompt", @@ -77,6 +77,7 @@ def grab_subtitles(url: str | Path) -> Path: default="Please summarize the following transcript:", type=str, help="Use custom prompt.", + ) @click.option( "--log-level", @@ -86,10 +87,9 @@ def grab_subtitles(url: str | Path) -> Path: ) def cli( file_path: Path | str, - json_transcript: bool, - youtube: bool, prompt: str, log_level: int, + get_from: str, ): """Provide summary for a file at the specified path or a youtube video at the specified url.""" @@ -100,7 +100,7 @@ def cli( content = "" # youtube link, dl transcript - if youtube: + if get_from == "yt": file_path = grab_subtitles(file_path) file_path = cast(Path, file_path) @@ -109,7 +109,7 @@ def cli( with Path(file_path).open() as f: content = f.read() - if json_transcript or youtube: + if get_from == "json3" or get_from == "yt": content = extract_transcript_contents(content) dprint(f"content = {content}")