diff --git a/pyproject.toml b/pyproject.toml
index bc886ae..75daa03 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,15 @@ dependencies = [
     "python-tgpt>=0.8.1",
     "yt-dlp>=2025.1.15",
 ]
+
+[project.scripts]
+summarize = "summarize:cli"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = [
+    "summarize.py",
+]
diff --git a/summarize.py b/summarize.py
new file mode 100755
index 0000000..88f8621
--- /dev/null
+++ b/summarize.py
@@ -0,0 +1,187 @@
+"""Summarize a text file, a json3 transcript or a YouTube video's subtitles."""
+
+import json
+import shutil
+import tempfile
+from pathlib import Path
+
+import click
+import pytgpt.phind as phind
+
+# Toggled by the --log-level option of cli(); read by dprint().
+debug = False
+
+
+def summarize_text_file(content: str, prompt: str | None) -> str:
+    """Ask the Phind bot to summarize *content* and return its reply.
+
+    A falsy *prompt* falls back to the default summarization prompt.
+    """
+    prompt = prompt or "Please summarize the following transcript:"
+    bot = phind.PHIND()
+    return bot.chat(f"{prompt} {content}")
+
+
+def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
+    """Return the text contained in a json3 transcript document.
+
+    The ``utf8`` value of every segment of every event is concatenated in
+    document order and runs of whitespace are collapsed to single spaces.
+    With *keep_newlines* the line breaks found in the segments are kept;
+    otherwise they are treated like any other whitespace.
+
+    Raises:
+        ValueError: if *content* is not valid JSON or not a JSON object.
+    """
+    # Parsed with the standard library instead of shelling out to ``jq``: a
+    # missing or failing ``jq`` used to produce an empty transcript silently,
+    # and stripping every double quote from its output mangled quoted text.
+    data = json.loads(content)  # json.JSONDecodeError is a ValueError
+    if not isinstance(data, dict):
+        raise ValueError("json3 transcript must be a JSON object.")
+
+    pieces = []
+    for event in data.get("events") or []:
+        # Events without a "segs" list carry no text.
+        segs = event.get("segs") if isinstance(event, dict) else None
+        for seg in segs or []:
+            text = seg.get("utf8") if isinstance(seg, dict) else None
+            if isinstance(text, str):
+                pieces.append(text)
+    joined = "".join(pieces)
+
+    if not keep_newlines:
+        return " ".join(joined.split())
+    lines = (" ".join(line.split()) for line in joined.split("\n"))
+    return "\n".join(lines).strip()
+
+
+def grab_subtitles(url: str | Path) -> Path:
+    """Download the automatic json3 subtitles for *url* and return their path.
+
+    The file is written below the directory returned by get_temp_dir().
+
+    Raises:
+        ValueError: if no ``.json3`` file was produced.
+    """
+    import yt_dlp
+
+    temp_dir = get_temp_dir()
+    ydl_opts = {
+        "outtmpl": f"{temp_dir}/subs",
+        "writeautomaticsub": True,
+        "subtitlesformat": "json3",
+        "skip_download": True,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(str(url), download=True)
+        filename = ydl.prepare_filename(info)
+        dprint(f"Subtitle file saved as: {filename}")
+
+    # rglob() rather than Path.walk(), which only exists on Python 3.12+.
+    for candidate in temp_dir.rglob("*.json3"):
+        if candidate.is_file():
+            return candidate
+
+    raise ValueError("No correct json3 transcript object found.")
+
+
+INPUT_FORMATS = ["json3", "yt", "txt"]
+
+
+@click.command()
+# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
+# exists=False because FILE_PATH may be a URL; cli() checks local paths itself.
+@click.argument("file_path", type=click.Path(exists=False))
+@click.option(
+    "--from",
+    "-f",
+    "get_from",
+    type=click.Choice(INPUT_FORMATS),
+    default="txt",
+    help="Choose format to process between json transcript, yt link or txt file.",
+)
+@click.option(
+    "--prompt",
+    "-p",
+    default="Please summarize the following transcript:",
+    type=str,
+    help="Use custom prompt.",
+)
+@click.option(
+    "--log-level",
+    "-l",
+    default=0,
+    help="Set log level to 1 for debug.",
+)
+def cli(
+    file_path: Path | str,
+    prompt: str,
+    log_level: int,
+    get_from: str,
+):
+    """Provide summary for a file at the specified path or a youtube video at the specified url."""
+
+    if log_level:
+        global debug
+        debug = True
+
+    try:
+        if get_from == "yt":
+            # youtube link, dl transcript
+            file_path = grab_subtitles(file_path)
+        elif not Path(file_path).is_file():
+            raise click.BadParameter(
+                f"'{file_path}' is not an existing file.", param_hint="FILE_PATH"
+            )
+
+        dprint(f"file path = {file_path}")
+        # load local file
+        content = Path(file_path).read_text(encoding="utf-8")
+
+        if get_from in ("json3", "yt"):
+            content = extract_transcript_contents(content)
+    except ValueError as err:
+        # Missing subtitles or an unreadable transcript: report without a traceback.
+        raise click.ClickException(str(err)) from err
+    finally:
+        # The downloaded subtitles are no longer needed once read.
+        remove_temp_dir()
+
+    dprint(f"content = {content}")
+    if not content.strip():
+        # Stop here instead of sending an empty transcript to the bot.
+        raise click.ClickException("Please provide a file with valid content.")
+
+    print(summarize_text_file(content, prompt))
+
+
+cached_dir: Path | None = None
+
+
+def get_temp_dir() -> Path:
+    """Return the run's temporary directory, creating it on first use."""
+    global cached_dir
+    if cached_dir is None:
+        cached_dir = Path(tempfile.mkdtemp())
+        dprint(f"Created and cached temp dir {cached_dir}")
+    return cached_dir
+
+
+def remove_temp_dir() -> None:
+    """Delete the cached temporary directory, if one was created."""
+    global cached_dir
+    if cached_dir is not None:
+        shutil.rmtree(cached_dir, ignore_errors=True)
+        cached_dir = None
+
+
+def dprint(content: str | None):
+    """Print *content* with a ``[DEBUG]`` prefix when debug output is on."""
+    if debug:
+        print(f"[DEBUG] {content}")
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/tgpt_summary.py b/tgpt_summary.py
deleted file mode 100755
index 8ce7869..0000000
--- a/tgpt_summary.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import subprocess
-import tempfile
-from pathlib import Path
-from typing import cast
-
-import click
-import pytgpt.phind as phind
-
-
-def summarize_text_file(content: str, prompt: str | None) -> str:
-    prompt = prompt or "Please summarize the following transcript:"
-    bot = phind.PHIND()
-    return bot.chat(f"{prompt} {content}")
-
-
-def extract_transcript_contents(content: str) -> str:
-    jq_command = "jq '.events.[].segs | select(. != null).[].utf8'"
-    result = subprocess.run(
-        jq_command, shell=True, capture_output=True, text=True, input=content
-    ).stdout
-    # Replace newlines with spaces
-    result = result.replace("\n", "").split()
-
-    # Join lines back together with newlines
-    processed = " ".join(result).replace('"', "").replace("\\n", "\n")
-
-    return processed
-
-
-def grab_subtitles(url: str | Path) -> Path:
-    import yt_dlp
-
-    temp_dir = get_temp_dir()
-    try:
-        ydl_opts = {
-            "outtmpl": f"{temp_dir}/subs",
-            "writeautomaticsub": True,
-            "subtitlesformat": "json3",
-            "skip_download": True,
-        }
-
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(url, download=True)
-            filename = ydl.prepare_filename(info)
-            print(f"Subtitle file saved as: {filename}")
-
-        for root, _, files in Path(temp_dir).walk():
-            for file in files:
-                if file.endswith(".json3"):
-                    return Path(root).joinpath(file)
-
-        raise ValueError("No correct json3 transcript object found.")
-
-    except ValueError as e:
-        print(e)
-
-
-def get_temp_dir() -> str:
-    # Create a temporary directory
-    temp_dir = tempfile.mkdtemp()
-    print(f"Creating temp dir {temp_dir}")
-    return temp_dir
-
-
-def rm_dir(dir: Path | str) -> None:
-    # Remove the temporary directory
-    import shutil
-
-    if Path(dir).is_dir():
-        shutil.rmtree(dir)
-
-
-@click.command()
-# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
-@click.argument("file_path", type=click.Path(exists=False))
-@click.option(
-    "--json-transcript/--no-json-transcript",
-    "-j",
-    default=False,
-    help="Use downloaded json3 transcript.",
-)
-@click.option(
-    "--youtube/--no-youtube",
-    "-t",
-    default=False,
-    help="Get (english) transcript from youtube link.",
-)
-@click.option(
-    "--prompt",
-    "-p",
-    default="Please provide a detailed but concise summary for the following transcript:",
-    type=str,
-    help="Use custom prompt.",
-)
-def process_file(
-    file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str
-):
-    """Provide summary for a file at the specified path or a youtube video at the specified url."""
-
-    content = ""
-
-    # youtube link, dl transcript
-    if youtube:
-        file_path = grab_subtitles(file_path)
-
-    file_path = cast(Path, file_path)
-    print(f"DEBUG: file path = {file_path}")
-    # load local file
-    with Path(file_path).open() as f:
-        content = f.read()
-
-    if json_transcript or youtube:
-        content = extract_transcript_contents(content)
-
-    if not content:
-        print("Please provide a file with valid content.")
-
-    print(summarize_text_file(content, prompt))
-
-if __name__ == "__main__":
-    process_file()
diff --git a/uv.lock b/uv.lock
index cfef1c2..2d7073d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -485,7 +485,7 @@ wheels = [
 [[package]]
 name = "pytgpt-summarize"
 version = "0.1.0"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
     { name = "click" },
     { name = "python-tgpt" },