Simplify input format options

Merged the competing --json-transcript and --youtube options into a single --from option, which selects the input format.
Add debug mode
2025-01-28 08:59:59 +01:00 · 2025-01-28 08:47:41 +01:00 · 2025-01-18 14:34:26 +01:00 · 2025-01-18 14:34:21 +01:00 · 2025-01-18 14:34:02 +01:00 · 2025-01-18 10:39:56 +01:00
4 changed files with 152 additions and 122 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,3 +9,15 @@ dependencies = [
    "python-tgpt>=0.8.1",
    "yt-dlp>=2025.1.15",
 ]
+
+[project.scripts]
+summarize = "summarize:cli"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = [
+  "summarize.py",
+]
--- a/summarize.py
+++ b/summarize.py
@ -0,0 +1,139 @@
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import cast
+
+import click
+import pytgpt.phind as phind
+
+debug = False
+
+
+def summarize_text_file(content: str, prompt: str | None) -> str:
+    """Ask the Phind chat bot to summarize ``content``.
+
+    Args:
+        content: Text to summarize; it is appended to the prompt after a
+            single space.
+        prompt: Instruction placed in front of the text. A falsy value
+            (``None`` or an empty string) selects the default prompt.
+
+    Returns:
+        Whatever ``PHIND.chat`` returns for the combined message.
+    """
+    instruction = prompt if prompt else "Please summarize the following transcript:"
+    message = f"{instruction} {content}"
+    return phind.PHIND().chat(message)
+
+
+def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
+    """Extract the spoken text from a json3 transcript using ``jq``.
+
+    Args:
+        content: The transcript JSON document; it is passed to ``jq`` on stdin.
+        keep_newlines: When true, escaped backslash-n sequences in the jq
+            output become real line breaks; otherwise they become spaces.
+
+    Returns:
+        The selected ``utf8`` values as one string with every double quote
+        removed. Only jq's stdout is read; its exit status is not checked.
+    """
+    # jq filter: for each event take its `segs` value, skip events where it
+    # is null, and print the `utf8` field of every entry.
+    jq_command = "jq '.events.[].segs | select(. != null).[].utf8'"
+    result = subprocess.run(
+        jq_command, shell=True, capture_output=True, text=True, input=content
+    ).stdout
+    # Remove the line breaks between jq's output lines (they are dropped, not
+    # replaced by spaces), then split the remaining text on whitespace
+    result = result.replace("\n", "").split()
+
+    # Rejoin the words with single spaces, strip all double quotes (this also
+    # removes the quotes jq puts around each string) and turn every escaped
+    # backslash-n sequence into a real newline or a space
+    processed = (
+        " ".join(result).replace('"', "").replace("\\n", "\n" if keep_newlines else " ")
+    )
+
+    return processed
+
+
+def grab_subtitles(url: str | Path) -> Path:
+    """Fetch the subtitles for ``url`` with yt-dlp and return the json3 file.
+
+    yt-dlp is run with ``writeautomaticsub`` and ``skip_download`` enabled and
+    ``subtitlesformat`` set to ``json3``, writing into the directory returned
+    by ``get_temp_dir``. That directory is then searched recursively.
+
+    Args:
+        url: Location handed to ``YoutubeDL.extract_info``.
+
+    Returns:
+        Path of the first file ending in ``.json3`` found in the directory.
+
+    Raises:
+        ValueError: If no ``.json3`` file is found.
+    """
+    import yt_dlp
+
+    download_dir = get_temp_dir()
+    options = {
+        "outtmpl": f"{download_dir}/subs",
+        "writeautomaticsub": True,
+        "subtitlesformat": "json3",
+        "skip_download": True,
+    }
+
+    with yt_dlp.YoutubeDL(options) as downloader:
+        video_info = downloader.extract_info(url, download=True)
+        saved_name = downloader.prepare_filename(video_info)
+        dprint(f"Subtitle file saved as: {saved_name}")
+
+        # Lazily walk the directory tree; only the first match is needed.
+        json3_files = (
+            Path(folder).joinpath(name)
+            for folder, _, names in Path(download_dir).walk()
+            for name in names
+            if name.endswith(".json3")
+        )
+        first_match = next(json3_files, None)
+        if first_match is not None:
+            return first_match
+
+    raise ValueError("No correct json3 transcript object found.")
+
+
+INPUT_FORMATS = ["json3", "yt", "txt"]
+
+
+@click.command()
+# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
+# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
+@click.argument("file_path", type=click.Path(exists=False))
+@click.option(
+    "--from",
+    "-f",
+    "get_from",
+    type=click.Choice(INPUT_FORMATS),
+    default="txt",
+    help="Choose format to process between json transcript, yt link or txt file.",
+)
+@click.option(
+    "--prompt",
+    "-p",
+    default="Please summarize the following transcript:",
+    type=str,
+    help="Use custom prompt.",
+)
+@click.option(
+    "--log-level",
+    "-l",
+    default=0,
+    help="Set log level to 1 for debug.",
+)
+def cli(
+    file_path: Path | str,
+    prompt: str,
+    log_level: int,
+    get_from: str,
+):
+    """Provide summary for a file at the specified path or a youtube video at the specified url."""
+    # NOTE: the docstring above doubles as the --help text shown by click, so
+    # implementation notes live in comments instead.
+    #
+    # file_path: path of a local file, or the video url when get_from is "yt".
+    # prompt:    instruction placed in front of the text sent to the bot.
+    # log_level: any non-zero value switches on the module-wide debug output.
+    # get_from:  one of INPUT_FORMATS; selects how file_path is interpreted.
+    global debug
+
+    if log_level:
+        debug = True
+
+    # youtube link: download the transcript first, then read it like a file
+    if get_from == "yt":
+        file_path = grab_subtitles(file_path)
+
+    dprint(f"file path = {file_path}")
+    # load local file
+    content = Path(file_path).read_text()
+
+    # both json3 files and downloaded youtube transcripts are json3 documents
+    if get_from in ("json3", "yt"):
+        content = extract_transcript_contents(content)
+
+    dprint(f"content = {content}")
+    if not content:
+        # Abort instead of sending an empty transcript to the summarizer;
+        # click reports the message on stderr and exits with a non-zero status.
+        raise click.ClickException("Please provide a file with valid content.")
+
+    print(summarize_text_file(content, prompt))
+
+
+cached_dir: Path | None = None
+
+
+def get_temp_dir() -> Path:
+    """Return the temporary working directory, creating it on first use.
+
+    The directory is made with ``tempfile.mkdtemp`` and remembered in the
+    module-level ``cached_dir``, so every later call returns the same path.
+    Nothing in this function removes the directory again.
+
+    Returns:
+        Path of the cached temporary directory.
+    """
+    global cached_dir
+    if cached_dir is None:
+        cached_dir = Path(tempfile.mkdtemp())
+        # Report the creation only when it actually happens, not on reuse.
+        dprint(f"Created and cached temp dir {cached_dir}")
+    return cached_dir
+
+
+def dprint(content: str | None):
+    """Print ``content`` with a ``[DEBUG]`` prefix when debug mode is on.
+
+    Does nothing while the module-level ``debug`` flag is falsy.
+    """
+    if not debug:
+        return
+    print(f"[DEBUG] {content}")
+
+
+if __name__ == "__main__":
+    cli()
--- a/tgpt_summary.py
+++ b/tgpt_summary.py
@ -1,121 +0,0 @@
-import subprocess
-import tempfile
-from pathlib import Path
-from typing import cast
-
-import click
-import pytgpt.phind as phind
-
-
-def summarize_text_file(content: str, prompt: str | None) -> str:
-    prompt = prompt or "Please summarize the following transcript:"
-    bot = phind.PHIND()
-    return bot.chat(f"{prompt} {content}")
-
-
-def extract_transcript_contents(content: str) -> str:
-    jq_command = "jq '.events.[].segs | select(. != null).[].utf8'"
-    result = subprocess.run(
-        jq_command, shell=True, capture_output=True, text=True, input=content
-    ).stdout
-    # Replace newlines with spaces
-    result = result.replace("\n", "").split()
-
-    # Join lines back together with newlines
-    processed = " ".join(result).replace('"', "").replace("\\n", "\n")
-
-    return processed
-
-
-def grab_subtitles(url: str | Path) -> Path:
-    import yt_dlp
-
-    temp_dir = get_temp_dir()
-    try:
-        ydl_opts = {
-            "outtmpl": f"{temp_dir}/subs",
-            "writeautomaticsub": True,
-            "subtitlesformat": "json3",
-            "skip_download": True,
-        }
-
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(url, download=True)
-            filename = ydl.prepare_filename(info)
-            print(f"Subtitle file saved as: {filename}")
-
-            for root, _, files in Path(temp_dir).walk():
-                for file in files:
-                    if file.endswith(".json3"):
-                        return Path(root).joinpath(file)
-
-        raise ValueError("No correct json3 transcript object found.")
-
-    except ValueError as e:
-        print(e)
-
-
-def get_temp_dir() -> str:
-    # Create a temporary directory
-    temp_dir = tempfile.mkdtemp()
-    print(f"Creating temp dir {temp_dir}")
-    return temp_dir
-
-
-def rm_dir(dir: Path | str) -> None:
-    # Remove the temporary directory
-    import shutil
-
-    if Path(dir).is_dir():
-        shutil.rmtree(dir)
-
-
-@click.command()
-# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
-@click.argument("file_path", type=click.Path(exists=False))
-@click.option(
-    "--json-transcript/--no-json-transcript",
-    "-j",
-    default=False,
-    help="Use downloaded json3 transcript.",
-)
-@click.option(
-    "--youtube/--no-youtube",
-    "-t",
-    default=False,
-    help="Get (english) transcript from youtube link.",
-)
-@click.option(
-    "--prompt",
-    "-p",
-    default="Please provide a detailed but concise summary for the following transcript:",
-    type=str,
-    help="Use custom prompt.",
-)
-def process_file(
-    file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str
-):
-    """Provide summary for a file at the specified path or a youtube video at the specified url."""
-
-    content = ""
-
-    # youtube link, dl transcript
-    if youtube:
-        file_path = grab_subtitles(file_path)
-
-    file_path = cast(Path, file_path)
-    print(f"DEBUG: file path = {file_path}")
-    # load local file
-    with Path(file_path).open() as f:
-        content = f.read()
-
-    if json_transcript or youtube:
-        content = extract_transcript_contents(content)
-
-    if not content:
-        print("Please provide a file with valid content.")
-
-    print(summarize_text_file(content, prompt))
-
-if __name__ == "__main__":
-    process_file()
--- a/uv.lock
+++ b/uv.lock
@ -485,7 +485,7 @@ wheels = [
 [[package]]
 name = "pytgpt-summarize"
 version = "0.1.0"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
    { name = "click" },
    { name = "python-tgpt" },
Author	SHA1	Message	Date
Marty Oehme	423b83abb9	Simplify input format options Simplified the competing options json and youtube into --from which can then select format.	2025-01-28 08:59:59 +01:00
Marty Oehme	260c10b5cd	Add debug mode	2025-01-28 08:47:41 +01:00
Marty Oehme	bf3c298034	Format	2025-01-18 14:34:26 +01:00
Marty Oehme	d6bc9e6728	Do not except any raised error	2025-01-18 14:34:21 +01:00
Marty Oehme	4df67bd475	Default to single line transcript	2025-01-18 14:34:02 +01:00
Marty Oehme	5be71fc7da	Use hatch build system to add cli entrypoint	2025-01-18 10:39:56 +01:00
Marty Oehme	d96a9bb616	Rename script to summarize.py	2025-01-18 10:27:33 +01:00