Simplify input format options

Merged the competing --json-transcript and --youtube flags into a single --from option, which selects the input format (json3, yt or txt).
Add debug mode
2025-01-28 08:59:59 +01:00 · 2025-01-28 08:47:41 +01:00 · 2025-01-18 14:34:26 +01:00 · 2025-01-18 14:34:21 +01:00 · 2025-01-18 14:34:02 +01:00 · 2025-01-18 10:39:56 +01:00
4 changed files with 152 additions and 122 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,3 +9,15 @@ dependencies = [
    "python-tgpt>=0.8.1",
    "yt-dlp>=2025.1.15",
 ]
 [project.scripts]
 summarize = "summarize:cli"
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = [
  "summarize.py",
 ]
--- a/summarize.py
+++ b/summarize.py
@ -0,0 +1,139 @@
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import cast
 import click
 import pytgpt.phind as phind
 # Module-wide debug switch: cli() sets it to True when a non-zero log level
 # is given, and dprint() only emits output while it is truthy.
 debug = False
 def summarize_text_file(content: str, prompt: str | None) -> str:
    """Send *content* to the PHIND chat bot and return its reply.

    Args:
        content: Text handed to the bot; it is appended after the prompt,
            separated by a single space.
        prompt: Instruction placed in front of the content. A falsy value
            (``None`` or an empty string) selects the default summary prompt.

    Returns:
        Whatever ``phind.PHIND().chat`` returns for the combined message
        (annotated as ``str``).
    """
    if not prompt:
        prompt = "Please summarize the following transcript:"
    chat_bot = phind.PHIND()
    return chat_bot.chat(f"{prompt} {content}")
 def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
    """Extract the plain text from a json3 transcript document.

    Collects the ``utf8`` string of every entry in every event's ``segs``
    list (events without ``segs`` are skipped), concatenates them in document
    order and normalises whitespace.

    The document is parsed in-process with ``json`` rather than piped through
    an external ``jq`` binary via the shell: that removes the dependency on a
    sufficiently recent ``jq`` being installed, avoids ``shell=True``, and
    lets the JSON decoder handle string escapes instead of undoing them with
    ad-hoc string replacement.

    Args:
        content: The transcript as a JSON string.
        keep_newlines: When true, line breaks contained in the transcript
            text are preserved; otherwise they are folded into single spaces.

    Returns:
        The transcript text with runs of whitespace collapsed to one space.
        An empty string is returned when *content* is not valid JSON or does
        not contain an ``events`` list, so callers can keep treating "no
        content" as the failure signal.
    """
    import json

    try:
        document = json.loads(content)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; unusable input yields no text.
        return ""
    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""
    pieces = []
    for event in events:
        segments = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segments, list):
            continue
        for segment in segments:
            text = segment.get("utf8") if isinstance(segment, dict) else None
            if isinstance(text, str):
                pieces.append(text)
    transcript = "".join(pieces)
    if keep_newlines:
        # Collapse whitespace per line so the original line breaks survive.
        return "\n".join(" ".join(line.split()) for line in transcript.split("\n"))
    return " ".join(transcript.split())
 def grab_subtitles(url: str | Path) -> Path:
    """Fetch the subtitles for *url* with yt-dlp and return the json3 file.

    yt-dlp is configured through the options below (automatic subtitles,
    ``json3`` format, ``skip_download``) and writes into the directory
    provided by ``get_temp_dir()``. That directory tree is then searched for
    the first file whose name ends in ``.json3``.

    Args:
        url: Location passed straight to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found below the temp directory.

    Raises:
        ValueError: If no ``.json3`` file exists after the yt-dlp run.
    """
    import yt_dlp
    workdir = get_temp_dir()
    options = {
        "outtmpl": f"{workdir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        saved_name = downloader.prepare_filename(video_info)
        dprint(f"Subtitle file saved as: {saved_name}")
        # NOTE(review): Path.walk() only exists on Python 3.12 and newer.
        for folder, _, names in Path(workdir).walk():
            transcript = next((name for name in names if name.endswith(".json3")), None)
            if transcript is not None:
                return Path(folder).joinpath(transcript)
    raise ValueError("No correct json3 transcript object found.")
 # Values accepted by the --from option of cli(): "yt" downloads subtitles
 # first, "json3" and "yt" are run through extract_transcript_contents(),
 # and "txt" (the option's default) is summarized as read.
 INPUT_FORMATS = ["json3", "yt", "txt"]
@click.command()
 # TODO: Can I set it so it checks existence *only* when no youtube flag exists?
 # TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--from",
    "-f",
    "get_from",
    type=click.Choice(INPUT_FORMATS),
    default="txt",
    help="Choose format to process between json transcript, yt link or txt file.",
 )
@click.option(
    "--prompt",
    "-p",
    default="Please summarize the following transcript:",
    type=str,
    help="Use custom prompt.",
 )
@click.option(
    "--log-level",
    "-l",
    default=0,
    help="Set log level to 1 for debug.",
 )
 def cli(
    file_path: Path | str,
    prompt: str,
    log_level: int,
    get_from: str,
 ):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    if log_level:
        global debug
        debug = True
    content = ""
    # youtube link, dl transcript
    if get_from == "yt":
        file_path = grab_subtitles(file_path)
    file_path = cast(Path, file_path)
    dprint(f"file path = {file_path}")
    # load local file
    with Path(file_path).open() as f:
        content = f.read()
    if get_from == "json3" or get_from == "yt":
        content = extract_transcript_contents(content)
    dprint(f"content = {content}")
    if not content:
        print("Please provide a file with valid content.")
    print(summarize_text_file(content, prompt))
 # Temp directory shared by every get_temp_dir() call in this process;
 # stays None until the first call creates it.
 cached_dir: Path | None = None
 def get_temp_dir() -> Path:
    """Return the process-wide temporary directory, creating it on first use.

    The directory is created with ``tempfile.mkdtemp()`` on the first call
    and remembered in the module-level ``cached_dir``; later calls return the
    same path. Nothing in this function removes the directory again.

    Returns:
        Path of the cached temporary directory.
    """
    global cached_dir
    if cached_dir is None:
        cached_dir = Path(tempfile.mkdtemp())
        # Log only when a directory was really created; the message used to
        # be emitted on every call, including cache hits.
        dprint(f"Created and cached temp dir {cached_dir}")
    return cached_dir
 def dprint(content: str | None):
    """Print *content* prefixed with ``[DEBUG]`` while the module-level ``debug`` flag is truthy.

    Args:
        content: Value interpolated into the debug line; nothing is printed
            when debugging is switched off.
    """
    if not debug:
        return
    print(f"[DEBUG] {content}")
 # Invoke the click command when this module is executed directly as a script.
 if __name__ == "__main__":
    cli()
--- a/tgpt_summary.py
+++ b/tgpt_summary.py
@ -1,121 +0,0 @@
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import cast
 import click
 import pytgpt.phind as phind
 def summarize_text_file(content: str, prompt: str | None) -> str:
    """Send *content* to the PHIND chat bot and return its reply.

    Args:
        content: Text handed to the bot, appended after the prompt.
        prompt: Instruction placed in front of the content; a falsy value
            selects the default summary prompt.

    Returns:
        Whatever ``phind.PHIND().chat`` returns for the combined message
        (annotated as ``str``).
    """
    instruction = prompt if prompt else "Please summarize the following transcript:"
    message = f"{instruction} {content}"
    return phind.PHIND().chat(message)
 def extract_transcript_contents(content: str) -> str:
    """Extract the plain text from a json3 transcript document.

    Collects the ``utf8`` string of every entry in every event's ``segs``
    list (events without ``segs`` are skipped), concatenates them in document
    order and normalises whitespace while keeping the transcript's own line
    breaks.

    The document is parsed in-process with ``json`` rather than piped through
    an external ``jq`` binary via the shell: that removes the dependency on
    ``jq`` being installed, avoids ``shell=True``, and lets the JSON decoder
    handle string escapes instead of undoing them with string replacement.

    Args:
        content: The transcript as a JSON string.

    Returns:
        The transcript text with runs of whitespace collapsed to one space
        per line. An empty string is returned when *content* is not valid
        JSON or does not contain an ``events`` list.
    """
    import json

    try:
        document = json.loads(content)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; unusable input yields no text.
        return ""
    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""
    pieces = []
    for event in events:
        segments = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segments, list):
            continue
        for segment in segments:
            text = segment.get("utf8") if isinstance(segment, dict) else None
            if isinstance(text, str):
                pieces.append(text)
    transcript = "".join(pieces)
    # Collapse whitespace per line so the original line breaks survive.
    return "\n".join(" ".join(line.split()) for line in transcript.split("\n"))
 def grab_subtitles(url: str | Path) -> Path:
    """Fetch the subtitles for *url* with yt-dlp and return the json3 file.

    yt-dlp is configured through the options below (automatic subtitles,
    ``json3`` format, ``skip_download``) and writes into a directory obtained
    from ``get_temp_dir()``. That directory tree is then searched for the
    first file whose name ends in ``.json3``.

    Args:
        url: Location passed straight to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found below the temp directory.

    Raises:
        ValueError: If no ``.json3`` file exists after the yt-dlp run. The
            error is no longer caught and printed here: doing so made the
            function return ``None`` despite its ``Path`` return type, and
            the caller then failed later with an unrelated error.
    """
    import yt_dlp
    temp_dir = get_temp_dir()
    ydl_opts = {
        "outtmpl": f"{temp_dir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
        print(f"Subtitle file saved as: {filename}")
        # NOTE(review): Path.walk() only exists on Python 3.12 and newer.
        for root, _, files in Path(temp_dir).walk():
            for file in files:
                if file.endswith(".json3"):
                    return Path(root).joinpath(file)
    raise ValueError("No correct json3 transcript object found.")
 def get_temp_dir() -> str:
    """Create a new temporary directory, announce it on stdout and return its path.

    Returns:
        The path produced by ``tempfile.mkdtemp()``; every call creates a
        separate directory.
    """
    created = tempfile.mkdtemp()
    print(f"Creating temp dir {created}")
    return created
 def rm_dir(dir: Path | str) -> None:
    # Remove the temporary directory
    import shutil
    if Path(dir).is_dir():
        shutil.rmtree(dir)
@click.command()
 # TODO: Can I set it so it checks existence *only* when no youtube flag exists?
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--json-transcript/--no-json-transcript",
    "-j",
    default=False,
    help="Use downloaded json3 transcript.",
 )
@click.option(
    "--youtube/--no-youtube",
    "-t",
    default=False,
    help="Get (english) transcript from youtube link.",
 )
@click.option(
    "--prompt",
    "-p",
    default="Please provide a detailed but concise summary for the following transcript:",
    type=str,
    help="Use custom prompt.",
 )
 def process_file(
    file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str
 ):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    content = ""
    # youtube link, dl transcript
    if youtube:
        file_path = grab_subtitles(file_path)
    file_path = cast(Path, file_path)
    print(f"DEBUG: file path = {file_path}")
    # load local file
    with Path(file_path).open() as f:
        content = f.read()
    if json_transcript or youtube:
        content = extract_transcript_contents(content)
    if not content:
        print("Please provide a file with valid content.")
    print(summarize_text_file(content, prompt))
 # Invoke the click command when this module is executed directly as a script.
 if __name__ == "__main__":
    process_file()
--- a/uv.lock
+++ b/uv.lock
@ -485,7 +485,7 @@ wheels = [
 [[package]]
 name = "pytgpt-summarize"
 version = "0.1.0"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
    { name = "click" },
    { name = "python-tgpt" },
Author	SHA1	Message	Date
Marty Oehme	423b83abb9	Simplify input format options Simplified the competing options json and youtube into --from which can then select format.	2025-01-28 08:59:59 +01:00
Marty Oehme	260c10b5cd	Add debug mode	2025-01-28 08:47:41 +01:00
Marty Oehme	bf3c298034	Format	2025-01-18 14:34:26 +01:00
Marty Oehme	d6bc9e6728	Do not except any raised error	2025-01-18 14:34:21 +01:00
Marty Oehme	4df67bd475	Default to single line transcript	2025-01-18 14:34:02 +01:00
Marty Oehme	5be71fc7da	Use hatch build system to add cli entrypoint	2025-01-18 10:39:56 +01:00
Marty Oehme	d96a9bb616	Rename script to summarize.py	2025-01-18 10:27:33 +01:00