# simple-summary/summarize.py

import subprocess
import tempfile
from pathlib import Path
from typing import cast

import click
import pytgpt.phind as phind

# Module-wide debug switch: cli() sets it to True when a non-zero --log-level
# is given, and dprint() only prints while it is set.
debug = False


def summarize_text_file(content: str, prompt: str | None) -> str:
    """Ask the Phind chat bot to summarize ``content``.

    Args:
        content: Text to be summarized.
        prompt: Instruction placed in front of the text. A falsy value
            (``None`` or ``""``) selects the default summarization prompt.

    Returns:
        Whatever ``PHIND.chat`` returns for the combined message.
    """
    instruction = prompt if prompt else "Please summarize the following transcript:"
    # A fresh bot is built for every call; the message is the instruction,
    # one space, then the text.
    return phind.PHIND().chat(f"{instruction} {content}")


def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
    """Pull the caption text out of a ``json3`` transcript document.

    Every ``events[].segs[].utf8`` string is concatenated in document order.
    Within each line, runs of whitespace are collapsed to a single space;
    line breaks are then either kept or turned into spaces.

    Args:
        content: Raw JSON text of the transcript.
        keep_newlines: Keep the transcript's line breaks when True; replace
            each of them with a single space when False.

    Returns:
        The flattened transcript text, or ``""`` when ``content`` is not
        valid JSON or contains no text segments.
    """
    # Parsed in-process instead of piping through a `jq` shell command: that
    # pipeline silently produced "" when jq was missing or rejected the
    # filter, emitted the word "null" for segments without text, and
    # stripped every double quote from the transcript.
    import json

    try:
        document = json.loads(content)
    except json.JSONDecodeError:
        # Callers treat an empty result as "no valid content".
        return ""

    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        segs = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segs, list):
            # Events without a segment list carry no caption text.
            continue
        for seg in segs:
            text = seg.get("utf8") if isinstance(seg, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    # Segments carry their own leading spaces, so they are concatenated
    # without a separator before whitespace is normalised line by line.
    lines = [" ".join(line.split()) for line in "".join(pieces).split("\n")]
    separator = "\n" if keep_newlines else " "
    return separator.join(lines)


def grab_subtitles(url: str | Path) -> Path:
    """Fetch automatic subtitles for ``url`` and return the json3 file path.

    yt_dlp is asked to write automatic subtitles in ``json3`` format, with
    the media download skipped, under the directory from ``get_temp_dir()``.
    That directory is then walked and the first ``.json3`` file encountered
    is returned.

    Args:
        url: Location handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found under the temp directory.

    Raises:
        ValueError: If no ``.json3`` file is found.
    """
    # Imported here rather than at module level, so yt_dlp is only loaded
    # when subtitles are actually requested.
    import yt_dlp

    download_dir = get_temp_dir()
    options = dict(
        outtmpl=f"{download_dir}/subs",
        writeautomaticsub=True,
        subtitlesformat="json3",
        skip_download=True,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        metadata = downloader.extract_info(url, download=True)
        saved_as = downloader.prepare_filename(metadata)
        dprint(f"Subtitle file saved as: {saved_as}")

        # The subtitle file is located by its suffix rather than by the
        # name reported above.
        for directory, _subdirs, names in Path(download_dir).walk():
            hit = next((name for name in names if name.endswith(".json3")), None)
            if hit is not None:
                return Path(directory).joinpath(hit)

    raise ValueError("No correct json3 transcript object found.")


# Input kinds accepted by the --from/-f option: a json3 transcript file,
# a YouTube link ("yt"), or a plain text file ("txt").
INPUT_FORMATS = ["json3", "yt", "txt"]


@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--from",
    "-f",
    "get_from",
    type=click.Choice(INPUT_FORMATS),
    default="txt",
    help="Choose format to process between json transcript, yt link or txt file.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please summarize the following transcript:",
    type=str,
    help="Use custom prompt.",
)
@click.option(
    "--log-level",
    "-l",
    default=0,
    help="Set log level to 1 for debug.",
)
def cli(
    file_path: Path | str,
    prompt: str,
    log_level: int,
    get_from: str,
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # The docstring above is rendered by click as the --help text, so the
    # implementation notes live in comments instead.
    #
    # file_path: local file to read, or the video URL when --from is "yt".
    # prompt:    instruction forwarded to summarize_text_file().
    # log_level: any non-zero value switches on dprint() debug output.
    # get_from:  one of INPUT_FORMATS; selects how file_path is interpreted.
    global debug
    if log_level:
        debug = True

    # youtube link, dl transcript
    if get_from == "yt":
        file_path = grab_subtitles(file_path)

    dprint(f"file path = {file_path}")
    # load local file
    content = Path(file_path).read_text()

    if get_from in ("json3", "yt"):
        content = extract_transcript_contents(content)

    dprint(f"content = {content}")
    if not content:
        # Stop here: previously execution fell through and an empty
        # transcript was still sent to the summarizer.
        click.echo("Please provide a file with valid content.", err=True)
        raise SystemExit(1)

    print(summarize_text_file(content, prompt))


cached_dir: Path | None = None


def get_temp_dir() -> Path:
    global cached_dir
    if cached_dir is None:
        cached_dir = Path(tempfile.mkdtemp())
    dprint(f"Created and cached temp dir {cached_dir}")
    return cached_dir


def dprint(content: str | None):
    """Print ``content`` with a ``[DEBUG]`` prefix while debugging is on.

    Does nothing unless the module-level ``debug`` flag is truthy.
    """
    if not debug:
        return
    print(f"[DEBUG] {content}")


# Run the click command only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()
# No results found.