Simplified the competing `json` and `youtube` options into a single `--from` option, which selects the input format.
139 lines
3.4 KiB
Python
Executable file
139 lines
3.4 KiB
Python
Executable file
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import cast
|
|
|
|
import click
|
|
import pytgpt.phind as phind
|
|
|
|
# Module-wide debug switch: cli() sets it to True when --log-level is non-zero,
# and dprint() only emits output while it is True.
debug: bool = False
|
|
|
|
|
|
def summarize_text_file(content: str, prompt: str | None) -> str:
    """Ask the Phind chat bot to summarize *content*.

    Args:
        content: The text to be summarized.
        prompt: Instruction placed in front of the text. When it is ``None``
            or empty, a default "summarize this transcript" instruction is
            used instead.

    Returns:
        Whatever ``PHIND().chat`` returns for the combined message.
    """
    if not prompt:
        prompt = "Please summarize the following transcript:"

    # The instruction and the text are sent as one message, separated by a space.
    message = f"{prompt} {content}"
    return phind.PHIND().chat(message)
|
|
|
|
|
|
def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
    """Extract the plain caption text from a ``json3`` transcript document.

    The document is expected to look like
    ``{"events": [{"segs": [{"utf8": "..."}, ...]}, ...]}``. Every ``utf8``
    string is collected in order and concatenated without a separator.

    The transcript is parsed with the standard ``json`` module rather than by
    piping it through an external ``jq`` process. This removes the dependency
    on a ``jq`` binary, keeps quotes and other escaped characters in the text
    intact, and skips segments that carry no ``utf8`` text.

    Args:
        content: Raw JSON text of the transcript.
        keep_newlines: When ``True`` line breaks found in the transcript are
            preserved; otherwise they are turned into spaces.

    Returns:
        The transcript text with runs of whitespace collapsed to single
        spaces. An empty string is returned when *content* is not valid JSON
        or contains no caption segments, so callers can treat "no content"
        uniformly.
    """
    import json  # local import keeps this function self-contained

    try:
        document = json.loads(content)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; invalid input yields no text.
        return ""

    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        # Events without a "segs" list carry no caption text.
        segments = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segments, list):
            continue
        for segment in segments:
            text = segment.get("utf8") if isinstance(segment, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    # Segments are joined without a separator, so any spacing between words
    # comes from the segment strings themselves.
    transcript = "".join(pieces)

    if keep_newlines:
        # Normalize spacing inside each line but keep the line structure.
        lines = (" ".join(line.split()) for line in transcript.split("\n"))
        return "\n".join(lines).strip()

    # str.split() with no argument treats newlines as whitespace too.
    return " ".join(transcript.split())
|
|
|
|
|
|
def grab_subtitles(url: str | Path) -> Path:
    """Download the automatic subtitles for *url* and return the json3 file.

    yt_dlp is asked to write automatic subtitles in ``json3`` format into the
    directory returned by ``get_temp_dir()`` while skipping the media
    download itself. That directory is then searched for the first file whose
    name ends in ``.json3``.

    Args:
        url: Location handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found under the temp directory.

    Raises:
        ValueError: If no ``.json3`` file exists after the download.
    """
    import yt_dlp

    work_dir = get_temp_dir()
    options = dict(
        outtmpl=f"{work_dir}/subs",
        writeautomaticsub=True,
        subtitlesformat="json3",
        skip_download=True,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        saved_name = downloader.prepare_filename(video_info)
        dprint(f"Subtitle file saved as: {saved_name}")

    # Scan the temp directory tree and stop at the first json3 file.
    for directory, _, names in Path(work_dir).walk():
        match = next((name for name in names if name.endswith(".json3")), None)
        if match is not None:
            return Path(directory).joinpath(match)

    raise ValueError("No correct json3 transcript object found.")
|
|
|
|
|
|
# Values accepted by the --from option of cli(): a json3 transcript file,
# a YouTube link ("yt"), or a plain text file ("txt").
INPUT_FORMATS: list[str] = ["json3", "yt", "txt"]
|
|
|
|
|
|
@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--from",
    "-f",
    "get_from",
    type=click.Choice(INPUT_FORMATS),
    default="txt",
    help="Choose format to process between json transcript, yt link or txt file.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please summarize the following transcript:",
    type=str,
    help="Use custom prompt.",
)
@click.option(
    "--log-level",
    "-l",
    default=0,
    help="Set log level to 1 for debug.",
)
def cli(
    file_path: Path | str,
    prompt: str,
    log_level: int,
    get_from: str,
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # NOTE: the docstring above is the --help text, so details live in comments.
    #
    # file_path: a local file path, or a video URL when get_from is "yt".
    # prompt:    instruction placed in front of the text sent to the summarizer.
    # log_level: any non-zero value switches on debug output.
    # get_from:  one of INPUT_FORMATS; selects how file_path is interpreted.
    global debug
    if log_level:
        debug = True

    # youtube link, dl transcript
    if get_from == "yt":
        file_path = grab_subtitles(file_path)

    dprint(f"file path = {file_path}")
    # load local file (Path() accepts both str and Path, so no cast is needed)
    with Path(file_path).open() as f:
        content = f.read()

    # Both json3 files and downloaded subtitles need the caption text pulled
    # out of the JSON structure before summarizing.
    if get_from in ("json3", "yt"):
        content = extract_transcript_contents(content)

    dprint(f"content = {content}")
    if not content:
        print("Please provide a file with valid content.")
        # Stop here: there is nothing to summarize, so do not query the bot
        # with an empty body.
        return

    print(summarize_text_file(content, prompt))
|
|
|
|
|
|
# Temp directory created on first use by get_temp_dir(); None until then.
cached_dir: Path | None = None
|
|
|
|
|
|
def get_temp_dir() -> Path:
    """Return the temp directory for this run, creating it on first use.

    The directory is made with ``tempfile.mkdtemp`` the first time this is
    called and stored in the module-level ``cached_dir``; later calls return
    that same path.
    """
    global cached_dir

    # Fast path: a directory has already been created.
    if cached_dir is not None:
        return cached_dir

    cached_dir = Path(tempfile.mkdtemp())
    dprint(f"Created and cached temp dir {cached_dir}")
    return cached_dir
|
|
|
|
|
|
def dprint(content: str | None):
    """Print *content* with a ``[DEBUG]`` prefix when debugging is enabled.

    Nothing is printed while the module-level ``debug`` flag is falsy.
    """
    if not debug:
        return
    print(f"[DEBUG] {content}")
|
|
|
|
|
|
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    cli()
|