Compare commits

...

7 commits

Author SHA1 Message Date
423b83abb9
Simplify input format options
Simplified the competing options json and youtube into --from which can
then select format.
2025-01-28 08:59:59 +01:00
260c10b5cd
Add debug mode 2025-01-28 08:47:41 +01:00
bf3c298034
Format 2025-01-18 14:34:26 +01:00
d6bc9e6728
Do not except any raised error 2025-01-18 14:34:21 +01:00
4df67bd475
Default to single line transcript 2025-01-18 14:34:02 +01:00
5be71fc7da
Use hatch build system to add cli entrypoint 2025-01-18 10:39:56 +01:00
d96a9bb616
Rename script to summarize.py 2025-01-18 10:27:33 +01:00
4 changed files with 152 additions and 122 deletions

View file

@ -9,3 +9,15 @@ dependencies = [
"python-tgpt>=0.8.1", "python-tgpt>=0.8.1",
"yt-dlp>=2025.1.15", "yt-dlp>=2025.1.15",
] ]
[project.scripts]
summarize = "summarize:cli"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = [
"summarize.py",
]

139
summarize.py Executable file
View file

@ -0,0 +1,139 @@
import subprocess
import tempfile
from pathlib import Path
from typing import cast
import click
import pytgpt.phind as phind
# Module-wide debug switch: cli() sets it to True when --log-level is non-zero,
# and dprint() only prints while it is True.
debug = False
def summarize_text_file(content: str, prompt: str | None) -> str:
    """Send *content* to a ``phind.PHIND`` chat bot and return its reply.

    Args:
        content: Text to be summarized; appended to the prompt after a space.
        prompt: Instruction placed in front of the text. A falsy value
            (``None`` or an empty string) selects the built-in default prompt.

    Returns:
        Whatever ``PHIND().chat`` returns for the combined message.
    """
    if not prompt:
        prompt = "Please summarize the following transcript:"
    return phind.PHIND().chat(f"{prompt} {content}")
def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
    """Extract the plain text from a ``json3`` transcript document.

    Every ``utf8`` string found under ``events[].segs[]`` is concatenated in
    document order. Events without ``segs`` and segments without a string
    ``utf8`` field are skipped.

    Args:
        content: Raw JSON text of the transcript.
        keep_newlines: When True, line breaks contained in the transcript are
            preserved; otherwise (the default) the whole transcript is folded
            onto a single line.

    Returns:
        The transcript text with runs of whitespace collapsed to one space.
        An empty string is returned when *content* is not valid JSON or holds
        no text segments, so callers can keep treating "no content" as ``""``.
    """
    # Parsed in-process rather than piped through an external ``jq`` binary:
    # that approach silently yielded "" when jq was missing or failed, and its
    # JSON-encoded output was cleaned by deleting every double quote, which
    # corrupted quoted or backslash-escaped text in the transcript.
    import json

    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return ""

    events = data.get("events") if isinstance(data, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        segs = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segs, list):
            continue
        for seg in segs:
            text = seg.get("utf8") if isinstance(seg, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    transcript = "".join(pieces)
    if not keep_newlines:
        # str.split() with no argument also splits on newlines, so this folds
        # the transcript onto one line and collapses whitespace in one pass.
        return " ".join(transcript.split())
    # Collapse whitespace within each line but keep the line structure.
    return "\n".join(" ".join(line.split()) for line in transcript.split("\n"))
def grab_subtitles(url: str | Path) -> Path:
    """Fetch the automatic ``json3`` subtitles for *url* via yt-dlp.

    The subtitles are written below the directory returned by
    ``get_temp_dir()``; the media itself is not downloaded
    (``skip_download`` is set).

    Args:
        url: Address handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found while walking the temp dir.

    Raises:
        ValueError: If no ``.json3`` file exists after the download step.
    """
    import yt_dlp

    temp_dir = get_temp_dir()
    options = {
        "outtmpl": f"{temp_dir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        filename = downloader.prepare_filename(video_info)
        dprint(f"Subtitle file saved as: {filename}")

    # Return the first json3 file encountered, searching directory by directory.
    for folder, _subdirs, names in Path(temp_dir).walk():
        match = next((name for name in names if name.endswith(".json3")), None)
        if match is not None:
            return Path(folder).joinpath(match)

    raise ValueError("No correct json3 transcript object found.")
# Accepted values for --from: a json3 transcript file, a YouTube link, or plain text.
INPUT_FORMATS = ["json3", "yt", "txt"]


@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--from",
    "-f",
    "get_from",
    type=click.Choice(INPUT_FORMATS),
    default="txt",
    help="Choose format to process between json transcript, yt link or txt file.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please summarize the following transcript:",
    type=str,
    help="Use custom prompt.",
)
@click.option(
    "--log-level",
    "-l",
    default=0,
    help="Set log level to 1 for debug.",
)
def cli(
    file_path: Path | str,
    prompt: str,
    log_level: int,
    get_from: str,
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # NOTE: the docstring above doubles as the click --help text, so the
    # implementation notes live in comments instead.
    #
    # file_path: local file to read, or a video URL when get_from == "yt".
    # prompt:    instruction placed in front of the text sent to the summarizer.
    # log_level: any non-zero value switches the module-wide debug flag on.
    # get_from:  one of INPUT_FORMATS; selects how file_path is interpreted.
    global debug
    if log_level:
        debug = True

    # youtube link, dl transcript
    if get_from == "yt":
        file_path = grab_subtitles(file_path)
        file_path = cast(Path, file_path)
        dprint(f"file path = {file_path}")

    # load local file
    with Path(file_path).open() as f:
        content = f.read()

    # Both the downloaded and the user-supplied transcript are json3 documents.
    if get_from in ("json3", "yt"):
        content = extract_transcript_contents(content)
    dprint(f"content = {content}")

    # Stop here rather than sending an empty request to the summarizer;
    # ClickException prints the message to stderr and exits with status 1.
    if not content.strip():
        raise click.ClickException("Please provide a file with valid content.")

    print(summarize_text_file(content, prompt))
# Temp directory created on first use and then reused for later calls.
cached_dir: Path | None = None


def get_temp_dir() -> Path:
    """Return the shared temporary directory, creating it on first use.

    The directory is made with ``tempfile.mkdtemp()`` the first time this is
    called and remembered in the module-level ``cached_dir``; later calls
    return that same path.
    """
    global cached_dir
    if cached_dir is not None:
        return cached_dir

    cached_dir = Path(tempfile.mkdtemp())
    dprint(f"Created and cached temp dir {cached_dir}")
    return cached_dir
def dprint(content: str | None):
    """Print *content* prefixed with ``[DEBUG]`` when the debug flag is set.

    Does nothing while the module-level ``debug`` flag is falsy.
    """
    if not debug:
        return
    print(f"[DEBUG] {content}")
# Run the click command when this file is executed directly as a script.
if __name__ == "__main__":
    cli()

View file

@ -1,121 +0,0 @@
import subprocess
import tempfile
from pathlib import Path
from typing import cast
import click
import pytgpt.phind as phind
def summarize_text_file(content: str, prompt: str | None) -> str:
    """Ask a ``phind.PHIND`` chat bot to process *content* and return its reply.

    Args:
        content: Text to be summarized; follows the prompt after one space.
        prompt: Instruction for the bot. A falsy value falls back to the
            default summarization prompt.
    """
    instruction = prompt if prompt else "Please summarize the following transcript:"
    message = f"{instruction} {content}"
    return phind.PHIND().chat(message)
def extract_transcript_contents(content: str) -> str:
    """Extract the plain text from a ``json3`` transcript document.

    Every ``utf8`` string found under ``events[].segs[]`` is concatenated in
    document order. Events without ``segs`` and segments without a string
    ``utf8`` field are skipped. Line breaks inside the transcript are kept.

    Args:
        content: Raw JSON text of the transcript.

    Returns:
        The transcript text with runs of whitespace inside each line collapsed
        to one space. An empty string is returned when *content* is not valid
        JSON or holds no text segments, so callers can keep treating
        "no content" as ``""``.
    """
    # Parsed in-process rather than piped through an external ``jq`` binary:
    # that approach silently yielded "" when jq was missing or failed, and its
    # JSON-encoded output was cleaned by deleting every double quote, which
    # corrupted quoted or backslash-escaped text in the transcript.
    import json

    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return ""

    events = data.get("events") if isinstance(data, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        segs = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segs, list):
            continue
        for seg in segs:
            text = seg.get("utf8") if isinstance(seg, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    # Collapse whitespace within each line but keep the line structure.
    lines = "".join(pieces).split("\n")
    return "\n".join(" ".join(line.split()) for line in lines)
def grab_subtitles(url: str | Path) -> Path:
    """Fetch the automatic ``json3`` subtitles for *url* via yt-dlp.

    The subtitles are written below a directory obtained from
    ``get_temp_dir()``; the media itself is not downloaded
    (``skip_download`` is set).

    Args:
        url: Address handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found while walking the temp dir.

    Raises:
        ValueError: If no ``.json3`` file exists after the download step.
    """
    import yt_dlp

    temp_dir = get_temp_dir()
    ydl_opts = {
        "outtmpl": f"{temp_dir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
        print(f"Subtitle file saved as: {filename}")

    for root, _, files in Path(temp_dir).walk():
        for file in files:
            if file.endswith(".json3"):
                return Path(root).joinpath(file)

    # Let the error propagate. Catching it here and printing it made the
    # function fall through and return None despite the ``-> Path`` contract,
    # which only moved the failure to the caller's ``Path(None)``.
    raise ValueError("No correct json3 transcript object found.")
def get_temp_dir() -> str:
    """Create a new temporary directory, announce it, and return its path.

    Each call creates a separate directory via ``tempfile.mkdtemp()``; nothing
    in this function removes it afterwards.
    """
    created = tempfile.mkdtemp()
    print(f"Creating temp dir {created}")
    return created
def rm_dir(dir: Path | str) -> None:
# Remove the temporary directory
import shutil
if Path(dir).is_dir():
shutil.rmtree(dir)
@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--json-transcript/--no-json-transcript",
    "-j",
    default=False,
    help="Use downloaded json3 transcript.",
)
@click.option(
    "--youtube/--no-youtube",
    "-t",
    default=False,
    help="Get (english) transcript from youtube link.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please provide a detailed but concise summary for the following transcript:",
    type=str,
    help="Use custom prompt.",
)
def process_file(
    file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # NOTE: the docstring above doubles as the click --help text, so the
    # implementation notes live in comments instead.
    #
    # file_path:       local file to read, or a video URL when youtube is set.
    # json_transcript: treat the local file as a json3 transcript.
    # youtube:         download the transcript for file_path first.
    # prompt:          instruction placed in front of the text to summarize.

    # youtube link, dl transcript
    if youtube:
        file_path = grab_subtitles(file_path)
        file_path = cast(Path, file_path)
        print(f"DEBUG: file path = {file_path}")

    # load local file
    with Path(file_path).open() as f:
        content = f.read()

    # Both the downloaded and the user-supplied transcript are json3 documents.
    if json_transcript or youtube:
        content = extract_transcript_contents(content)

    # Stop here rather than sending an empty request to the summarizer;
    # ClickException prints the message to stderr and exits with status 1.
    if not content.strip():
        raise click.ClickException("Please provide a file with valid content.")

    print(summarize_text_file(content, prompt))
# Run the click command when this file is executed directly as a script.
if __name__ == "__main__":
    process_file()

2
uv.lock generated
View file

@ -485,7 +485,7 @@ wheels = [
[[package]] [[package]]
name = "pytgpt-summarize" name = "pytgpt-summarize"
version = "0.1.0" version = "0.1.0"
source = { virtual = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "click" }, { name = "click" },
{ name = "python-tgpt" }, { name = "python-tgpt" },