Compare commits
7 commits
ebd7758453
...
423b83abb9
| Author | SHA1 | Date | |
|---|---|---|---|
| 423b83abb9 | |||
| 260c10b5cd | |||
| bf3c298034 | |||
| d6bc9e6728 | |||
| 4df67bd475 | |||
| 5be71fc7da | |||
| d96a9bb616 |
4 changed files with 152 additions and 122 deletions
|
|
@ -9,3 +9,15 @@ dependencies = [
|
|||
"python-tgpt>=0.8.1",
|
||||
"yt-dlp>=2025.1.15",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
summarize = "summarize:cli"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = [
|
||||
"summarize.py",
|
||||
]
|
||||
|
|
|
|||
139
summarize.py
Executable file
139
summarize.py
Executable file
|
|
@ -0,0 +1,139 @@
|
|||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
import click
|
||||
import pytgpt.phind as phind
|
||||
|
||||
# Module-wide debug switch: cli() flips it to True when a non-zero log level
# is passed, and dprint() only prints while it is set.
debug = False
|
||||
|
||||
|
||||
def summarize_text_file(content: str, prompt: str | None) -> str:
    """Ask the Phind chat bot to summarize ``content``.

    Args:
        content: Text to be summarized.
        prompt: Instruction placed in front of the text; a falsy value selects
            the built-in default instruction.

    Returns:
        Whatever ``chat`` returns for the combined instruction and text.
    """
    instruction = prompt if prompt else "Please summarize the following transcript:"
    return phind.PHIND().chat(f"{instruction} {content}")
|
||||
|
||||
|
||||
def extract_transcript_contents(content: str, keep_newlines: bool = False) -> str:
    """Extract the spoken text from a json3 subtitle document.

    The document is read as ``{"events": [{"segs": [{"utf8": "..."}]}]}``.
    Events without a ``segs`` list and segments without a string ``utf8``
    value are skipped. Parsing uses the standard-library ``json`` module
    instead of piping through an external ``jq`` process, so no extra binary
    is required and JSON escapes (quotes, backslashes) are decoded rather
    than stripped by hand.

    Args:
        content: Raw JSON text of the transcript.
        keep_newlines: When true, line breaks inside the transcript are kept;
            otherwise they are folded into single spaces.

    Returns:
        The transcript with runs of whitespace collapsed, or an empty string
        when ``content`` is not valid JSON or holds no transcript segments.
    """
    import json

    try:
        document = json.loads(content)
    except json.JSONDecodeError:
        # The jq pipeline produced no output for unparsable input; keep
        # returning "" so the caller's empty-content check still triggers.
        return ""

    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        segments = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segments, list):
            continue
        for segment in segments:
            text = segment.get("utf8") if isinstance(segment, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    # The jq pipeline glued segments together without a separator; do the same.
    transcript = "".join(pieces)
    if keep_newlines:
        # Collapse whitespace within each line but leave the line breaks alone.
        lines = (" ".join(line.split()) for line in transcript.split("\n"))
        return "\n".join(lines).strip()
    return " ".join(transcript.split())
|
||||
|
||||
|
||||
def grab_subtitles(url: str | Path) -> Path:
    """Fetch subtitles for ``url`` with yt-dlp and return the json3 file path.

    yt-dlp is configured to write automatic subtitles in ``json3`` format
    into the directory returned by ``get_temp_dir()`` with the media download
    itself skipped. That directory is then searched for the first file whose
    name ends in ``.json3``.

    Args:
        url: Location handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found under the temp directory.

    Raises:
        ValueError: If no ``.json3`` file exists after the yt-dlp run.
    """
    import yt_dlp

    download_dir = get_temp_dir()
    options = {
        "outtmpl": f"{download_dir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        dprint(f"Subtitle file saved as: {downloader.prepare_filename(video_info)}")

    for parent, _, names in Path(download_dir).walk():
        match = next((name for name in names if name.endswith(".json3")), None)
        if match is not None:
            return Path(parent).joinpath(match)

    raise ValueError("No correct json3 transcript object found.")
|
||||
|
||||
|
||||
# Values accepted by --from: a json3 transcript file, a YouTube link, or a
# plain text file.
INPUT_FORMATS = ["json3", "yt", "txt"]


@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
# TODO: Implement some way to allow content passed in on stdin instead of file (- as arg?)
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--from",
    "-f",
    "get_from",
    type=click.Choice(INPUT_FORMATS),
    default="txt",
    help="Choose format to process between json transcript, yt link or txt file.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please summarize the following transcript:",
    type=str,
    help="Use custom prompt.",
)
@click.option(
    "--log-level",
    "-l",
    default=0,
    help="Set log level to 1 for debug.",
)
def cli(
    file_path: Path | str,
    prompt: str,
    log_level: int,
    get_from: str,
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # NOTE: the docstring above doubles as the --help text, so the remaining
    # documentation lives in comments.
    #
    # file_path: local file to read, or the video URL when get_from == "yt".
    # prompt:    instruction placed in front of the text sent to the bot.
    # log_level: any non-zero value switches on dprint() output.
    # get_from:  one of INPUT_FORMATS; selects how file_path is interpreted.
    global debug
    if log_level:
        debug = True

    # For a YouTube link the argument is a URL; swap it for the path of the
    # downloaded json3 subtitle file.
    if get_from == "yt":
        file_path = grab_subtitles(file_path)

    file_path = cast(Path, file_path)
    dprint(f"file path = {file_path}")

    with Path(file_path).open() as f:
        content = f.read()

    # json3 input (local or freshly downloaded) must be reduced to plain text.
    if get_from in ("json3", "yt"):
        content = extract_transcript_contents(content)

    dprint(f"content = {content}")
    if not content:
        # Abort instead of falling through: previously the message was printed
        # and the empty text was still sent to the summarizer.
        raise click.ClickException("Please provide a file with valid content.")

    print(summarize_text_file(content, prompt))
|
||||
|
||||
|
||||
# Temp directory created on first use and reused by later get_temp_dir() calls.
cached_dir: Path | None = None


def get_temp_dir() -> Path:
    """Return the module's temp directory, creating it on the first call.

    The directory is made with ``tempfile.mkdtemp()`` and stored in the
    module-level ``cached_dir``, so every later call returns the same path.
    """
    global cached_dir
    if cached_dir is not None:
        return cached_dir

    cached_dir = Path(tempfile.mkdtemp())
    dprint(f"Created and cached temp dir {cached_dir}")
    return cached_dir
|
||||
|
||||
|
||||
def dprint(content: str | None):
    """Print ``content`` prefixed with ``[DEBUG]`` while ``debug`` is truthy."""
    if not debug:
        return
    print(f"[DEBUG] {content}")
|
||||
|
||||
|
||||
# Script entry point: run the click command when executed directly.
if __name__ == "__main__":
    cli()
|
||||
121
tgpt_summary.py
121
tgpt_summary.py
|
|
@ -1,121 +0,0 @@
|
|||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
import click
|
||||
import pytgpt.phind as phind
|
||||
|
||||
|
||||
def summarize_text_file(content: str, prompt: str | None) -> str:
    """Send ``content`` to the Phind chat bot and return its reply.

    Args:
        content: Text to be summarized.
        prompt: Instruction put in front of the text; when falsy, a default
            summarization instruction is used.

    Returns:
        The value returned by the bot's ``chat`` call.
    """
    if not prompt:
        prompt = "Please summarize the following transcript:"
    request = f"{prompt} {content}"
    bot = phind.PHIND()
    return bot.chat(request)
|
||||
|
||||
|
||||
def extract_transcript_contents(content: str) -> str:
    """Extract the spoken text from a json3 subtitle document.

    The document is read as ``{"events": [{"segs": [{"utf8": "..."}]}]}``.
    Events without a ``segs`` list and segments without a string ``utf8``
    value are skipped. Parsing uses the standard-library ``json`` module
    instead of an external ``jq`` process, so no extra binary is needed and
    JSON escapes (quotes, backslashes) are decoded rather than stripped by
    hand.

    Args:
        content: Raw JSON text of the transcript.

    Returns:
        The transcript with whitespace collapsed inside each line and line
        breaks preserved, or an empty string when ``content`` is not valid
        JSON or holds no transcript segments.
    """
    import json

    try:
        document = json.loads(content)
    except json.JSONDecodeError:
        # The jq pipeline produced no output for unparsable input; keep
        # returning "" so the caller's empty-content check still triggers.
        return ""

    events = document.get("events") if isinstance(document, dict) else None
    if not isinstance(events, list):
        return ""

    pieces: list[str] = []
    for event in events:
        segments = event.get("segs") if isinstance(event, dict) else None
        if not isinstance(segments, list):
            continue
        for segment in segments:
            text = segment.get("utf8") if isinstance(segment, dict) else None
            if isinstance(text, str):
                pieces.append(text)

    # The jq pipeline glued segments together without a separator; do the same.
    transcript = "".join(pieces)
    lines = (" ".join(line.split()) for line in transcript.split("\n"))
    return "\n".join(lines).strip()
|
||||
|
||||
|
||||
def grab_subtitles(url: str | Path) -> Path:
    """Fetch subtitles for ``url`` with yt-dlp and return the json3 file path.

    yt-dlp is configured to write automatic subtitles in ``json3`` format
    into a fresh directory from ``get_temp_dir()`` with the media download
    itself skipped. That directory is then searched for the first file whose
    name ends in ``.json3``.

    Args:
        url: Location handed to ``YoutubeDL.extract_info``.

    Returns:
        Path of the first ``.json3`` file found under the temp directory.

    Raises:
        ValueError: If no ``.json3`` file exists after the yt-dlp run. The
            error used to be caught and printed here, which made the function
            return ``None`` and broke the caller when it tried to open the
            result; it now propagates.
    """
    import yt_dlp

    temp_dir = get_temp_dir()
    ydl_opts = {
        "outtmpl": f"{temp_dir}/subs",
        "writeautomaticsub": True,
        "subtitlesformat": "json3",
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
        print(f"Subtitle file saved as: {filename}")

    for root, _, files in Path(temp_dir).walk():
        for file in files:
            if file.endswith(".json3"):
                return Path(root).joinpath(file)

    raise ValueError("No correct json3 transcript object found.")
|
||||
|
||||
|
||||
def get_temp_dir() -> str:
    """Create a new temporary directory, announce it on stdout, and return its path."""
    created = tempfile.mkdtemp()
    print(f"Creating temp dir {created}")
    return created
|
||||
|
||||
|
||||
def rm_dir(dir: Path | str) -> None:
|
||||
# Remove the temporary directory
|
||||
import shutil
|
||||
|
||||
if Path(dir).is_dir():
|
||||
shutil.rmtree(dir)
|
||||
|
||||
|
||||
@click.command()
# TODO: Can I set it so it checks existence *only* when no youtube flag exists?
@click.argument("file_path", type=click.Path(exists=False))
@click.option(
    "--json-transcript/--no-json-transcript",
    "-j",
    default=False,
    help="Use downloaded json3 transcript.",
)
@click.option(
    "--youtube/--no-youtube",
    "-t",
    default=False,
    help="Get (english) transcript from youtube link.",
)
@click.option(
    "--prompt",
    "-p",
    default="Please provide a detailed but concise summary for the following transcript:",
    type=str,
    help="Use custom prompt.",
)
def process_file(
    file_path: Path | str, json_transcript: bool, youtube: bool, prompt: str
):
    """Provide summary for a file at the specified path or a youtube video at the specified url."""
    # NOTE: the docstring above doubles as the --help text, so the remaining
    # documentation lives in comments.
    #
    # file_path:       local file to read, or the video URL when youtube is set.
    # json_transcript: treat the local file as a json3 transcript.
    # youtube:         download the transcript for the URL first.
    # prompt:          instruction placed in front of the text sent to the bot.

    # For a YouTube link the argument is a URL; swap it for the path of the
    # downloaded json3 subtitle file.
    if youtube:
        file_path = grab_subtitles(file_path)

    file_path = cast(Path, file_path)
    print(f"DEBUG: file path = {file_path}")

    with Path(file_path).open() as f:
        content = f.read()

    # json3 input (local or freshly downloaded) must be reduced to plain text.
    if json_transcript or youtube:
        content = extract_transcript_contents(content)

    if not content:
        # Abort instead of falling through: previously the message was printed
        # and the empty text was still sent to the summarizer.
        raise click.ClickException("Please provide a file with valid content.")

    print(summarize_text_file(content, prompt))
|
||||
|
||||
# Script entry point: run the click command when executed directly.
if __name__ == "__main__":
    process_file()
|
||||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -485,7 +485,7 @@ wheels = [
|
|||
[[package]]
|
||||
name = "pytgpt-summarize"
|
||||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "python-tgpt" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue