verbanote-server/verbanote/loaders.py

53 lines
1.2 KiB
Python
Raw Normal View History

import locale
from pathlib import Path
2023-08-20 12:29:36 +00:00
import subprocess
from whisper import Whisper
from pyannote.audio import Pipeline
import torch
import static_ffmpeg
import gdown
def prep() -> None:
    """Prepare the process environment before any audio work is done.

    This has two process-wide side effects:

    * ``locale.getpreferredencoding`` is replaced with a stub that always
      reports ``"UTF-8"``, whatever the real locale is.
    * ``static_ffmpeg.add_paths()`` is called so that ffmpeg is downloaded
      and added to the environment.
    """

    def _always_utf8(do_setlocale: bool = True) -> str:
        # Keep the stdlib signature: callers commonly use
        # ``locale.getpreferredencoding(False)``, which a zero-argument
        # replacement would reject with a TypeError.
        return "UTF-8"

    locale.getpreferredencoding = _always_utf8

    # download and add ffmpeg to env
    static_ffmpeg.add_paths()
2023-08-20 12:29:36 +00:00
def audiofile(drive_url: str, path: Path) -> Path | None:
if not drive_url:
return None
2023-08-20 12:29:36 +00:00
gdown.download(drive_url, "infile")
fn = Path.joinpath(path, "interview.wav")
subprocess.run(
[
"ffmpeg",
"-i",
"{repr(video_path)}",
"-vn",
"-acodec",
"pcm_s16le",
"-ar",
"16000",
"-ac",
"1",
"-y",
fn,
]
)
return fn
2023-08-20 12:29:36 +00:00
def diarization(access_token: str | None) -> Pipeline:
    """Load the ``pyannote/speaker-diarization`` pipeline and place it on a device.

    Args:
        access_token: Forwarded to ``Pipeline.from_pretrained`` as
            ``use_auth_token``; may be ``None``.

    Returns:
        The result of moving the pipeline to CUDA when
        ``torch.cuda.is_available()`` is true, otherwise to the CPU.
    """
    diarizer = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=access_token,
    )

    # Prefer the GPU when one is usable, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")

    return diarizer.to(target)
def whisper() -> Whisper:
    """Load the Whisper ``"large"`` model onto the best available device.

    Returns:
        The model returned by ``load_model("large", device=...)``, placed
        on CUDA when ``torch.cuda.is_available()`` is true, otherwise on
        the CPU.
    """
    # This function is itself named ``whisper``, so at call time the
    # module-level name refers to this function rather than the package.
    # Import the package under a distinct local alias to reach
    # ``load_model``.
    import whisper as whisper_pkg

    # Load the model into VRAM when a GPU is available, else into RAM.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return whisper_pkg.load_model("large", device=device)