# (file metadata: 52 lines, 1.2 KiB, Python)
import locale
import subprocess
from pathlib import Path

import gdown
import static_ffmpeg
import torch
from pyannote.audio import Pipeline
from whisper import Whisper


def prep() -> None:
    """Prepare the runtime environment for audio processing.

    Forces ``locale.getpreferredencoding`` to report UTF-8 (some hosted
    environments report a non-UTF-8 default, breaking text I/O in
    downstream libraries), then downloads and registers the static
    ffmpeg/ffprobe binaries on PATH.
    """
    def _utf8() -> str:
        return "UTF-8"

    locale.getpreferredencoding = _utf8
    # Download (if needed) and add the bundled ffmpeg binaries to PATH.
    static_ffmpeg.add_paths()


def audiofile(drive_url: str, path: Path) -> Path | None:
|
|
if not drive_url:
|
|
return None
|
|
gdown.download(drive_url, "infile")
|
|
fn = Path.joinpath(path, "interview.wav")
|
|
subprocess.run(
|
|
[
|
|
"ffmpeg",
|
|
"-i",
|
|
"{repr(video_path)}",
|
|
"-vn",
|
|
"-acodec",
|
|
"pcm_s16le",
|
|
"-ar",
|
|
"16000",
|
|
"-ac",
|
|
"1",
|
|
"-y",
|
|
fn,
|
|
]
|
|
)
|
|
return fn


def diarization(access_token: str | None) -> Pipeline:
    """Load the pyannote speaker-diarization pipeline on the best device.

    Args:
        access_token: Hugging Face token used to authenticate the
            pretrained-model download; may be ``None`` for cached models.

    Returns:
        The pipeline, moved onto CUDA when available, otherwise CPU.
    """
    # Prefer the GPU when one is present; fall back to CPU.
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    diarizer = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=access_token
    )
    return diarizer.to(target)


def whisper() -> Whisper:
    """Load the large Whisper model, into VRAM when a GPU is available.

    Returns:
        The loaded ``Whisper`` model instance.
    """
    # BUG FIX: the original called ``whisper.load_model(...)``, but the
    # module-level name ``whisper`` is this very function (the file only
    # imports ``from whisper import Whisper``), so the call raised
    # AttributeError. Import the package under an alias so the function
    # name does not shadow it.
    import whisper as whisper_lib

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return whisper_lib.load_model("large", device=device)
|