# (file metadata: 52 lines, 1.2 KiB, Python)
import locale
import subprocess
from pathlib import Path

import gdown
import static_ffmpeg
import torch
from pyannote.audio import Pipeline
from whisper import Whisper


def prep() -> None:
    """Prepare the runtime environment for audio processing.

    Forces ``locale.getpreferredencoding`` to report UTF-8 (some hosted
    environments report a non-UTF-8 default, breaking text I/O in
    downstream libraries), then downloads and registers the static
    ffmpeg/ffprobe binaries on PATH.
    """
    def _utf8() -> str:
        return "UTF-8"

    locale.getpreferredencoding = _utf8
    # Download (if needed) and add the bundled ffmpeg binaries to PATH.
    static_ffmpeg.add_paths()


def audiofile(drive_url: str, path: Path) -> Path | None:
|
|
if not drive_url:
|
|
return None
|
|
gdown.download(drive_url, "infile")
|
|
fn = Path.joinpath(path, "interview.wav")
|
|
subprocess.run(
|
|
[
|
|
"ffmpeg",
|
|
"-i",
|
|
"{repr(video_path)}",
|
|
"-vn",
|
|
"-acodec",
|
|
"pcm_s16le",
|
|
"-ar",
|
|
"16000",
|
|
"-ac",
|
|
"1",
|
|
"-y",
|
|
fn,
|
|
]
|
|
)
|
|
return fn


def diarization(access_token: str | None) -> Pipeline:
    """Load the pyannote speaker-diarization pipeline on the best device.

    Args:
        access_token: Hugging Face token used to authenticate the
            pretrained-model download; may be ``None`` for cached models.

    Returns:
        The pipeline, moved onto CUDA when available, otherwise CPU.
    """
    # Prefer the GPU when one is present; fall back to CPU.
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    diarizer = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=access_token
    )
    return diarizer.to(target)


def whisper() -> Whisper:
    """Load the large Whisper model, into VRAM when a GPU is available.

    Returns:
        The loaded ``Whisper`` model instance.
    """
    # BUG FIX: the original called ``whisper.load_model(...)``, but the
    # module-level name ``whisper`` is this very function (the file only
    # imports ``from whisper import Whisper``), so the call raised
    # AttributeError. Import the package under an alias so the function
    # name does not shadow it.
    import whisper as whisper_lib

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return whisper_lib.load_model("large", device=device)
|