verbanote-server/verbanote/loaders.py

"""Loader helpers for the Verbanote server: environment setup, audio download
and conversion, and loading of the diarization and transcription models."""

import locale
from pathlib import Path

import static_ffmpeg
import torch
from pyannote.audio import Pipeline
from whisper import Whisper, load_model

import file_operations


def prep() -> None:
    """Prepare the runtime environment for audio processing."""
    # Force UTF-8 regardless of the system locale; accept the optional
    # do_setlocale argument that callers such as open() may pass in.
    locale.getpreferredencoding = lambda *_: "UTF-8"
    # Download ffmpeg binaries and add them to PATH.
    static_ffmpeg.add_paths()


def audiofile(url: str, input_path: Path) -> Path:
    """Download the audio at `url` into `input_path` and convert it to WAV."""
    file = file_operations.download_from_url(url, input_path)
    file_wav = file_operations.convert_to_wav(file, input_path)
    # Drop the original download once the WAV copy exists.
    file.unlink()
    return file_wav


def diarization(access_token: str | None) -> Pipeline:
    """Load the pyannote speaker-diarization pipeline, on GPU when available."""
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=access_token
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return pipeline.to(device)


def whispermodel() -> Whisper:
    """Load the Whisper "large" model into GPU memory when available."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return load_model("large", device=device)
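

# Illustrative usage (a minimal sketch, not part of the original module; the
# example URL, the /tmp working directory, and the HF_TOKEN environment
# variable are assumptions about how a caller might wire these loaders up):
#
#   import os
#   prep()
#   wav = audiofile("https://example.com/meeting.mp3", Path("/tmp/verbanote"))
#   model = whispermodel()
#   transcript = model.transcribe(str(wav))   # dict with "text" and "segments"
#   pipeline = diarization(os.environ.get("HF_TOKEN"))
#   speakers = pipeline(str(wav))
#   for turn, _, speaker in speakers.itertracks(yield_label=True):
#       print(f"{speaker}: {turn.start:.1f}s-{turn.end:.1f}s")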