import locale
import subprocess
from pathlib import Path

import gdown
import static_ffmpeg
import torch
import whisper
from pyannote.audio import Pipeline
from whisper import Whisper


def prep() -> None:
    # work around the Colab locale bug so subprocess output decodes as UTF-8
    # (the replacement must accept getpreferredencoding's optional argument)
    locale.getpreferredencoding = lambda *_: "UTF-8"
    # download a static ffmpeg build and add it to PATH
    static_ffmpeg.add_paths()


def audiofile(drive_url: str, path: Path) -> Path | None:
    # download the video from Google Drive and extract 16 kHz mono WAV audio
    if not drive_url:
        return None
    infile = gdown.download(drive_url, "infile")
    fn = path / "interview.wav"
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            infile,
            "-vn",  # drop the video stream
            "-acodec",
            "pcm_s16le",  # 16-bit PCM
            "-ar",
            "16000",  # 16 kHz sample rate
            "-ac",
            "1",  # mono
            "-y",  # overwrite without asking
            str(fn),
        ],
        check=True,
    )
    return fn


def diarization(access_token: str | None) -> Pipeline:
    # load the pyannote speaker-diarization pipeline onto the GPU if available
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=access_token
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return pipeline.to(device)


def load_whisper() -> Whisper:
    # load the large Whisper model into VRAM if a GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return whisper.load_model("large", device=device)
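

# A minimal usage sketch, not part of the original module: the Drive URL,
# working directory, and Hugging Face token below are placeholder values you
# would substitute for your own. It prepares the environment, extracts the
# audio, runs diarization, and prints a transcript alongside speaker turns.
if __name__ == "__main__":
    prep()
    wav = audiofile("https://drive.google.com/uc?id=FILE_ID", Path("."))
    if wav is not None:
        pipeline = diarization(access_token="hf_...")  # hypothetical token
        model = load_whisper()
        # itertracks yields (segment, track, speaker label) for each turn
        for turn, _, speaker in pipeline(str(wav)).itertracks(yield_label=True):
            print(f"{speaker}: {turn.start:.1f}s-{turn.end:.1f}s")
        print(model.transcribe(str(wav))["text"])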