Reinsert diarization dependencies

2023-08-22 14:27:52 +02:00 · 2023-08-22 14:27:52 +02:00 · 64123a29e0
commit 64123a29e0
parent 48095a1dc9
8 changed files with 105 additions and 399 deletions
--- a/verbanote/file_operations.py
+++ b/verbanote/file_operations.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+import logging
 import requests
 import subprocess

@@ -6,11 +7,13 @@ import subprocess
 def download_from_url(url: str, input_path: Path) -> Path:
    resp = requests.get(url)
    if not resp.ok:
+        logging.error(f"Created error code: {resp.status_code}")
        raise requests.exceptions.HTTPError()
    # TODO think about implementing a naming scheme based on url path
    fname = Path.joinpath(input_path, "inputfile")
    with open(fname, mode="wb") as file:
        file.write(resp.content)
+    logging.info(f"Downloaded input file: {fname}")
    return fname


@@ -41,4 +44,5 @@ def convert_to_wav(file: Path, output_path: Path) -> Path:
            fn,
        ]
    )
+    logging.info(f"Converted {file} to wav format: {fn}")
    return fn
--- a/verbanote/loaders.py
+++ b/verbanote/loaders.py
@@ -1,8 +1,8 @@
 import locale
 from pathlib import Path
 # from whisper import Whisper
-# from pyannote.audio import Pipeline
-# import torch
+from pyannote.audio import Pipeline
+import torch
 import static_ffmpeg
 import file_operations

@@ -18,14 +18,14 @@ def audiofile(url: str, input_path: Path) -> Path:
    file.unlink()
    return file_wav

-#
-# def diarization(access_token: str | None) -> Pipeline:
-#     pipeline = Pipeline.from_pretrained(
-#         "pyannote/speaker-diarization", use_auth_token=access_token
-#     )
-#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#     return pipeline.to(device)
-#
+
+def diarization(access_token: str | None) -> Pipeline:
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/speaker-diarization", use_auth_token=access_token
+    )
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    return pipeline.to(device)
+
 #
 # def whisper() -> Whisper:
 #     # LOAD MODEL INTO VRAM
--- a/verbanote/notebook.py
+++ b/verbanote/notebook.py
@@ -1,24 +0,0 @@
-## SETTINGS FOR LATER
-from pathlib import Path
-
-# @markdown Enter the URL of the YouTube video, or the path to the video/audio file you want to transcribe, give the output path, etc. and run the cell. HTML file embeds the video for YouTube, and audio for media files.
-
-Source = "Youtube"  # @param ['Youtube', 'File (Google Drive)']
-# @markdown ---
-# @markdown #### **Youtube video**
-video_url = "https://youtu.be/hpZFJctBUHQ"  # @param {type:"string"}
-# store_audio = True #@param {type:"boolean"}
-# @markdown ---
-# @markdown #### **Google Drive video or audio path (mp4, wav, mp3)**
-video_path = "/content/drive/MyDrive/Customer_Service.mp3"  # @param {type:"string"}
-# @markdown ---
-output_path = "/content/transcript/"  # @param {type:"string"}
-output_path = str(Path(output_path))
-# @markdown ---
-# @markdown #### **Title for transcription of media file**
-audio_title = "Sample Order Taking"  # @param {type:"string"}
-# @markdown ---
-# @markdown #### Copy a token from your [Hugging Face tokens page](https://huggingface.co/settings/tokens) and paste it below.
-access_token = "hf_"  # @param {type:"string"}
-# @markdown ---
-# @markdown **Run this cell again if you change the video.**
--- a/verbanote/process.py
+++ b/verbanote/process.py
@@ -4,7 +4,7 @@ import json
 from pathlib import Path
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
-from whisper import Whisper
+# from whisper import Whisper

 MILLISECONDS_TO_SPACE = 2000

@@ -12,8 +12,8 @@ MILLISECONDS_TO_SPACE = 2000
 def diarize(audiofile: Path, pipeline: Pipeline, output_path: Path) -> Path:
    audiofile_prepended = _add_audio_silence(audiofile)

-    DEMO_FILE = {"uri": "blabla", "audio": audiofile_prepended}
-    dz = pipeline(DEMO_FILE)
+    DIARIZE_FILE = {"uri": "not-important", "audio": audiofile_prepended}
+    dz = pipeline(DIARIZE_FILE)

    out_file = Path.joinpath(output_path, "diarization.txt")
    with open(out_file, "w") as text_file:
@@ -25,22 +25,22 @@ def diarize(audiofile: Path, pipeline: Pipeline, output_path: Path) -> Path:
    return out_file


-def transcribe(
-    model: Whisper,
-    diarized_groups: list,
-    output_path: Path,
-    lang: str = "en",
-    word_timestamps: bool = True,
-):
-    for i in range(len(diarized_groups)):
-        f = {Path.joinpath(output_path, str(i))}
-        audio_f = f"{f}.wav"
-        json_f = f"{f}.json"
-        result = model.transcribe(
-            audio=audio_f, language=lang, word_timestamps=word_timestamps
-        )
-        with open(json_f, "w") as outfile:
-            json.dump(result, outfile, indent=4)
+# def transcribe(
+#     model: Whisper,
+#     diarized_groups: list,
+#     output_path: Path,
+#     lang: str = "en",
+#     word_timestamps: bool = True,
+# ):
+#     for i in range(len(diarized_groups)):
+#         f = {Path.joinpath(output_path, str(i))}
+#         audio_f = f"{f}.wav"
+#         json_f = f"{f}.json"
+#         result = model.transcribe(
+#             audio=audio_f, language=lang, word_timestamps=word_timestamps
+#         )
+#         with open(json_f, "w") as outfile:
+#             json.dump(result, outfile, indent=4)


 def save_diarized_audio_files(
--- a/verbanote/rp_handler.py
+++ b/verbanote/rp_handler.py
@@ -1,22 +1,26 @@
+import logging
 from pathlib import Path
 import runpod
 from runpod.serverless import os
 import loaders
-# import process
+import file_operations

+import process

-output_path:Path = Path(os.environ.get("VERBANOTE_OUTPUT_PATH", "/in"))
-input_path:Path = Path(os.environ.get("VERBANOTE_INPUT_PATH", "/out"))
+logging.basicConfig(level=logging.DEBUG)
+
+input_path: Path = Path(os.environ.get("VERBANOTE_INPUT_PATH", "/in"))
+output_path: Path = Path(os.environ.get("VERBANOTE_OUTPUT_PATH", "/out"))

 access_token: str = os.environ.get("VERBANOTE_HF_TOKEN", "")

 loaders.prep()
-# diarize_pipeline = loaders.diarization(access_token)
+diarize_pipeline = loaders.diarization(access_token)
 # whisper_model = loaders.whisper()


 def handler(job):
-    input:dict = job["input"]
+    input: dict = job["input"]
    url: str | None = input.get("url")

    if not url:
@@ -27,10 +31,11 @@ def handler(job):
    except Exception:
        return {"error": "audiofile import failed"}

-    # diarized = process.diarize(audiofile, diarize_pipeline, output_path)
-    # diarized_groups = process.save_diarized_audio_files(
-    #     diarized, audiofile, output_path
-    # )
+    diarized = process.diarize(audiofile, diarize_pipeline, output_path)
+    diarized_groups = process.save_diarized_audio_files(
+        diarized, audiofile, output_path
+    )
+    uploaded_file: str = file_operations.upload_to_oxo(file=diarized, expires=1)
    # process.transcribe(
    #     model=whisper_model, diarized_groups=diarized_groups, output_path=output_path
    # )
@@ -39,9 +44,11 @@ def handler(job):
        "speaker_timings": "s3-address-to-speakers",
        "transcription_text": "s3-address-to-transcription",
        "transcription_page": "web-address-to-deployment",
-        "audiofile_path": str(audiofile)
+        "audiofile_path": str(audiofile),
+        "audio_url": uploaded_file,
    }

+
 # speakers = {
 #     # speaker, textboxcolor, speaker color
 #     "SPEAKER_00": ("SPEAKER00", "white", "darkgreen"),