verbanote-server/verbanote/process.py

import logging
import os
import re
import json
from dataclasses import dataclass
from pathlib import Path
from pyannote.audio import Pipeline
from pydub import AudioSegment
from whisper import Whisper

# Length, in milliseconds, of the silent spacer that _add_audio_silence
# prepends to the audio before it is diarized.
MILLISECONDS_TO_SPACE = 2000


@dataclass
class TxtTranscription:
    """Plain-text transcription result, as returned by ``output_txt``."""

    # The complete transcription text.
    text: str
    # Path of the file the text was written to.
    file: Path


def diarize(audiofile: Path, pipeline: Pipeline, output_path: Path) -> Path:
    """Run speaker diarization on an audio file and save the result as text.

    The pipeline is not fed ``audiofile`` directly but the copy produced by
    ``_add_audio_silence``, i.e. the same audio with leading silence.

    Args:
        audiofile: WAV file to diarize.
        pipeline: Diarization pipeline; called with a mapping that has the
            keys ``"uri"`` and ``"audio"``.
        output_path: Directory in which ``diarization.txt`` is created.

    Returns:
        Path of the written ``diarization.txt``.
    """
    padded_audio = _add_audio_silence(audiofile)

    logging.info(f"Beginning diarization of {audiofile}...")
    diarization = pipeline({"uri": "not-important", "audio": padded_audio})

    result_file = output_path / "diarization.txt"
    with open(result_file, "w") as handle:
        # The string form of the pipeline result is what gets persisted.
        handle.write(str(diarization))
        logging.info(f"Created diarization in {result_file}.")

    return result_file


def transcribe(
    model: Whisper,
    diarized_groups: list,
    files_path: Path,
    lang: str = "en",
    word_timestamps: bool = True,
) -> None:
    """Transcribe each per-group audio clip and store the result as JSON.

    For every index ``i`` of ``diarized_groups`` the clip ``<i>.wav`` inside
    ``files_path`` is handed to ``model.transcribe`` and whatever it returns
    is dumped to ``<i>.json`` in the same directory.

    Args:
        model: Object exposing ``transcribe(audio=..., language=...,
            word_timestamps=...)``.
        diarized_groups: Speaker groups; only their count is used here.
        files_path: Directory holding the ``.wav`` clips and receiving the
            ``.json`` results.
        lang: Language code forwarded to the model.
        word_timestamps: Forwarded to the model unchanged.
    """
    for index, _group in enumerate(diarized_groups):
        wav_path = files_path / f"{index}.wav"
        json_path = files_path / f"{index}.json"

        logging.info(f"Starting transcription of {wav_path}...")
        transcription = model.transcribe(
            audio=str(wav_path), language=lang, word_timestamps=word_timestamps
        )

        with open(json_path, "w") as sink:
            json.dump(transcription, sink, indent=4)
            logging.info(f"Transcription written to {json_path}.")


def output_txt(diarized_groups: list, transcription_path: Path) -> TxtTranscription:
    """Merge the per-group transcription JSON files into one labelled text file.

    For each group index ``i`` the file ``<i>.json`` in ``transcription_path``
    is loaded (expected to be the output written by ``transcribe``) and the
    ``"text"`` of all its ``"segments"`` is concatenated into one paragraph,
    prefixed once with ``[<speaker>] ``. Groups without any segments are
    skipped. The result is written to ``transcription_result.txt`` in
    ``transcription_path``.

    Args:
        diarized_groups: Speaker groups; each group is a non-empty list of
            diarization lines whose last whitespace-separated token is the
            speaker label.
        transcription_path: Directory holding the ``<i>.json`` files; also
            receives the output text file.

    Returns:
        A ``TxtTranscription`` with the full text and the path it was
        written to.

    Raises:
        FileNotFoundError: If a group's JSON file does not exist.
        KeyError: If a JSON file has no ``"segments"`` entry.
    """
    paragraphs = []
    for index, group in enumerate(diarized_groups):
        json_file = transcription_path / f"{index}.json"
        with open(json_file, encoding="utf-8") as f:
            captions = json.load(f)["segments"]
            logging.info(f"Loaded {json_file} for transcription...")

        if not captions:
            continue

        # All lines in a group share one speaker, so label the paragraph once.
        speaker = group[0].split()[-1]
        text = "".join(str(caption["text"]) for caption in captions)
        paragraphs.append(f"[{speaker}] {text}\n\n")

    output = "".join(paragraphs)
    fname = transcription_path / "transcription_result.txt"
    with open(fname, "w", encoding="utf-8") as file:
        file.write(output)
        logging.info(f"Wrote transcription to output file {fname}.")
    return TxtTranscription(text=output, file=fname)


def save_diarized_audio_files(
    diarization: Path, audiofile: Path, output_path: Path
) -> list:
    """Cut an audio file into one clip per diarized speaker group.

    Args:
        diarization: Diarization text file, parsed by ``_group_speakers``.
        audiofile: WAV file the clips are cut from.
        output_path: Directory that receives the exported clips.

    Returns:
        The speaker groups parsed from ``diarization``.
    """
    speaker_groups = _group_speakers(diarization)
    _save_individual_audio_files(audiofile, speaker_groups, output_path)
    return speaker_groups


def _add_audio_silence(audiofile) -> Path:
    """Write a copy of a WAV file with silence prepended and return its path.

    The copy is named ``interview_prepend.wav`` and placed in the directory of
    ``audiofile``; the silence lasts ``MILLISECONDS_TO_SPACE`` milliseconds.

    Args:
        audiofile: Path of the source WAV file.

    Returns:
        Path of the exported, silence-prefixed WAV file.
    """
    silence = AudioSegment.silent(duration=MILLISECONDS_TO_SPACE)
    # crossfade=0: plain concatenation, no blending between spacer and audio.
    padded = silence.append(AudioSegment.from_wav(audiofile), crossfade=0)

    target = Path(os.path.dirname(audiofile)) / "interview_prepend.wav"
    padded.export(target, format="wav")
    logging.info(f"Exported audiofile with silence prepended to {target}.")

    return target


def _save_individual_audio_files(
    audiofile: Path, groups: list[str], output_path: Path
) -> None:
    """Export one WAV clip per speaker group.

    Each clip spans from the first timestamp of the group's first line to the
    second timestamp of the group's last line and is saved as ``<index>.wav``
    in ``output_path``.

    Args:
        audiofile: WAV file to cut the clips from.
        groups: Speaker groups.
            NOTE(review): annotated ``list[str]``, but each element is indexed
            with ``[0]`` / ``[-1]`` and those items are searched for
            timestamps, so elements appear to be lists of lines — confirm.
        output_path: Directory that receives the clips.
    """
    timestamp = re.compile(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+")
    full_audio = AudioSegment.from_wav(audiofile)
    total = len(groups)

    for index, group in enumerate(groups):
        first_start = timestamp.findall(group[0])[0]
        last_end = timestamp.findall(group[-1])[1]
        # NOTE(review): timestamps are used as-is, with no compensation for
        # the silence that diarize() prepends — confirm that `audiofile`
        # shares the diarization's time base.
        clip = full_audio[_millisec(first_start):_millisec(last_end)]

        clip_path = output_path / f"{index}.wav"
        clip.export(clip_path, format="wav")
        logging.info(f"Exported audiopart {index} of {total} to {clip_path}.")


def _group_speakers(diarization_file: Path) -> list:
    """Group consecutive diarization lines that belong to the same speaker.

    Every non-blank line must contain at least two ``H:M:S.fraction``
    timestamps (the second is taken as the segment end) and end with the
    speaker label as its last whitespace-separated token.

    A new group is started when the speaker label changes, and also right
    after a segment whose end lies before the latest end seen so far (i.e.
    a segment engulfed by an earlier one).

    Args:
        diarization_file: Text file with one diarization segment per line.

    Returns:
        A list of groups, each a non-empty list of the original lines.

    Raises:
        IndexError: If a non-blank line has fewer than two timestamps.
    """
    timestamp = re.compile(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+")

    # Context manager so the file handle is closed deterministically.
    with open(diarization_file) as f:
        lines = f.read().splitlines()

    groups: list = []
    current: list = []
    last_end = 0

    for line in lines:
        if not line.strip():
            # Blank lines carry no segment; skip them instead of crashing.
            continue

        # The speaker changed: close the running group before adding the line.
        if current and current[0].split()[-1] != line.split()[-1]:
            groups.append(current)
            current = []

        current.append(line)

        end = _millisec(timestamp.findall(line)[1])
        if last_end > end:  # segment engulfed by a previous segment
            groups.append(current)
            current = []
        else:
            last_end = end

    if current:
        groups.append(current)
    return groups


def _millisec(timeStr):
    spl = timeStr.split(":")
    s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)
    return s
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`import logging`
Create modelling processes 2023-08-20 12:29:36 +00:00			`import os`
			`import re`
			`import json`
Export TxtTranscription from txt output function 2023-08-23 13:09:53 +00:00			`from dataclasses import dataclass`
Create modelling processes 2023-08-20 12:29:36 +00:00			`from pathlib import Path`
			`from pyannote.audio import Pipeline`
			`from pydub import AudioSegment`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`from whisper import Whisper`
Create modelling processes 2023-08-20 12:29:36 +00:00
			`MILLISECONDS_TO_SPACE = 2000`


Export TxtTranscription from txt output function 2023-08-23 13:09:53 +00:00			`@dataclass`
			`class TxtTranscription:`
			`text: str`
			`file: Path`


Create modelling processes 2023-08-20 12:29:36 +00:00			`def diarize(audiofile: Path, pipeline: Pipeline, output_path: Path) -> Path:`
			`audiofile_prepended = _add_audio_silence(audiofile)`

Add logging to diarization 2023-08-23 15:11:47 +00:00			`logging.info(f"Beginning diarization of {audiofile}...")`
Reinsert diarization dependencies 2023-08-22 12:27:52 +00:00			`DIARIZE_FILE = {"uri": "not-important", "audio": audiofile_prepended}`
			`dz = pipeline(DIARIZE_FILE)`
Create modelling processes 2023-08-20 12:29:36 +00:00
			`out_file = Path.joinpath(output_path, "diarization.txt")`
			`with open(out_file, "w") as text_file:`
			`text_file.write(str(dz))`
Add logging to diarization 2023-08-23 15:11:47 +00:00			`logging.info(f"Created diarization in {out_file}.")`
Create modelling processes 2023-08-20 12:29:36 +00:00
			`return out_file`


Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`def transcribe(`
			`model: Whisper,`
			`diarized_groups: list,`
Change argument name for transcription 2023-08-23 13:07:48 +00:00			`files_path: Path,`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`lang: str = "en",`
			`word_timestamps: bool = True,`
			`) -> None:`
			`for i in range(len(diarized_groups)):`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`audio_f = Path.joinpath(files_path, f"{str(i)}.wav")`
			`json_f = Path.joinpath(files_path, f"{str(i)}.json")`
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`logging.info(f"Starting transcription of {str(audio_f)}...")`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`result = model.transcribe(`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`audio=str(audio_f), language=lang, word_timestamps=word_timestamps`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`)`
			`with open(json_f, "w") as outfile:`
			`json.dump(result, outfile, indent=4)`
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`logging.info(f"Transcription written to {str(json_f)}.")`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00

			`# TODO clean up this mess`
Export TxtTranscription from txt output function 2023-08-23 13:09:53 +00:00			`def output_txt(diarized_groups: list, transcription_path: Path) -> TxtTranscription:`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`txt = list("")`
			`gidx = -1`
			`for g in diarized_groups:`
			`shift = re.findall(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=g[0])[0]`
			`shift = (`
			`_millisec(shift) - MILLISECONDS_TO_SPACE`
			`) # the start time in the original video`
			`shift = max(shift, 0)`

			`gidx += 1`

Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`fname = Path.joinpath(transcription_path, f"{str(gidx)}.json")`
			`with open(fname) as f:`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`captions = json.load(f)["segments"]`
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`logging.info(f"Loaded {fname} for transcription...")`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00
			`if captions:`
			`speaker = g[0].split()[-1]`
Fix speaker rendered throughout paragraph Each sentence or 'segment' in whisper would be preceded by a [speaker] notation. This commit fixes that to only include the speaker in front of a larger group (since a new speaker would start a new diarization group this will always work). 2023-08-23 13:13:04 +00:00
			`txt.append(f"[{speaker}] ")`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`for c in captions:`
Fix speaker rendered throughout paragraph Each sentence or 'segment' in whisper would be preceded by a [speaker] notation. This commit fixes that to only include the speaker in front of a larger group (since a new speaker would start a new diarization group this will always work). 2023-08-23 13:13:04 +00:00			`txt.append(f"{c['text']}")`
			`txt.append("\n\n")`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00
			`output = "".join(txt)`
Export TxtTranscription from txt output function 2023-08-23 13:09:53 +00:00			`fname = Path.joinpath(transcription_path, "transcription_result.txt")`
			`with open(fname, "w", encoding="utf-8") as file:`
Reinsert whisper model transcribing 2023-08-23 11:22:55 +00:00			`file.write(output)`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`logging.info(f"Wrote transcription to output file {fname}.")`
Export TxtTranscription from txt output function 2023-08-23 13:09:53 +00:00			`return TxtTranscription(text=output, file=fname)`
Create modelling processes 2023-08-20 12:29:36 +00:00

			`def save_diarized_audio_files(`
			`diarization: Path, audiofile: Path, output_path: Path`
			`) -> list:`
			`groups = _group_speakers(diarization)`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`_save_individual_audio_files(`
			`audiofile=audiofile, groups=groups, output_path=output_path`
			`)`
Create modelling processes 2023-08-20 12:29:36 +00:00			`return groups`


			`def _add_audio_silence(audiofile) -> Path:`
			`spacermilli = MILLISECONDS_TO_SPACE`
			`spacer = AudioSegment.silent(duration=spacermilli)`
			`audio = AudioSegment.from_wav(audiofile)`
			`audio = spacer.append(audio, crossfade=0)`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`fname = Path.joinpath(Path(os.path.dirname(audiofile)), "interview_prepend.wav")`
			`audio.export(fname, format="wav")`
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`logging.info(f"Exported audiofile with silence prepended to {fname}.")`
Create modelling processes 2023-08-20 12:29:36 +00:00
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`return fname`
Create modelling processes 2023-08-20 12:29:36 +00:00

			`def _save_individual_audio_files(`
			`audiofile: Path, groups: list[str], output_path: Path`
			`) -> None:`
			`audio = AudioSegment.from_wav(audiofile)`
			`gidx = -1`
			`for g in groups:`
			`start = re.findall(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=g[0])[0]`
			`end = re.findall(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=g[-1])[1]`
			`start = _millisec(start) # - spacermilli`
			`end = _millisec(end) # - spacermilli`
			`gidx += 1`
Fix Path classes wrongly used in string concats 2023-08-23 13:11:44 +00:00			`fname = Path.joinpath(output_path, f"{str(gidx)}.wav")`
			`audio[start:end].export(fname, format="wav")`
Add info logging to failable functions 2023-08-23 13:10:41 +00:00			`logging.info(f"Exported audiopart {gidx} of {len(groups)} to {fname}.")`
Create modelling processes 2023-08-20 12:29:36 +00:00

			`def _group_speakers(diarization_file: Path) -> list:`
			`dzs = open(diarization_file).read().splitlines()`

			`groups: list = []`
			`g = []`
			`lastend = 0`

			`for d in dzs:`
			`if g and (g[0].split()[-1] != d.split()[-1]): # same speaker`
			`groups.append(g)`
			`g = []`

			`g.append(d)`

			`end = re.findall(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+", string=d)[1]`
			`end = _millisec(end)`
			`if lastend > end: # segment engulfed by a previous segment`
			`groups.append(g)`
			`g = []`
			`else:`
			`lastend = end`
			`if g:`
			`groups.append(g)`
			`return groups`


			`def _millisec(timeStr):`
			`spl = timeStr.split(":")`
			`s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)`
			`return s`