« ^ »

今日やった事 - 20240902

所要時間: 約 1分

今日、何もできていない。どうしようもない。家にいてもできるような仕事を模索し続ければいいのに。まあ、どうでもいいか。今日はもう時間はあまりないけれど、今日をせいいっぱい生きようと思う。

発話を検出する

import multiprocessing as mp
import multiprocessing.connection as mp_conn
import numpy as np
import sounddevice as sd
import torch


# Sample rate used for audio capture (passed to sd.InputStream) and handed to
# the VAD model together with every chunk.
SAMPLING_RATE = 16000
# Number of frames per captured audio block; also the chunk length that is fed
# to the VAD model in main().
WINDOW_SIZE_SAMPLES = 512


def read_sound(ready_ev: mp.Event, shutdown_ev: mp.Event, out_conn: mp_conn.Connection):
    """Capture microphone audio and forward raw int16 blocks through ``out_conn``.

    Meant to be used as the target of a child process. Opens a mono int16
    ``sd.InputStream`` at ``SAMPLING_RATE`` with ``WINDOW_SIZE_SAMPLES`` frames
    per block and sends the bytes of every captured block over the pipe until
    ``shutdown_ev`` is set.

    Args:
        ready_ev: Set once the input stream has actually been opened, so the
            peer is not told "ready" when the audio device failed to open.
        shutdown_ev: Checked every 500 ms; recording stops once it is set.
            It is also set by this function on every exit path (normal end or
            exception) so the peer can notice that recording has stopped.
        out_conn: Connection that receives each audio block via ``send_bytes``.
    """

    def _fn(indata: np.ndarray, frames, time, status):
        # Invoked by sounddevice for each captured block; ship the samples as bytes.
        out_conn.send_bytes(indata.tobytes())

    try:
        with sd.InputStream(
            samplerate=SAMPLING_RATE,
            dtype="int16",
            blocksize=WINDOW_SIZE_SAMPLES,
            channels=1,
            callback=_fn,
        ):
            # Signal readiness only after the stream context was entered successfully.
            ready_ev.set()
            print("録音開始")
            while not shutdown_ev.is_set():
                sd.sleep(500)
            print("録音終了")
    finally:
        # Always announce that recording is over, even if the stream could not
        # be opened or the loop was interrupted by an exception.
        shutdown_ev.set()


def main():
    """Detect utterances in live microphone audio with the Silero VAD model.

    Starts ``read_sound`` in a child process, receives int16 audio blocks over
    a pipe, scores every ``WINDOW_SIZE_SAMPLES``-sample chunk with the VAD
    model and prints a summary line whenever an utterance ends. Runs until the
    recorder stops (``shutdown_ev`` set or child process gone) or the user
    presses Ctrl-C; the child process is always shut down on the way out.
    """
    speech_threshold = 0.2  # probabilities above this are treated as speech
    max_silent_chunks = 20  # this many consecutive silent chunks end an utterance

    ready_ev = mp.Event()
    shutdown_ev = mp.Event()
    sound_conn_w, sound_conn_r = mp.Pipe()
    p = mp.Process(target=read_sound, args=(ready_ev, shutdown_ev, sound_conn_w))

    p.start()

    try:
        # Loaded inside the try block so that a failed load still stops the child.
        vad_model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")

        # Best effort: continue after the timeout; the loop below copes with a
        # recorder that never delivers data.
        ready_ev.wait(timeout=10)

        chunk_list = []
        # Start in the "silence" state so that audio captured before the first
        # detected speech is not reported as an utterance.
        no_speech_count = max_silent_chunks
        while not shutdown_ev.is_set():
            # Poll with a timeout instead of blocking in recv_bytes(): this
            # process also holds the write end of the pipe, so a dead recorder
            # would never produce EOF and the read would hang forever.
            if not sound_conn_r.poll(0.5):
                if not p.is_alive():
                    break
                continue
            buf = sound_conn_r.recv_bytes()
            wav_data = np.frombuffer(buf, dtype=np.int16)
            # Silero VAD expects float32 audio scaled to [-1, 1]; astype() also
            # yields a writable copy of the read-only buffer view.
            sound_tensor = torch.from_numpy(wav_data.astype(np.float32) / 32768.0)
            for ii in range(0, len(sound_tensor), WINDOW_SIZE_SAMPLES):
                chunk = sound_tensor[ii : ii + WINDOW_SIZE_SAMPLES]
                speech_prob = vad_model(chunk, SAMPLING_RATE).item()
                if speech_prob > speech_threshold:  # speech detected
                    no_speech_count = 0
                else:  # no speech in this chunk
                    no_speech_count += 1

                if no_speech_count < max_silent_chunks:
                    # Still inside an utterance (short pauses are kept as part of it).
                    print("発話中")
                    chunk_list.append(chunk)
                elif chunk_list:  # end of an utterance
                    array = torch.cat(chunk_list).cpu().numpy()
                    print(f"発話終了: {len(array)} {array.dtype} {array.shape}")
                    chunk_list = []

    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop; cleanup happens in ``finally``.
        pass
    finally:
        # Ask the recorder to stop gracefully first, then escalate if needed.
        shutdown_ev.set()
        p.join(timeout=10)
        if p.is_alive():
            p.terminate()
            p.join(timeout=10)
        if p.is_alive():
            p.kill()
            p.join()
        sound_conn_r.close()
        sound_conn_w.close()


# Run the detector only when executed as a script. main() starts a
# multiprocessing child, so the module must stay importable without side effects.
if __name__ == "__main__":
    main()