今日、何もできていない。どうしようもない。家にいてもできるような仕事を模索し続ければいいのに。まあ、どうでもいいか。今日はもう時間はあまりないけれど、今日をせいいっぱい生きようと思う。
発話を検出する
import multiprocessing as mp
import multiprocessing.connection as mp_conn
import numpy as np
import sounddevice as sd
import torch
# Audio sample rate: passed as `samplerate` to the input stream and as the
# sampling-rate argument to the VAD model.
SAMPLING_RATE = 16000
# Samples per audio block: used both as the input stream's `blocksize` and as
# the length of each window handed to the VAD model.
WINDOW_SIZE_SAMPLES = 512
def read_sound(ready_ev: mp.Event, shutdown_ev: mp.Event, out_conn: mp_conn.Connection):
    """Capture microphone audio and forward each block over a pipe until shutdown.

    Opens a mono int16 input stream at ``SAMPLING_RATE`` with blocks of
    ``WINDOW_SIZE_SAMPLES`` samples, and sends every captured block to
    ``out_conn`` as raw bytes. The function then idles, re-checking
    ``shutdown_ev`` every 500 ms, and closes the stream once it is set.

    Args:
        ready_ev: Set once the input stream has actually been opened, so a
            waiter is not told "ready" when the audio device failed to open.
        shutdown_ev: Polled to decide when to stop recording. It is always
            set on the way out (including on errors or Ctrl-C), so the other
            side can tell that recording has ended.
        out_conn: Connection that receives each audio block via
            ``send_bytes``.
    """

    def _fn(indata: np.ndarray, frames, time, status):
        # Forward the block verbatim; the stream is opened with dtype="int16",
        # so the receiver can reinterpret the bytes as int16 samples.
        out_conn.send_bytes(indata.tobytes())

    try:
        with sd.InputStream(
            samplerate=SAMPLING_RATE,
            dtype="int16",
            blocksize=WINDOW_SIZE_SAMPLES,
            channels=1,
            callback=_fn,
        ):
            # Signal readiness only after the stream was opened successfully.
            ready_ev.set()
            print("録音開始")
            while not shutdown_ev.is_set():
                sd.sleep(500)
        print("録音終了")
    finally:
        # Always announce termination, even if opening the stream raised or
        # the loop was interrupted, so the consumer does not wait forever.
        shutdown_ev.set()
def main():
    """Record audio in a child process and segment it into utterances with Silero VAD.

    Starts ``read_sound`` in a separate process, loads the Silero VAD model
    through ``torch.hub``, then reads audio blocks from the pipe and scores
    each window of ``WINDOW_SIZE_SAMPLES`` samples. Windows are collected
    while speech is ongoing; after 20 consecutive non-speech windows the
    collected audio is concatenated and a summary line is printed.

    The loop ends when the shutdown event is set, when the recorder process
    has exited and no more data is pending, or on Ctrl-C. The recorder
    process is always stopped and the pipe closed before returning.
    """
    # A window counts as speech when its probability exceeds this threshold.
    speech_threshold = 0.2
    # Number of consecutive non-speech windows that closes an utterance.
    max_silent_windows = 20
    sampling_rate = SAMPLING_RATE
    window_size = WINDOW_SIZE_SAMPLES

    ready_ev = mp.Event()
    shutdown_ev = mp.Event()
    sound_conn_w, sound_conn_r = mp.Pipe()
    p = mp.Process(target=read_sound, args=(ready_ev, shutdown_ev, sound_conn_w))
    p.start()

    try:
        # Loading happens inside the try so that a failure here (e.g. the
        # model cannot be fetched) still stops the recorder process below.
        # Only the model is used; the bundled helper utilities are not.
        vad_model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
        ready_ev.wait(timeout=10)

        chunk_list = []
        # Start in the "silent" state: otherwise the first windows would be
        # reported as speech and emitted as a bogus utterance before any
        # speech has been detected.
        no_speech_count = max_silent_windows

        while not shutdown_ev.is_set():
            # Poll with a timeout instead of blocking in recv_bytes() so the
            # shutdown event is re-checked and a dead recorder is noticed.
            if not sound_conn_r.poll(0.5):
                if not p.is_alive():
                    break
                continue
            buf = sound_conn_r.recv_bytes()
            # The bytes are int16 PCM. Silero VAD's own streaming examples feed
            # float32 audio scaled to [-1, 1], so normalise here. astype()
            # also yields a fresh writable array for torch.from_numpy().
            samples = np.frombuffer(buf, dtype=np.int16).astype(np.float32) / 32768.0
            sound_tensor = torch.from_numpy(samples)
            for ii in range(0, len(sound_tensor), window_size):
                chunk = sound_tensor[ii : ii + window_size]
                speech_prob = vad_model(chunk, sampling_rate).item()
                if speech_prob > speech_threshold:  # speech detected
                    no_speech_count = 0
                else:  # no speech in this window
                    no_speech_count += 1
                if no_speech_count < max_silent_windows:
                    print("発話中")
                    chunk_list.append(chunk)
                elif chunk_list:  # end of an utterance
                    array = torch.cat(chunk_list).cpu().numpy()
                    print(f"発話終了: {len(array)} {array.dtype} {array.shape}")
                    chunk_list = []
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop; fall through to the cleanup.
        pass
    finally:
        # Ask the recorder to stop, then escalate only if it does not exit.
        shutdown_ev.set()
        p.join(timeout=10)
        if p.is_alive():
            p.terminate()
            p.join(timeout=10)
        if p.is_alive():
            p.kill()
            p.join(timeout=10)
        sound_conn_r.close()
        sound_conn_w.close()
# Entry-point guard: run main() only when this file is executed as a script.
if __name__ == "__main__":
    main()