Skip to content

[求助]关于远场拾音问题 #3146

@HughKyle

Description

@HughKyle

💡 改进描述

全模块部署,最新版本的代码。

当我与设备之间的距离超过1-1.5m后,vad识别成功率大幅度下降;超过3m后,vad基本上没有触发过。后来我调整了vad的相关参数(threshold = 0.2、threshold_low = 0.1、min_silence_duration_ms = 700、frame_window_threshold = 2),也不能很好地解决这个问题,时行时不行,不够稳定。这种问题是需要从设备端入手,还是从服务端入手去解决?

我在下面会附上我的一次测试示例。简单说明一下测试情况,在vad中增加关键日志与音频保存功能,在3m左右的距离,我一共说了6句话,一句都没有识别出来,但是有两个时间段中分别有几帧的数据被判定为true。但是在保存的音频文件(1分30秒左右)中我可以清楚的听见、听明白6句话的内容,并且音频在整个过程中始终伴随着“呼呼”声或“滋滋”声。

🌟 改进建议

希望在远场时也能很好的听见人声,让vad能判断成功。

🛠️ 相关代码

debug_farfield_1777373221.wav

完整日志.txt

import time
import os
import wave

import numpy as np
import opuslib_next
import onnxruntime
from config.logger import setup_logging
from core.providers.vad.base import VADProviderBase

# Tag attached to every log record emitted from this module. It must be the
# module's dotted name (`__name__`); the bare identifier `name` is undefined
# and would raise NameError at import time.
TAG = __name__
logger = setup_logging()

class VADProvider(VADProviderBase):
    """Streaming voice-activity detector backed by the Silero VAD ONNX model.

    One provider instance owns the ONNX session; all streaming state (Opus
    decoder, recurrent model state, audio context, debug WAV handle) is stored
    on the connection object passed to each call, so a single provider can
    serve several connections.
    """

    # The model is fed 512-sample windows of 16-bit mono PCM (32 ms at 16 kHz).
    _WINDOW_SAMPLES = 512
    _WINDOW_BYTES = _WINDOW_SAMPLES * 2
    # Number of trailing samples of the previous window prepended to the next.
    _CONTEXT_SAMPLES = 64
    _SAMPLE_RATE = 16000
    # Frame size handed to the Opus decoder (960 samples = 60 ms at 16 kHz).
    _OPUS_FRAME_SAMPLES = 960
    # Per-frame debug lines are only logged above this RMS level to limit noise.
    _DEBUG_LOG_MIN_RMS = 30

    def __init__(self, config):
        """Load the ONNX model and read the detection thresholds.

        Args:
            config: Mapping with a required ``model_dir`` entry and optional
                ``threshold``, ``threshold_low``, ``min_silence_duration_ms``
                and ``frame_window_threshold`` entries. Empty/falsy values
                fall back to the same defaults used for missing keys.
        """
        # NOTE(review): `config` is passed as a format argument but the message
        # has no placeholder, so it is probably not rendered — confirm intent.
        logger.bind(tag=TAG).info("SileroVAD", config)

        model_path = os.path.join(
            config["model_dir"], "src", "silero_vad", "data", "silero_vad.onnx"
        )
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            model_path, providers=["CPUExecutionProvider"], sess_options=opts
        )

        threshold = config.get("threshold", "0.5")
        threshold_low = config.get("threshold_low", "0.1")
        min_silence_duration_ms = config.get("min_silence_duration_ms", "1000")
        frame_window_threshold = config.get("frame_window_threshold", 2)

        # Upper / lower bounds of the hysteresis band applied to the speech
        # probability. The fallback for an empty value matches the default
        # used for a missing key.
        self.vad_threshold = float(threshold) if threshold else 0.5
        self.vad_threshold_low = float(threshold_low) if threshold_low else 0.1

        # Silence (ms) after the last voiced window before the utterance ends.
        self.silence_threshold_ms = (
            int(min_silence_duration_ms) if min_silence_duration_ms else 1000
        )

        # Minimum number of voiced entries in the sliding window required to
        # report "voice present".
        self.frame_window_threshold = (
            int(frame_window_threshold) if frame_window_threshold else 2
        )

    def _init_connection_state(self, conn):
        """Create the per-connection decoder and model state if missing."""
        if not hasattr(conn, "_vad_opus_decoder"):
            conn._vad_opus_decoder = opuslib_next.Decoder(self._SAMPLE_RATE, 1)
        if not hasattr(conn, "_vad_state"):
            conn._vad_state = np.zeros((2, 1, 128), dtype=np.float32)
        if not hasattr(conn, "_vad_context"):
            conn._vad_context = np.zeros(
                (1, self._CONTEXT_SAMPLES), dtype=np.float32
            )

    def release_conn_resources(self, conn):
        """Release the connection's VAD resources (call when it closes).

        Closes the debug WAV file if one is open, then removes every attribute
        this provider attached to ``conn``.
        """
        wav_file = getattr(conn, "debug_wav_file", None)
        if wav_file is not None:
            try:
                wav_file.close()
                logger.bind(tag=TAG).info("WAV 录音文件已保存并关闭。可以去根目录播放了!")
            except Exception as e:
                logger.bind(tag=TAG).error(f"关闭 WAV 文件失败: {e}")

        # `debug_wav_file` is removed as well so that a later `is_vad` call on
        # the same object opens a fresh file instead of writing to a closed one.
        for attr in (
            "debug_wav_file",
            "_vad_opus_decoder",
            "_vad_state",
            "_vad_context",
        ):
            if hasattr(conn, attr):
                try:
                    delattr(conn, attr)
                except Exception:
                    # Best-effort cleanup: never let teardown raise.
                    pass

    @staticmethod
    def _open_debug_wav():
        """Open a timestamped 16 kHz / mono / 16-bit WAV file for debugging.

        Returns:
            The open ``wave.Wave_write`` object, or ``None`` if the file could
            not be created.
        """
        file_name = f"debug_farfield_{int(time.time())}.wav"
        try:
            wav_file = wave.open(file_name, "wb")
        except OSError as e:
            logger.bind(tag=TAG).error(
                f"Could not open debug WAV file {file_name}: {e}"
            )
            return None
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(16000)
        logger.bind(tag=TAG).info(f"🎤 开启远场音频录制,保存为: {file_name}")
        return wav_file

    def _record_debug_audio(self, conn, pcm_frame):
        """Append decoded PCM to the connection's debug WAV file.

        Recording is a diagnostic aid, so any failure disables it for this
        connection (``conn.debug_wav_file = None``) instead of propagating and
        breaking voice detection.
        """
        if not hasattr(conn, "debug_wav_file"):
            conn.debug_wav_file = self._open_debug_wav()

        wav_file = conn.debug_wav_file
        if wav_file is None:
            return

        try:
            wav_file.writeframes(pcm_frame)
        except Exception as e:
            conn.debug_wav_file = None
            logger.bind(tag=TAG).error(
                f"Debug WAV write failed, recording disabled: {e}"
            )
            try:
                wav_file.close()
            except Exception as close_error:
                logger.bind(tag=TAG).debug(
                    f"Debug WAV close after failure also failed: {close_error}"
                )

    def _infer_speech_prob(self, conn, audio_int16):
        """Run the model on one window and update the connection's state.

        Args:
            conn: Connection holding ``_vad_state`` and ``_vad_context``.
            audio_int16: 1-D int16 array holding one window of samples.

        Returns:
            The model output as a Python float (used as a speech probability).
        """
        audio_float32 = audio_int16.astype(np.float32) / 32768.0
        audio_input = np.concatenate(
            [conn._vad_context, audio_float32.reshape(1, -1)], axis=1
        ).astype(np.float32)

        ort_inputs = {
            "input": audio_input,
            "state": conn._vad_state,
            "sr": np.array(self._SAMPLE_RATE, dtype=np.int64),
        }
        out, state = self.session.run(None, ort_inputs)

        conn._vad_state = state
        conn._vad_context = audio_input[:, -self._CONTEXT_SAMPLES :]
        return out.item()

    def is_vad(self, conn, opus_packet):
        """Decode one Opus packet and report whether voice is present.

        Side effects on ``conn``: appends decoded PCM to
        ``client_audio_buffer`` and consumes it in 512-sample windows, updates
        ``last_is_voice``, ``client_voice_window``, ``client_have_voice``,
        ``vad_last_voice_time`` and sets ``client_voice_stop`` once silence has
        lasted at least ``silence_threshold_ms``.

        Args:
            conn: Connection object carrying the audio buffer and VAD state.
            opus_packet: One encoded Opus packet.

        Returns:
            ``True`` in manual listen mode or when the sliding window holds at
            least ``frame_window_threshold`` voiced entries; ``False``
            otherwise, including when the packet could not be processed.
        """
        # Manual mode: no real-time detection, every packet counts as voice.
        if conn.client_listen_mode == "manual":
            return True

        try:
            self._init_connection_state(conn)

            pcm_frame = conn._vad_opus_decoder.decode(
                opus_packet, self._OPUS_FRAME_SAMPLES
            )
            conn.client_audio_buffer.extend(pcm_frame)
            self._record_debug_audio(conn, pcm_frame)

            client_have_voice = False
            while len(conn.client_audio_buffer) >= self._WINDOW_BYTES:
                chunk = conn.client_audio_buffer[: self._WINDOW_BYTES]
                conn.client_audio_buffer = conn.client_audio_buffer[
                    self._WINDOW_BYTES :
                ]

                audio_int16 = np.frombuffer(chunk, dtype=np.int16)

                # Debug metrics showing how weak the far-field signal is.
                # Widen before abs(): abs(int16(-32768)) overflows in int16.
                rms_energy = np.sqrt(np.mean(audio_int16.astype(np.float64) ** 2))
                max_amp = int(np.max(np.abs(audio_int16.astype(np.int32))))

                speech_prob = self._infer_speech_prob(conn, audio_int16)

                # Hysteresis: inside the band between the two thresholds the
                # previous decision is kept.
                if speech_prob >= self.vad_threshold:
                    is_voice = True
                elif speech_prob <= self.vad_threshold_low:
                    is_voice = False
                else:
                    is_voice = conn.last_is_voice

                # Per-window debug line, skipped for near-silent windows so the
                # log is not flooded.
                if rms_energy > self._DEBUG_LOG_MIN_RMS:
                    logger.bind(tag=TAG).info(
                        f"VAD Frame | Prob: {speech_prob:.4f} | MaxAmp: {max_amp:5d}/32768 | RMS: {rms_energy:.1f} | is_voice: {is_voice}"
                    )

                conn.last_is_voice = is_voice

                # Update the sliding window and derive the smoothed decision.
                conn.client_voice_window.append(is_voice)
                client_have_voice = (
                    conn.client_voice_window.count(True)
                    >= self.frame_window_threshold
                )

                # Voice was present earlier but not now: the utterance ends
                # once the silence has lasted long enough.
                if conn.client_have_voice and not client_have_voice:
                    stop_duration = time.time() * 1000 - conn.vad_last_voice_time
                    if stop_duration >= self.silence_threshold_ms:
                        conn.client_voice_stop = True
                if client_have_voice:
                    conn.client_have_voice = True
                    conn.vad_last_voice_time = time.time() * 1000

            return client_have_voice
        except opuslib_next.OpusError as e:
            logger.bind(tag=TAG).info(f"解码错误: {e}")
        except Exception as e:
            logger.bind(tag=TAG).error(f"Error processing audio packet: {e}")
        # A packet that could not be processed carries no detectable voice.
        return False

📋 其他信息

部分日志:
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-🎤 开启远场音频录制,保存为: debug_farfield_1777373221.wav
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0783 | MaxAmp: 4439/32768 | RMS: 1910.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0521 | MaxAmp: 4276/32768 | RMS: 2234.2 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0668 | MaxAmp: 7634/32768 | RMS: 3017.6 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0370 | MaxAmp: 6864/32768 | RMS: 3515.7 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0160 | MaxAmp: 5834/32768 | RMS: 2563.7 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0091 | MaxAmp: 5374/32768 | RMS: 2623.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0096 | MaxAmp: 5078/32768 | RMS: 2149.3 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0123 | MaxAmp: 4187/32768 | RMS: 1827.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0086 | MaxAmp: 6903/32768 | RMS: 3037.6 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0164 | MaxAmp: 3898/32768 | RMS: 1596.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0114 | MaxAmp: 3970/32768 | RMS: 1889.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0048 | MaxAmp: 6314/32768 | RMS: 3439.5 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0191 | MaxAmp: 7040/32768 | RMS: 2898.6 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0055 | MaxAmp: 5234/32768 | RMS: 2949.7 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0069 | MaxAmp: 5230/32768 | RMS: 2607.3 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0861 | MaxAmp: 12175/32768 | RMS: 3730.8 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0139 | MaxAmp: 13362/32768 | RMS: 5222.0 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0313 | MaxAmp: 13981/32768 | RMS: 5987.1 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0141 | MaxAmp: 11472/32768 | RMS: 4880.8 | is_voice: False
260428 18:47:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0039 | MaxAmp: 7821/32768 | RMS: 3719.1 | is_voice: False
.........
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0033 | MaxAmp: 19608/32768 | RMS: 7068.2 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0020 | MaxAmp: 7140/32768 | RMS: 2591.1 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0019 | MaxAmp: 6400/32768 | RMS: 2212.0 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0019 | MaxAmp: 7528/32768 | RMS: 2799.1 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0023 | MaxAmp: 6826/32768 | RMS: 2896.9 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0020 | MaxAmp: 10214/32768 | RMS: 4185.1 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0019 | MaxAmp: 10375/32768 | RMS: 3798.9 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0022 | MaxAmp: 7650/32768 | RMS: 3306.6 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0019 | MaxAmp: 7702/32768 | RMS: 3224.7 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0018 | MaxAmp: 7862/32768 | RMS: 3030.0 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0014 | MaxAmp: 7891/32768 | RMS: 3035.8 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0015 | MaxAmp: 8231/32768 | RMS: 2773.4 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0017 | MaxAmp: 8462/32768 | RMS: 3237.0 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0017 | MaxAmp: 8477/32768 | RMS: 3171.9 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0013 | MaxAmp: 10330/32768 | RMS: 4580.0 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0014 | MaxAmp: 7186/32768 | RMS: 2421.9 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0015 | MaxAmp: 6262/32768 | RMS: 1946.8 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0012 | MaxAmp: 6430/32768 | RMS: 2267.5 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0347 | MaxAmp: 15550/32768 | RMS: 4681.1 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.2169 | MaxAmp: 19898/32768 | RMS: 7652.4 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.2886 | MaxAmp: 20602/32768 | RMS: 7402.9 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.1844 | MaxAmp: 12900/32768 | RMS: 3949.7 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.1446 | MaxAmp: 9234/32768 | RMS: 2442.6 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.1116 | MaxAmp: 8276/32768 | RMS: 2798.9 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0838 | MaxAmp: 9943/32768 | RMS: 3981.6 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.1183 | MaxAmp: 10414/32768 | RMS: 3463.0 | is_voice: False
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.4319 | MaxAmp: 12633/32768 | RMS: 5373.2 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.3682 | MaxAmp: 16935/32768 | RMS: 5848.6 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.3874 | MaxAmp: 16401/32768 | RMS: 7078.3 | is_voice: True
260428 18:49:01[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.4958 | MaxAmp: 14718/32768 | RMS: 5972.0 | is_voice: True
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.4374 | MaxAmp: 5595/32768 | RMS: 2037.2 | is_voice: True
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.2591 | MaxAmp: 6169/32768 | RMS: 2895.3 | is_voice: True
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.1131 | MaxAmp: 5183/32768 | RMS: 2690.0 | is_voice: True
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0377 | MaxAmp: 6660/32768 | RMS: 2984.7 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0171 | MaxAmp: 5363/32768 | RMS: 2135.4 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0100 | MaxAmp: 5442/32768 | RMS: 2384.1 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0050 | MaxAmp: 7047/32768 | RMS: 3260.6 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0044 | MaxAmp: 5241/32768 | RMS: 2387.7 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0019 | MaxAmp: 6588/32768 | RMS: 3507.5 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0027 | MaxAmp: 3677/32768 | RMS: 1757.3 | is_voice: False
260428 18:49:02[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0018 | MaxAmp: 7167/32768 | RMS: 3349.2 | is_voice: False
.....
260428 18:49:03[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0010 | MaxAmp: 4939/32768 | RMS: 1855.7 | is_voice: False
260428 18:49:03[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0007 | MaxAmp: 5909/32768 | RMS: 2003.2 | is_voice: False
260428 18:49:03[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0007 | MaxAmp: 8340/32768 | RMS: 3136.7 | is_voice: False
260428 18:49:03[0.9.2_00000000000000][core.providers.vad.silero]-INFO-VAD Frame | Prob: 0.0006 | MaxAmp: 4413/32768 | RMS: 1532.3 | is_voice: False
260428 18:49:03[0.9.2_00000000000000][core.providers.asr.aliyun_stream]-INFO-识别到文本: 退出。
260428 18:49:03[0.9.2_00000000000000][core.providers.asr.base]-INFO-识别文本: 退出。
260428 18:49:03[0.9.2_SiAlAlEdponoCh][core.handle.intentHandler]-INFO-识别到明确的退出命令: 退出
260428 18:49:03[0.9.2_00000000000000][core.providers.vad.silero]-INFO-WAV 录音文件已保存并关闭。可以去根目录播放了!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions