将sherpa-onnx嵌入Unity中进行语音识别(ASR)
起因
因为有语音识别需求,且需要低延迟,在查找了很多ASR项目后,偶然间发现sherpa-onnx比较契合我的需求,因为可以本地部署还免费,查找了不少资料后发现很少有sherpa-onnx+unity的资料,因此尝试自己研究下。
sherpa-onnx
项目基础介绍
sherpa-onnx 是一个开源项目,旨在提供离线的语音识别、文本转语音、说话人识别和语音活动检测(VAD)功能。该项目支持很多语言,其中包括C#,在原项目文件中有C#示例,因此可以移植到unity中,本文参考了sherpa-onnx的TTS实现和原项目C#示例代码。
首先该项目进行语音识别需要语音识别大模型,我这边以 sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 模型作为测试,要下载该模型请自行去该项目的官方网址去下载。
然后需要导入相关dll库和lib库,我在原项目文件中找了很久都没找到,最后在他的示例项目中找到。
现在万事俱备,接下来可以开始了。
首先在官方的C#示例代码中有流式和非流式的示例代码,此处先以流式作为例子,至于非流式道理差不多。直接上代码:
using SherpaOnnx;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using UnityEngine;

/// <summary>
/// Streaming speech recognition (ASR) demo that feeds Unity microphone audio
/// into a sherpa-onnx online (streaming) zipformer transducer model, and
/// optionally post-processes each finished segment with the sherpa-onnx
/// punctuation model. Attach to any GameObject; results are written via Debug.Log.
/// </summary>
public class MyAsr_Test : MonoBehaviour
{
    // Microphone capability range reported by the OS; (0, 0) means "any rate is supported".
    private static int minFreq, maxFreq;

    // Polling interval for pulling fresh samples out of the looping microphone clip.
    WaitForSeconds seconds = new WaitForSeconds(0.2f);

    /// <summary>
    /// Sample position (within the 1-second ring buffer) up to which audio
    /// has already been handed to the recognizer.
    /// </summary>
    int lastSampling;

    // Scratch buffer holding one second of 16 kHz mono samples — must match
    // the Microphone.Start(…, 1, 16000) parameters below.
    float[] f = new float[16000];

    AudioClip microphoneClip;

    // Whether captured audio is currently being forwarded to the recognizer.
    // (Renamed from the misspelled "recoeding".)
    bool recording = true;

    /// <summary>
    /// Detects a microphone device and queries its supported frequency range.
    /// </summary>
    /// <returns>true if at least one microphone device is available.</returns>
    bool Init()
    {
        if (Microphone.devices.Length > 0)
        {
            Debug.Log($"设备名称为:{Microphone.devices[0]}");
            Microphone.GetDeviceCaps(null, out minFreq, out maxFreq);
            // Unity reports (0, 0) when the device supports any sample rate.
            if (minFreq == 0 && maxFreq == 0)
            {
                maxFreq = 44100;
            }
            return true;
        }
        return false;
    }

    /// <summary>
    /// Pauses/resumes feeding audio to the recognizer.
    /// Name keeps the original spelling for compatibility with existing references.
    /// </summary>
    void SetRecoeding(bool recoeding)
    {
        this.recording = recoeding;
        // Restart reading from the beginning of the ring buffer when resuming.
        if (recoeding)
            lastSampling = 0;
    }

    private IEnumerator Start()
    {
        // Fix: the original ignored Init()'s result and continued without a microphone.
        if (!Init())
        {
            Debug.LogError("未找到麦克风设备");
            yield break;
        }
        yield return seconds;

        var model_dir = Path.Combine(Application.streamingAssetsPath, "SoundToTextModel");
        var modelDir = Path.Combine(model_dir, "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20");

        // Streaming (online) transducer configuration — values follow the official
        // sherpa-onnx C# streaming example for this model.
        OnlineRecognizerConfig config = new();
        config.FeatConfig.SampleRate = 16000;
        config.FeatConfig.FeatureDim = 80;
        config.ModelConfig.Transducer.Encoder = Path.Combine(modelDir, "encoder-epoch-99-avg-1.onnx");
        config.ModelConfig.Transducer.Decoder = Path.Combine(modelDir, "decoder-epoch-99-avg-1.onnx");
        config.ModelConfig.Transducer.Joiner = Path.Combine(modelDir, "joiner-epoch-99-avg-1.onnx");
        // Paraformer paths left empty: this demo uses the transducer model only.
        config.ModelConfig.Paraformer.Encoder = "";
        config.ModelConfig.Paraformer.Decoder = "";
        config.ModelConfig.Tokens = Path.Combine(modelDir, "tokens.txt");
        config.ModelConfig.Provider = "cpu";
        config.ModelConfig.NumThreads = 1;
        config.ModelConfig.Debug = 0;
        config.DecodingMethod = "greedy_search";
        config.MaxActivePaths = 4;
        // Endpoint rules: a segment ends after enough trailing silence or length.
        config.EnableEndpoint = 1;
        config.Rule1MinTrailingSilence = 2.4f;
        config.Rule2MinTrailingSilence = 0.8f;
        config.Rule3MinUtteranceLength = 20;

        #region Optional punctuation model
        // Fix: a missing punctuation model previously aborted the whole coroutine
        // (yield break); now recognition simply runs without punctuation.
        OfflinePunctuation offlinePunctuation = null;
        string model_path = Path.Combine(model_dir, "sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12", "model.onnx");
        if (File.Exists(model_path))
        {
            OfflinePunctuationModelConfig opmc = new OfflinePunctuationModelConfig();
            opmc.CtTransformer = model_path;
            opmc.NumThreads = 2;
            opmc.Provider = "cpu";
            opmc.Debug = 1;
            OfflinePunctuationConfig opc = new OfflinePunctuationConfig();
            opc.Model = opmc;
            offlinePunctuation = new OfflinePunctuation(opc);
        }
        #endregion

        OnlineRecognizer recognizer = new(config);
        OnlineStream s = recognizer.CreateStream();
        string lastText = string.Empty;
        int segmentIndex = 0;

        // Wait for the microphone permission prompt to resolve.
        yield return Application.RequestUserAuthorization(UserAuthorization.Microphone);
        if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
            yield break;

        // Start a 1-second looping capture at 16 kHz (the model's expected rate).
        do
        {
            microphoneClip = Microphone.Start(null, true, 1, 16000);
            yield return null;
        } while (!Microphone.IsRecording(null));

        // Fix: release the native sherpa-onnx handles on quit — they wrap
        // unmanaged memory and were never disposed in the original.
        Application.quitting += () =>
        {
            Microphone.End(null);
            s.Dispose();
            recognizer.Dispose();
            offlinePunctuation?.Dispose();
        };

        Debug.Log("开始录音");
        while (true)
        {
            yield return seconds;
            int currentPos = Microphone.GetPosition(null);
            bool isSucceed = microphoneClip.GetData(f, 0);
            if (!recording) continue;

            if (isSucceed && lastSampling != currentPos)
            {
                // Copy only the samples captured since the last poll. The clip is a
                // ring buffer: a wrapped position means two segments (tail + head).
                float[] p;
                if (currentPos > lastSampling)
                {
                    p = new float[currentPos - lastSampling];
                    Array.Copy(f, lastSampling, p, 0, p.Length);
                }
                else
                {
                    int tail = f.Length - lastSampling;
                    p = new float[tail + currentPos];
                    Array.Copy(f, lastSampling, p, 0, tail);
                    Array.Copy(f, 0, p, tail, currentPos);
                }
                lastSampling = currentPos;
                s.AcceptWaveform(16000, p);
            }

            // Drain all frames the recognizer is ready to decode.
            while (recognizer.IsReady(s))
            {
                recognizer.Decode(s);
            }

            var text = recognizer.GetResult(s).Text;
            bool isEndpoint = recognizer.IsEndpoint(s);

            // Log intermediate (partial) results only when they change.
            if (!string.IsNullOrWhiteSpace(text) && lastText != text)
            {
                lastText = text;
                Debug.Log($"\r{segmentIndex}: {lastText}");
            }

            // On an endpoint, finalize the segment (adding punctuation if the
            // optional model was loaded) and reset the stream for the next one.
            if (isEndpoint)
            {
                if (!string.IsNullOrWhiteSpace(text))
                {
                    ++segmentIndex;
                    lastText = offlinePunctuation != null ? offlinePunctuation.AddPunct(text) : text;
                    Debug.Log($"\r{segmentIndex}: {lastText}");
                }
                recognizer.Reset(s);
            }
        }
    }
}
上述是一个测试脚本,引入了sherpa-onnx的自动添加标点模型(该模型同样需去官网自行下载)。因为仅使用sherpa-onnx的ASR语音识别模型时,识别结果不会自动添加标点符号;如果不需要自动添加标点,可以将相关部分的代码删掉。
将该脚本随便挂在一个物体上就可以测试了,个人觉得还行(这里以音乐来测试)。