> 技术文档 > unity 实现文字转语音TTS-Sherpa-onnx_unity sherpa-onnx

unity 实现文字转语音TTS-Sherpa-onnx_unity sherpa-onnx

GitHub - k2-fsa/sherpa-onnx: Speech-to-text, text-to-speech, speaker diarization, speech enhancement, source separation, and VAD using next-gen Kaldi with onnxruntime without Internet connection. Support embedded systems, Android, iOS, HarmonyOS, Raspberry Pi, RISC-V, x86_64 servers, websocket server/client, support 12 programming languages: https://github.com/k2-fsa/sherpa-onnx

GitHub - xue-fei/sherpa-onnx-unity: sherpa-onnx-unity: https://github.com/xue-fei/sherpa-onnx-unity

感谢先驱。

模型地址:vits-melo-tts-zh_en
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2

 ReadMe:

Introduction — sherpa 1.3 documentation: https://k2-fsa.github.io/sherpa/intro.html

此存储库支持在本地运行以下功能

  • 语音转文本(即 ASR);支持流式和非流式
  • 文本转语音(即 TTS)
  • 说话人分离(即 speaker diarization)
  • 说话人识别
  • 说话人验证
  • 语种识别(即 spoken language identification)
  • 音频标记
  • VAD(例如,silero-vad)
  • 语音增强(例如gtcrn)
  • 关键词识别
  • 源分离(例如,spleeter、UVR)

 代码:

using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using System;
using System.Threading;
using System.Linq;

/// <summary>
/// Multithreading helper. Attach it to a GameObject in the scene so that
/// <see cref="Current"/> is assigned in <c>Awake</c> and <c>Update</c> can
/// drain the queued actions on the main thread.
/// </summary>
public class Loom : MonoBehaviour
{
    /// <summary>Upper bound on the number of concurrently running work items started by <see cref="RunAsync"/>.</summary>
    public static int maxThreads = 8;

    // Number of work items currently running; only modified through Interlocked.
    static int numThreads;

    private int _count;

    /// <summary>The instance assigned in <c>Awake</c>; all queueing goes through it.</summary>
    public static Loom Current;

    void Awake()
    {
        Current = this;
    }

    // Actions waiting to be executed on the next Update.
    private List<Action> _actions = new List<Action>();

    /// <summary>An action together with the <c>Time.time</c> value at which it becomes due.</summary>
    public struct DelayedQueueItem
    {
        public float time;
        public Action action;
    }

    // Delayed actions that are not yet due.
    private List<DelayedQueueItem> _delayed = new List<DelayedQueueItem>();

    // Scratch list holding the delayed actions that became due during the current Update.
    List<DelayedQueueItem> _currentDelayed = new List<DelayedQueueItem>();

    /// <summary>Queues <paramref name="action"/> to run during the next <c>Update</c>.</summary>
    /// <param name="action">The delegate to invoke from <c>Update</c>.</param>
    public void QueueOnMainThread(Action action)
    {
        QueueOnMainThread(action, 0f);
    }

    /// <summary>
    /// Queues <paramref name="action"/> to run from <c>Update</c>, optionally after a delay.
    /// </summary>
    /// <param name="action">The delegate to invoke from <c>Update</c>.</param>
    /// <param name="time">
    /// Delay in seconds. A value of 0 queues the action for the next <c>Update</c>;
    /// any other value schedules it for <c>Time.time + time</c>.
    /// </param>
    public void QueueOnMainThread(Action action, float time)
    {
        if (time != 0)
        {
            // NOTE(review): this branch reads Time.time on the calling thread — confirm
            // callers only pass a non-zero delay from the main thread.
            lock (Current._delayed)
            {
                Current._delayed.Add(new DelayedQueueItem { time = Time.time + time, action = action });
            }
        }
        else
        {
            lock (Current._actions)
            {
                Current._actions.Add(action);
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="a"/> on the thread pool. Blocks the caller (sleeping 1 ms at a
    /// time) while <see cref="maxThreads"/> work items are already running.
    /// </summary>
    /// <param name="a">The work to execute on a pool thread.</param>
    /// <returns>Always <c>null</c>; the return type is kept for existing callers.</returns>
    public Thread RunAsync(Action a)
    {
        // Volatile read so the loop always observes decrements made by finishing workers.
        while (Volatile.Read(ref numThreads) >= maxThreads)
        {
            Thread.Sleep(1);
        }

        Interlocked.Increment(ref numThreads);
        ThreadPool.QueueUserWorkItem(RunAction, a);
        return null;
    }

    // Thread-pool entry point: invokes the action and always releases its slot.
    private void RunAction(object action)
    {
        try
        {
            ((Action)action)();
        }
        catch (Exception e)
        {
            // Previously swallowed silently, which hid every failure inside worker code.
            Debug.LogException(e);
        }
        finally
        {
            Interlocked.Decrement(ref numThreads);
        }
    }

    // Scratch list holding the actions being executed during the current Update.
    List<Action> _currentActions = new List<Action>();

    void Update()
    {
        // Snapshot under the lock, then invoke outside it so actions may queue more work.
        lock (_actions)
        {
            _currentActions.Clear();
            _currentActions.AddRange(_actions);
            _actions.Clear();
        }

        foreach (var a in _currentActions)
        {
            a();
        }

        lock (_delayed)
        {
            _currentDelayed.Clear();
            _currentDelayed.AddRange(_delayed.Where(d => d.time <= Time.time));
            foreach (var item in _currentDelayed)
            {
                _delayed.Remove(item);
            }
        }

        foreach (var delayed in _currentDelayed)
        {
            delayed.action();
        }
    }
}

// Usage:
//
// Loom.Current.QueueOnMainThread(() =>
// {
//     // switch to the main thread
//     // todo: main-thread work
// });
//
// Loom.Current.RunAsync(() =>
// {
//     // switch to a worker thread
//     // todo: worker-thread work
// });
using SherpaOnnx;
using System.Collections.Generic;
using UnityEngine;
using AOT;
using System.IO;
using System;
using System.Runtime.InteropServices;

/// <summary>
/// Text-to-speech component built on sherpa-onnx's <c>OfflineTts</c>. The model is
/// loaded on a worker thread in <c>Start</c>; <see cref="Generate"/> synthesizes on a
/// worker thread and the samples delivered through the callback are played via the
/// attached <c>AudioSource</c>.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class Sherpa_TextToSpeech : MonoBehaviour
{
    /// <summary>The instance assigned in <c>Awake</c>.</summary>
    public static Sherpa_TextToSpeech Instance;

    private OfflineTts ot;
    private OfflineTtsGeneratedAudio otga;
    private OfflineTtsConfig config;

    // Kept in a field so the delegate handed to native code stays referenced.
    private OfflineTtsCallback otc;

    private AudioSource audioSource;
    private AudioClip audioClip = null;

    // Replaced with ot.SampleRate once the model has been loaded.
    private int sampleRate = 22050;

    // Samples received from the callback; guarded by lock (audioData).
    private List<float> audioData = new List<float>();

    // Index into audioData of the next sample to hand to the audio clip.
    private int curAudioClipPos = 0;

    /// <summary>Seconds of audio received so far for the current generation.</summary>
    public float audioLength = 0f;

    /// <summary>True once <c>Init</c> has created the <c>OfflineTts</c> instance.</summary>
    public bool initDone { get; private set; } = false;

    #region PathDefine
    // NOTE(review): these initializers read Application.streamingAssetsPath during field
    // initialization — confirm the targeted Unity version permits that.
    private readonly string modelFile = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/model.onnx";
    private readonly string lexiconFile = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/lexicon.txt";
    private readonly string tokensFile = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/tokens.txt";
    private readonly string dictDir = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/dict";
    private readonly string phone_fst = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/phone.fst";
    private readonly string date_fst = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/date.fst";
    private readonly string number_fst = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en/number.fst";
    #endregion

    private void Awake()
    {
        Instance = this;
        audioSource = GetComponent<AudioSource>();
        audioSource.loop = true;
        audioSource.playOnAwake = false;
        DontDestroyOnLoad(gameObject);
    }

    private void Start()
    {
        initDone = false;
        Debug.Log("开始初始化模型,请等待...");
        Loom.Current.RunAsync(Init);
    }

    // Runs on a worker thread: validates the model files and builds the OfflineTts instance.
    private void Init()
    {
        if (!FileCheck())
        {
            return;
        }

        try
        {
            config = new OfflineTtsConfig
            {
                Model =
                {
                    Vits =
                    {
                        Model = modelFile,
                        Lexicon = lexiconFile,
                        Tokens = tokensFile,
                        DictDir = dictDir,
                        NoiseScale = 0.667f,
                        NoiseScaleW = 0.8f,
                        LengthScale = 1f
                    },
                    NumThreads = 5,
                    Debug = 1,
                    Provider = "cpu"
                },
                RuleFsts = $"{phone_fst},{date_fst},{number_fst}",
                MaxNumSentences = 1
            };
            ot = new OfflineTts(config);
            sampleRate = ot.SampleRate;
            otc = new OfflineTtsCallback(StaticOnAudioData);
            initDone = true;
            Loom.Current.QueueOnMainThread(() => Debug.Log("文字转语音初始化完成"));
        }
        catch (Exception e)
        {
            Loom.Current.QueueOnMainThread(() => Debug.LogError("初始化文字转语音时发生错误: " + e.Message));
        }
    }

    // Returns false (after logging which path is missing) when a required model asset is absent.
    private bool FileCheck()
    {
        if (!File.Exists(modelFile))
        {
            Debug.LogError("模型文件不存在: " + modelFile);
            return false;
        }
        if (!File.Exists(lexiconFile))
        {
            Debug.LogError("词典文件不存在: " + lexiconFile);
            return false;
        }
        if (!File.Exists(tokensFile))
        {
            Debug.LogError("tokens文件不存在: " + tokensFile);
            return false;
        }
        if (!Directory.Exists(dictDir))
        {
            Debug.LogError("字典目录不存在: " + dictDir);
            return false;
        }
        return true;
    }

    // Static trampoline for the native callback; forwards to the current instance, or returns 0 if none.
    [MonoPInvokeCallback(typeof(OfflineTtsCallback))]
    private static int StaticOnAudioData(IntPtr samples, int n)
    {
        return Instance?.OnAudioData(samples, n) ?? 0;
    }

    // Copies the n samples at `samples` into audioData and, on the main thread, starts
    // playback once more than two seconds of audio are buffered. Returns n, or 0 when
    // no samples were delivered.
    private int OnAudioData(IntPtr samples, int n)
    {
        if (n <= 0)
        {
            Loom.Current.QueueOnMainThread(() => Debug.LogWarning("收到空的音频数据"));
            return 0;
        }

        float[] tempData = new float[n];
        Marshal.Copy(samples, tempData, 0, n);
        lock (audioData)
        {
            audioData.AddRange(tempData);
        }

        Loom.Current.QueueOnMainThread(() =>
        {
            Debug.Log($"收到音频数据,长度: {n}");
            audioLength += n / (float)sampleRate;
            Debug.Log($"音频长度增加 {n / (float)sampleRate} 秒");

            // Read the count under the lock: the callback thread may be appending concurrently.
            int buffered;
            lock (audioData)
            {
                buffered = audioData.Count;
            }

            if (!audioSource.isPlaying && buffered > sampleRate * 2)
            {
                Debug.Log($"开始播放音频,数据长度: {buffered}");
                curAudioClipPos = 0;
                audioClip = AudioClip.Create("SynthesizedAudio", sampleRate * 2, 1, sampleRate, true, OnAudioRead);
                audioSource.clip = audioClip;
                audioSource.Play();
            }
        });
        return n;
    }

    // PCM reader callback passed to AudioClip.Create.
    private void OnAudioRead(float[] data)
    {
        ExtractAudioData(data);
    }

    /// <summary>
    /// Fills <paramref name="data"/> with the next unread samples from the buffer and
    /// zero-fills whatever remains.
    /// </summary>
    /// <param name="data">Destination buffer.</param>
    /// <returns><c>true</c> if at least one sample was copied.</returns>
    private bool ExtractAudioData(float[] data)
    {
        if (data == null || data.Length == 0)
        {
            return false;
        }

        bool hasData = false;
        int dataIndex = 0;
        lock (audioData)
        {
            if (audioData.Count > 0 && curAudioClipPos < audioData.Count)
            {
                int copyCount = Math.Min(data.Length, audioData.Count - curAudioClipPos);
                audioData.CopyTo(curAudioClipPos, data, 0, copyCount);
                curAudioClipPos += copyCount;
                hasData = copyCount > 0;
                dataIndex = copyCount;
            }
        }

        // Zero-fill the remainder.
        if (dataIndex < data.Length)
        {
            Array.Clear(data, dataIndex, data.Length - dataIndex);
        }
        return hasData;
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> on a worker thread. Samples arrive through the
    /// audio callback while generation runs; the finished audio is then saved to disk.
    /// </summary>
    /// <param name="text">Text to synthesize.</param>
    /// <param name="speed">Speed value forwarded to <c>GenerateWithCallback</c>.</param>
    /// <param name="speakerId">Speaker id forwarded to <c>GenerateWithCallback</c>.</param>
    public void Generate(string text, float speed, int speakerId)
    {
        if (!initDone)
        {
            Debug.LogWarning("文字转语音未完成初始化");
            return;
        }
        if (!File.Exists(modelFile))
        {
            Debug.LogError("模型文件不存在: " + modelFile);
            return;
        }

        Debug.Log($"开始生成语音,文本为:{text}");

        // Resolve the output path here, on the calling thread, instead of touching
        // UnityEngine.Application from the worker.
        // NOTE(review): the file is written by SaveToWaveFile but named ".mp3" — confirm
        // the intended extension before relying on this output.
        string savePath = Application.streamingAssetsPath + "/1.mp3";

        Loom.Current.RunAsync(() =>
        {
            try
            {
                Debug.Log("异步生成语音开始");
                lock (audioData)
                {
                    audioData.Clear();
                    curAudioClipPos = 0;
                }
                audioLength = 0f;

                // Release the previous result before replacing it; otherwise every call
                // leaks the audio object from the prior generation.
                otga?.Dispose();
                otga = null;

                otga = ot.GenerateWithCallback(text, speed, speakerId, otc);
                if (otga.SaveToWaveFile(savePath))
                {
                    Debug.Log("异步生成语音结束,保存完成");
                }
            }
            catch (Exception e)
            {
                Loom.Current.QueueOnMainThread(() => Debug.LogError("生成语音时发生错误: " + e.Message));
            }
        });
    }

    private void OnApplicationQuit()
    {
        ot?.Dispose();
        otga?.Dispose();
        otc = null;
    }
}
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// UI panel that sends the text of <see cref="txt_Des"/> to
/// <c>Sherpa_TextToSpeech</c> when <see cref="btn_Generate"/> is clicked.
/// </summary>
public class TTS_Panel : MonoBehaviour
{
    /// <summary>Text component whose content is synthesized.</summary>
    public Text txt_Des;

    /// <summary>Button that triggers synthesis.</summary>
    public Button btn_Generate;

    void Start()
    {
        btn_Generate.onClick.AddListener(OnGenerateClicked);
    }

    void OnDestroy()
    {
        // Named handler so the subscription can be removed together with the panel.
        if (btn_Generate != null)
        {
            btn_Generate.onClick.RemoveListener(OnGenerateClicked);
        }
    }

    // Click handler: ignores blank input, otherwise requests synthesis at speed 1.0 with speaker 0.
    private void OnGenerateClicked()
    {
        // IsNullOrWhiteSpace already covers null and empty strings.
        if (string.IsNullOrWhiteSpace(txt_Des.text))
        {
            return;
        }

        Sherpa_TextToSpeech tts = Sherpa_TextToSpeech.Instance;
        if (tts == null)
        {
            // Instance is only assigned in Sherpa_TextToSpeech.Awake; without this guard
            // a click would throw a NullReferenceException.
            Debug.LogWarning("Sherpa_TextToSpeech.Instance is not available");
            return;
        }

        tts.Generate(txt_Des.text, 1.0f, 0);
    }
}