UnityWebGL使用sherpa-ncnn实时语音识别

k2-fsa/sherpa-ncnn:在没有互联网连接的情况下使用带有 ncnn 的下一代 Kaldi 进行实时语音识别。支持iOS、Android、Raspberry Pi、VisionFive2、LicheePi4A等。 (github.com)

如果是PC端可以直接使用ssssssilver大佬的 https://github.com/ssssssilver/sherpa-ncnn-unity.git

我这边要折腾的是WebGL版本的,所以修改了一番

1、WebSocket,客户端使用了psygames/UnityWebSocket: 🐳 The Best Unity WebSocket Plugin for All Platforms. (github.com)

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using UnityEngine;
using UnityEngine.UI;
using UnityWebSocket;

/// <summary>
/// Unity component that streams microphone samples to a speech-recognition
/// WebSocket server and appends the recognized text it receives to a UI label.
/// </summary>
public class uSherpaWebGL : MonoBehaviour
{
    // Connection to the recognition server; created in Start().
    IWebSocket ws;

    // UI label that recognized text is appended to.
    public Text text;

    // Messages received from the server, waiting to be shown by Update().
    Queue<string> msgs = new Queue<string>();

    /// <summary>Creates the WebSocket, wires its events and starts connecting.</summary>
    void Start()
    {
        ws = new WebSocket("ws://127.0.0.1:9999");
        ws.OnOpen += OnOpen;
        ws.OnMessage += OnMessage;
        ws.OnError += OnError;
        ws.OnClose += OnClose;
        ws.ConnectAsync();
    }

    /// <summary>Appends every pending server message to the label.</summary>
    void Update()
    {
        // Drain the whole queue so the display does not lag behind when several
        // messages arrive between two frames.
        while (msgs.Count > 0)
        {
            text.text += msgs.Dequeue();
        }
    }

    /// <summary>
    /// Entry point for captured audio: logs the sample count and forwards the
    /// samples to the server.
    /// </summary>
    /// <param name="input">Audio samples; null or empty input is ignored.</param>
    public void OnData(float[] input)
    {
        if (input == null || input.Length == 0)
        {
            return;
        }

        Debug.Log("input.Length:" + input.Length);
        SendData(input);
    }

    /// <summary>
    /// Sends the samples as one binary frame containing the raw bytes of the
    /// float array. Nothing is sent (or allocated) unless the socket is open.
    /// </summary>
    /// <param name="input">Audio samples to send.</param>
    void SendData(float[] input)
    {
        if (input == null || input.Length == 0)
        {
            return;
        }

        if (ws == null || ws.ReadyState != WebSocketState.Open)
        {
            return;
        }

        // Buffer.BlockCopy copies the raw bytes of a primitive array safely.
        // The previous Marshal.UnsafeAddrOfPinnedArrayElement call requires the
        // array to be pinned, which it never was.
        var payload = new byte[Buffer.ByteLength(input)];
        Buffer.BlockCopy(input, 0, payload, 0, payload.Length);
        ws.SendAsync(payload);
    }

    /// <summary>Logs that the connection was established.</summary>
    void OnOpen(object sender, OpenEventArgs e)
    {
        Debug.Log("WS connected!");
    }

    /// <summary>
    /// Queues an incoming message for display. Binary frames are decoded as
    /// UTF-8; text frames are queued as-is.
    /// </summary>
    void OnMessage(object sender, MessageEventArgs e)
    {
        if (e.IsBinary)
        {
            string str = Encoding.UTF8.GetString(e.RawData);
            Debug.Log("WS received message: " + str);
            msgs.Enqueue(str);
        }
        else if (e.IsText)
        {
            // Text frames used to be dropped silently; show them as well.
            Debug.Log("WS received message: " + e.Data);
            msgs.Enqueue(e.Data);
        }
    }

    /// <summary>Logs a WebSocket error.</summary>
    void OnError(object sender, ErrorEventArgs e)
    {
        Debug.Log("WS error: " + e.Message);
    }

    /// <summary>Logs the close status code and reason.</summary>
    void OnClose(object sender, CloseEventArgs e)
    {
        Debug.Log(string.Format("Closed: StatusCode: {0}, Reason: {1}", e.StatusCode, e.Reason));
    }

    /// <summary>Closes the socket when the application quits, if it is not already closed.</summary>
    private void OnApplicationQuit()
    {
        if (ws != null && ws.ReadyState != WebSocketState.Closed)
        {
            ws.CloseAsync();
        }
    }
}

服务器端使用了Fleck

// See https://aka.ms/new-console-template for more information
using Fleck;
using System.Text;

namespace uSherpaServer
{
    /// <summary>
    /// Console host that accepts raw float audio over a WebSocket (Fleck), feeds it
    /// to a sherpa-ncnn streaming recognizer and sends the newly recognized text back
    /// to the most recently connected client as UTF-8 bytes.
    /// </summary>
    internal class Program
    {
        // Recognizer and the single stream shared by all incoming audio.
        static SherpaNcnn.OnlineRecognizer recognizer;
        static SherpaNcnn.OnlineStream onlineStream;

        // Model file names, resolved relative to modelPath.
        static string tokensPath = "tokens.txt";
        static string encoderParamPath = "encoder_jit_trace-pnnx.ncnn.param";
        static string encoderBinPath = "encoder_jit_trace-pnnx.ncnn.bin";
        static string decoderParamPath = "decoder_jit_trace-pnnx.ncnn.param";
        static string decoderBinPath = "decoder_jit_trace-pnnx.ncnn.bin";
        static string joinerParamPath = "joiner_jit_trace-pnnx.ncnn.param";
        static string joinerBinPath = "joiner_jit_trace-pnnx.ncnn.bin";
        static int numThreads = 1;
        static string decodingMethod = "greedy_search";

        static string modelPath;

        // Used both for the feature config and as the rate reported to AcceptWaveform.
        static float sampleRate = 16000;

        // Connection that receives recognition results (the last one opened).
        // Written from Fleck callbacks and read by the Update() loop.
        static IWebSocketConnection client;

        /// <summary>
        /// Builds the recognizer configuration, creates the recognizer and its stream,
        /// starts the WebSocket server and then runs the decode loop, which never returns.
        /// </summary>
        static void Main(string[] args)
        {
            // The model folder must be copied next to the executable, so resolve it
            // from the application base directory rather than the working directory.
            modelPath = Path.Combine(AppContext.BaseDirectory, "sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16");

            // Recognizer configuration.
            SherpaNcnn.OnlineRecognizerConfig config = new SherpaNcnn.OnlineRecognizerConfig
            {
                FeatConfig = { SampleRate = sampleRate, FeatureDim = 80 },
                ModelConfig = {
                    Tokens = Path.Combine(modelPath, tokensPath),
                    EncoderParam = Path.Combine(modelPath, encoderParamPath),
                    EncoderBin = Path.Combine(modelPath, encoderBinPath),
                    DecoderParam = Path.Combine(modelPath, decoderParamPath),
                    DecoderBin = Path.Combine(modelPath, decoderBinPath),
                    JoinerParam = Path.Combine(modelPath, joinerParamPath),
                    JoinerBin = Path.Combine(modelPath, joinerBinPath),
                    UseVulkanCompute = 0,
                    NumThreads = numThreads
                },
                DecoderConfig = {
                    DecodingMethod = decodingMethod,
                    NumActivePaths = 4
                },
                EnableEndpoint = 1,
                Rule1MinTrailingSilence = 2.4F,
                Rule2MinTrailingSilence = 1.2F,
                Rule3MinUtteranceLength = 20.0F
            };

            // Create the recognizer and its online stream.
            recognizer = new SherpaNcnn.OnlineRecognizer(config);
            onlineStream = recognizer.CreateStream();

            StartWebServer();

            // Update() loops forever and keeps the process alive.
            Update();
        }

        /// <summary>
        /// Starts a WebSocket server on ws://127.0.0.1:9999 and registers the
        /// open/close/binary handlers for every incoming connection.
        /// </summary>
        static void StartWebServer()
        {
            // Pool of currently open connections.
            var connectSocketPool = new List<IWebSocketConnection>();
            // WebSocket server listening on local port 9999.
            var server = new WebSocketServer("ws://127.0.0.1:9999");
            // Start listening.
            server.Start(socket =>
            {
                // Connection opened: it becomes the receiver of recognition results.
                socket.OnOpen = () =>
                {
                    client = socket;
                    Console.WriteLine("Open");
                    connectSocketPool.Add(socket);
                };
                // Connection closed.
                socket.OnClose = () =>
                {
                    // Only forget the receiver if it is the socket that closed;
                    // otherwise an older connection closing would cut off the
                    // client that is still connected.
                    if (ReferenceEquals(client, socket))
                    {
                        client = null;
                    }
                    Console.WriteLine("Close");
                    connectSocketPool.Remove(socket);
                };
                // Binary frame received: interpret the bytes as 32-bit floats.
                // NOTE(review): assumes sender and server share the same byte order - confirm.
                socket.OnBinary = message =>
                {
                    // Copy only whole floats; copying message.Length bytes would throw
                    // when the payload size is not a multiple of 4.
                    int sampleCount = message.Length / sizeof(float);
                    if (sampleCount == 0)
                    {
                        return;
                    }

                    float[] floatArray = new float[sampleCount];
                    Buffer.BlockCopy(message, 0, floatArray, 0, sampleCount * sizeof(float));
                    // Hand the captured audio to the recognizer stream.
                    onlineStream.AcceptWaveform(sampleRate, floatArray);
                };
            });
        }

        // Text of the current utterance as of the previous loop iteration.
        static string lastText = "";

        /// <summary>
        /// Returns the part of <paramref name="current"/> that follows
        /// <paramref name="previous"/> when it is a prefix; otherwise returns
        /// <paramref name="current"/> unchanged.
        /// </summary>
        static string GetNewText(string previous, string current)
        {
            if (string.IsNullOrEmpty(previous))
            {
                return current;
            }

            // Strip only the leading occurrence. string.Replace would also remove
            // later repetitions of the previous text inside the new text.
            return current.StartsWith(previous, StringComparison.Ordinal)
                ? current.Substring(previous.Length)
                : current;
        }

        /// <summary>
        /// Decode loop: every 200 ms decodes all ready audio, sends the newly
        /// recognized text to the current client and, on an endpoint, sends "。"
        /// and resets the stream. Never returns.
        /// </summary>
        static void Update()
        {
            while (true)
            {
                // Decode everything that is ready. A single Decode per tick can
                // fall behind the incoming audio.
                while (recognizer.IsReady(onlineStream))
                {
                    recognizer.Decode(onlineStream);
                }

                var text = recognizer.GetResult(onlineStream).Text;
                bool isEndpoint = recognizer.IsEndpoint(onlineStream);

                // Snapshot the receiver: a Fleck callback may set the field to null
                // between a null check and the Send call.
                var receiver = client;

                if (!string.IsNullOrWhiteSpace(text) && lastText != text)
                {
                    if (receiver != null)
                    {
                        receiver.Send(Encoding.UTF8.GetBytes(GetNewText(lastText, text)));
                    }
                    lastText = text;
                }

                if (isEndpoint)
                {
                    if (!string.IsNullOrWhiteSpace(text) && receiver != null)
                    {
                        receiver.Send(Encoding.UTF8.GetBytes("。"));
                    }
                    recognizer.Reset(onlineStream);
                    // Start the next utterance from scratch; otherwise an utterance
                    // identical to the previous one would never be sent.
                    lastText = "";
                }
                Thread.Sleep(200); // ms
            }
        }
    }
}

2、Unity录音插件使用了uMicrophoneWebGL,绑定其DataEvent事件(例如绑定到上面 uSherpaWebGL 的 OnData 方法)即可实时获取话筒数据(float数组)

最后放上工程地址

客户端 uSherpa: fork 自 https://github.com/ssssssilver/sherpa-ncnn-unity.git ,改成 Unity WebGL 版

服务器端 GitHub - xue-fei/uSherpaServer: uSherpaServer 给Unity提供流式语音识别的websocket服务

相关推荐

  1. 离线语音识别 sherpa-ncnn 尝鲜体验

    2024-05-02 10:20:02       55 阅读
  2. Python使用whisper实现语音识别(ASR)

    2024-05-02 10:20:02       44 阅读

最近更新

  1. docker php8.1+nginx base 镜像 dockerfile 配置

    2024-05-02 10:20:02       94 阅读
  2. Could not load dynamic library ‘cudart64_100.dll‘

    2024-05-02 10:20:02       101 阅读
  3. 在Django里面运行非项目文件

    2024-05-02 10:20:02       82 阅读
  4. Python语言-面向对象

    2024-05-02 10:20:02       91 阅读

热门阅读

  1. 爬虫学习--3.Requests模块

    2024-05-02 10:20:02       26 阅读
  2. C++:现代类型转换

    2024-05-02 10:20:02       25 阅读
  3. 安卓ComponentName简介及使用

    2024-05-02 10:20:02       25 阅读
  4. Set实现(3)| TreeSet

    2024-05-02 10:20:02       23 阅读
  5. MySQL-配置文件

    2024-05-02 10:20:02       27 阅读
  6. Spark运行流程及架构设计

    2024-05-02 10:20:02       25 阅读
  7. 数据量比较大 | 分库分表?

    2024-05-02 10:20:02       28 阅读
  8. rust 使用记录

    2024-05-02 10:20:02       28 阅读
  9. 美国CADS (原爱因斯坦NCPS计划) 2024年进展

    2024-05-02 10:20:02       25 阅读
  10. C++进阶——STL

    2024-05-02 10:20:02       27 阅读
  11. OceanBase在实际应用中有哪些优势和不足?

    2024-05-02 10:20:02       88 阅读