One Article to Explain It All: Super-Simple Offline Speech Synthesis (TTS) and Offline Speech Recognition Deployment
This article covers only two lightweight options:
piper for speech synthesis, vosk for speech recognition.
Deployment is simple; the results are just passable.
Speech Synthesis
Recommended: piper (I haven't used the alternatives).
Installation
On Linux:
pip install piper-tts
Download the model (63 MB)
For Chinese, download the two files zh_CN-huayan-medium.onnx and zh_CN-huayan-medium.onnx.json and put them in the same directory.
The voice sounds pleasant, but spaces, punctuation, and other symbols are not recognized, so there are no pauses in the output; the lack of a better Chinese model is a pity. A possible workaround is sketched below.
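One untested workaround for the missing pauses: split the text at punctuation yourself, synthesize each segment through the piper CLI (which reads its text from stdin, as in the usage example later in this article), and join the clips with a short silence. The model path, temp file names, and gap length here are all assumptions:

# Hypothetical workaround sketch: per-segment synthesis joined with silence.
import re
import subprocess
import wave

MODEL = './zh_CN-huayan-medium.onnx'  # path assumed; adjust to your setup

def synth_with_pauses(text: str, out_path: str, gap_s: float = 0.3):
    # Split on common Chinese sentence punctuation and drop empty pieces
    parts = [p for p in re.split(r'[,。!?;]', text) if p.strip()]
    frames, params = [], None
    for i, part in enumerate(parts):
        clip = f'clip_{i}.wav'
        # piper reads the text from stdin
        subprocess.run(['piper', '--model', MODEL, '--output_file', clip],
                       input=part.encode('utf-8'), check=True)
        with wave.open(clip, 'rb') as w:
            params = w.getparams()
            frames.append(w.readframes(w.getnframes()))
    # gap_s seconds of silence between consecutive clips
    silence = b'\x00' * int(params.framerate * gap_s) * params.sampwidth * params.nchannels
    with wave.open(out_path, 'wb') as out:
        out.setparams(params)
        out.writeframes(silence.join(frames))

synth_with_pauses('今年前5個月,我國貨物貿易進出口總值17.94萬億元', 'out.wav')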
If you only target Windows, the system's Microsoft TTS sounds better than this; one way to drive it from Python is sketched below.
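For reference, a common way to reach the Windows system voices from Python is the pyttsx3 package (a sketch, not tested here; voice quality depends on which SAPI voices are installed):

# Sketch: Windows system TTS via pyttsx3 (pip install pyttsx3)
import pyttsx3

engine = pyttsx3.init()          # uses SAPI5 on Windows
engine.setProperty('rate', 180)  # speaking rate
engine.save_to_file('今年前5個月,我國貨物貿易進出口總值17.94萬億元', 'ms.wav')
engine.runAndWait()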
Model download links:
https://hf-mirror.com/rhasspy/piper-voices
https://hf-mirror.com/rhasspy/piper-voices/tree/main/zh/zh_CN/huayan/medium
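Both files can also be fetched with a couple of lines of Python; the resolve/main URL layout below is an assumption based on how Hugging Face mirrors usually expose raw files:

# Sketch: download the two model files from the mirror
import urllib.request

BASE = ('https://hf-mirror.com/rhasspy/piper-voices/resolve/main/'
        'zh/zh_CN/huayan/medium/')  # URL layout assumed
for name in ('zh_CN-huayan-medium.onnx', 'zh_CN-huayan-medium.onnx.json'):
    urllib.request.urlretrieve(BASE + name, name)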
Usage
Running the command below writes an audio file named 66.wav to the current directory:
echo '今年前5個月,我國貨物貿易進出口總值17.94萬億元' | piper --model ./zh_CN-huayan-medium.onnx --output_file 66.wav
Integration
The code below wraps piper behind a small FastAPI endpoint. It has not been verified; treat it as a starting point:
import subprocess
import os
import asyncio
import uuid

from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel

app = FastAPI()

# Model path and output directory (absolute paths)
# MODEL_PATH = './model/zh_CN-huayan-x_low.onnx'
MODEL_PATH = './model/zh_CN-huayan-medium.onnx'
OUTPUT_DIR = os.path.abspath('./output/')

# Make sure the output directory exists
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


class SynthesizeRequest(BaseModel):
    text: str
    mode: str = 'sync'  # 'sync' or 'async'


# Synchronous TTS generation
def synthesize_text_sync(text: str, output_file: str):
    try:
        # Pipe the text into the piper CLI to generate the audio.
        # Caution: interpolating user text into a shell command is an
        # injection risk; quote/escape it in a real deployment.
        command = f"echo '{text}' | piper --model {MODEL_PATH} --output_file {output_file}"
        print(command)
        result = subprocess.run(
            command,
            check=True, shell=True,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        # Print piper's stdout/stderr for debugging
        print(f"piper stdout: {result.stdout.decode()}")
        print(f"piper stderr: {result.stderr.decode()}")
        if not os.path.exists(output_file):
            raise FileNotFoundError(f"Audio file was not generated: {output_file}")
    except subprocess.CalledProcessError as e:
        raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")


# Asynchronous TTS generation
async def synthesize_text_async(text: str, output_file: str):
    try:
        # A pipe only works through a shell, so use create_subprocess_shell
        # (create_subprocess_exec would pass '|' to echo as a literal argument).
        command = f"echo '{text}' | piper --model {MODEL_PATH} --output_file {output_file}"
        process = await asyncio.create_subprocess_shell(
            command,
            stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await process.communicate()
        print(f"piper stdout: {stdout.decode()}")
        print(f"piper stderr: {stderr.decode()}")
        if process.returncode != 0 or not os.path.exists(output_file):
            raise FileNotFoundError(f"Audio file was not generated: {output_file}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")


@app.post("/synthesize/")
async def synthesize(request: SynthesizeRequest):
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="Text is required")
    # Unique file name (UUID) to avoid collisions between concurrent requests
    unique_id = str(uuid.uuid4())
    output_file = os.path.join(OUTPUT_DIR, f"{unique_id}.wav")
    # Run synchronously or asynchronously depending on the request
    if request.mode == 'sync':
        synthesize_text_sync(text, output_file)
    elif request.mode == 'async':
        await synthesize_text_async(text, output_file)
    else:
        raise HTTPException(status_code=400, detail="Invalid mode, must be 'sync' or 'async'")
    # Make sure the file exists before returning it
    if not os.path.exists(output_file):
        raise HTTPException(status_code=500, detail=f"Audio file generation failed: {output_file}")
    return FileResponse(output_file, media_type='audio/wav')
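A minimal client for the endpoint above might look like this (the host and port are assumptions based on uvicorn's defaults):

# Client sketch for the /synthesize/ endpoint defined above
import requests

resp = requests.post('http://127.0.0.1:8000/synthesize/',
                     json={'text': '測試一下語音合成', 'mode': 'sync'})
resp.raise_for_status()
with open('reply.wav', 'wb') as f:
    f.write(resp.content)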
Speech Recognition
Use vosk:
1. Install Vosk via NuGet.
2. Download a Chinese model from the official site (only 42 MB): https://alphacephei.com/vosk/models
The 42 MB model's accuracy is mediocre, but it is easy to use.
There is also a 1.3 GB model; I haven't tested it, but it should be better.
using Vosk;

/// <summary>
/// Speech to text.
/// Download a model from the official site: https://alphacephei.com/vosk/models
/// Unzip the model and place it under the project directory.
/// </summary>
public class VoskDemo
{
    public static void DemoBytes(Model model)
    {
        // Byte-buffer demo: basic speech recognition
        VoskRecognizer rec = new VoskRecognizer(model, 16000.0f);
        rec.SetMaxAlternatives(0);
        rec.SetWords(true);
        using (Stream source = File.OpenRead("test.wav"))
        {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
            {
                if (rec.AcceptWaveform(buffer, bytesRead))
                {
                    Console.WriteLine(rec.Result());
                }
                else
                {
                    Console.WriteLine(rec.PartialResult());
                }
            }
        }
        Console.WriteLine(rec.FinalResult());
    }

    public static void DemoFloats(Model model)
    {
        // Float-array demo: streaming-style recognition
        VoskRecognizer rec = new VoskRecognizer(model, 16000.0f);
        using (Stream source = File.OpenRead("test.wav"))
        {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
            {
                // Convert 16-bit PCM samples to floats
                float[] fbuffer = new float[bytesRead / 2];
                for (int i = 0, n = 0; i < fbuffer.Length; i++, n += 2)
                {
                    fbuffer[i] = BitConverter.ToInt16(buffer, n);
                }
                if (rec.AcceptWaveform(fbuffer, fbuffer.Length))
                {
                    Console.WriteLine(rec.Result());
                }
                else
                {
                    Console.WriteLine(rec.PartialResult());
                }
            }
        }
        Console.WriteLine(rec.FinalResult());
    }

    public static void DemoSpeaker(Model model)
    {
        // Speaker identification: requires a separate speaker model
        SpkModel spkModel = new SpkModel("model-spk");
        VoskRecognizer rec = new VoskRecognizer(model, 16000.0f);
        rec.SetSpkModel(spkModel);
        using (Stream source = File.OpenRead("test.wav"))
        {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
            {
                if (rec.AcceptWaveform(buffer, bytesRead))
                {
                    Console.WriteLine(rec.Result());
                }
                else
                {
                    Console.WriteLine(rec.PartialResult());
                }
            }
        }
        Console.WriteLine(rec.FinalResult());
    }

    public static void Test()
    {
        // You can set the level to -1 to disable logging messages
        Vosk.Vosk.SetLogLevel(0);
        Model model = new Model("vosk-model-small-cn-0.22"); // model directory
        //DemoBytes(model);
        //DemoFloats(model);
        DemoSpeaker(model);
    }
}
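The same model also works from Python (pip install vosk). A minimal sketch following the pattern of the official examples, assuming test.wav is 16 kHz mono PCM:

# Python equivalent of the byte-buffer demo above
import wave
import json
from vosk import Model, KaldiRecognizer

model = Model('vosk-model-small-cn-0.22')  # model directory
wf = wave.open('test.wav', 'rb')
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()).get('text', ''))
print(json.loads(rec.FinalResult()).get('text', ''))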