1. First, download and load the models
import torch
import numpy as np
import webrtcvad
import pyaudio
import queue
import threading
from datetime import datetime
from faster_whisper import WhisperModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16
save_directory = "./faster-distil-whiper-large-v3-local"  # Replace with the local path where you saved the model
# en_zh_directory = "./opus-mt-en-zh-local"  # Replace with the local path where you saved the model
en_zh_directory = "./t5-translate-en-ru-zh-base-200-sent-local"  # Replace with the local path where you saved the model
whisperModel = WhisperModel(save_directory, device="cuda", compute_type="float32")
model = T5ForConditionalGeneration.from_pretrained(en_zh_directory)
model.eval()
model.to(device)
tokenizer = T5Tokenizer.from_pretrained(en_zh_directory)
vad = webrtcvad.Vad(3)  # Set VAD aggressiveness (0-3; 3 is the most aggressive)
prefix = 'translate to zh: '
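The two local directories above assume the models have already been downloaded once. One way to produce them is a one-time snapshot_download from the Hugging Face Hub; the repo ids below are assumptions inferred from the directory names and should be replaced with the models you actually use:

from huggingface_hub import snapshot_download

# Assumed repo ids -- substitute your own if they differ
snapshot_download("Systran/faster-distil-whisper-large-v3",
                  local_dir="./faster-distil-whiper-large-v3-local")
snapshot_download("utrobinmv/t5_translate_en_ru_zh_base_200",
                  local_dir="./t5-translate-en-ru-zh-base-200-sent-local")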
2. Configure the microphone
# Initialize PyAudio
p = pyaudio.PyAudio()
# Audio stream parameters
FORMAT = pyaudio.paInt16  # 16-bit audio format
CHANNELS = 1  # Mono
RATE = 16000  # Sample rate (Whisper requires 16 kHz)
FRAME_DURATION = 20  # Frame duration (ms)
CHUNK = int(RATE * FRAME_DURATION / 1000)  # Samples per frame
MIN_SILENCE_DURATION = 0.2  # Minimum silence duration (seconds)
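With RATE = 16000 and FRAME_DURATION = 20, CHUNK works out to 320 samples (640 bytes), which satisfies webrtcvad's requirement of 10, 20 or 30 ms frames at 8/16/32/48 kHz. If the default microphone is not the one you want, PyAudio can list the available input devices; a minimal sketch (passing the chosen index via input_device_index when opening the stream is left to you):

# List input-capable devices so an explicit input_device_index can be chosen if needed
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    if info.get("maxInputChannels", 0) > 0:
        print(i, info["name"], int(info["defaultSampleRate"]))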
3. Build the queue and the shared recording state
# Shared queue for exchanging data between the recording and inference threads
audio_queue = queue.Queue()
silence_frames = 0
silence_frames_lock = threading.Lock()
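Each frame covers 20 ms, so the silence threshold used by the processing thread below, MIN_SILENCE_DURATION * (RATE / CHUNK) = 0.2 * (16000 / 320) = 10 frames, corresponds to roughly 200 ms of continuous non-speech before the buffered audio is flushed.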
4. Build the recording function
# Recording thread
def record_audio():
    global silence_frames
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )
    print("Recording started... press Ctrl+C to stop")
    try:
        while True:
            # Read audio data from the microphone
            data = stream.read(CHUNK)
            audio_data = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
            # Use VAD to detect voice activity
            if vad.is_speech(data, RATE):
                audio_queue.put(audio_data)
                with silence_frames_lock:
                    silence_frames = 0  # Reset the silence counter
            else:
                with silence_frames_lock:
                    silence_frames += 1  # Increment the silence counter
    except KeyboardInterrupt:
        print("Recording stopped")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
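If the inference thread falls behind, PortAudio's input buffer can overflow and stream.read will raise an exception. A common mitigation (a sketch; whether silently dropping frames is acceptable depends on your use case) is to disable the overflow exception:

data = stream.read(CHUNK, exception_on_overflow=False)  # drop overflowed input instead of raising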
5. Build the transcription and translation function
def process_audio():
    global silence_frames
    audio_buffer = np.array([], dtype=np.float32)
    silence_frames = 0
    while True:
        try:
            # Get audio data from the queue
            audio_data = audio_queue.get(timeout=1)  # 1-second timeout
            audio_buffer = np.concatenate((audio_buffer, audio_data))
        except Exception:
            pass  # queue was empty; fall through and check for silence
        # Read the silence counter
        with silence_frames_lock:
            current_silence_frames = silence_frames
        # If silence has exceeded the threshold (or the buffer is long), process the accumulated audio
        if (current_silence_frames > MIN_SILENCE_DURATION * (RATE / CHUNK)) or len(audio_buffer) > 320 * 200:
            if len(audio_buffer) > 0:
                segments, _ = whisperModel.transcribe(
                    audio_buffer,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=200),
                    language="en",
                    condition_on_previous_text=True,
                )
                for segment in segments:
                    if segment.text == "":
                        continue
                    elif segment.text == "Thank you.":
                        print("[%s] %s (%s)" % (str(datetime.now()), "感謝", segment.text))
                    else:
                        src_text = prefix + segment.text
                        input_ids = tokenizer(src_text, return_tensors="pt")
                        generated_tokens = model.generate(**input_ids.to(device))
                        result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                        print("[%s] %s (%s)" % (str(datetime.now()), result[0], segment.text))
                        # result = pipeline(segment.text)
                        # print("[%s] %s (%s)" % (str(datetime.now()), result[0]['translation_text'], segment.text))
            audio_buffer = np.array([], dtype=np.float32)  # dtype fixed: was float16, must match the buffer above
            silence_frames = 0
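The len(audio_buffer) > 320 * 200 guard flushes the buffer once 64,000 samples have accumulated, i.e. about 4 seconds of audio at 16 kHz, so long uninterrupted speech is still transcribed in chunks instead of waiting for silence.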
6. Start the threads and run the program
# Start the recording and inference threads
record_thread = threading.Thread(target=record_audio)
process_thread = threading.Thread(target=process_audio)
record_thread.start()
process_thread.start()
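Note that process_audio loops forever, so the script keeps running even after recording stops. A minimal sketch of one way to handle this (an addition, not part of the original code) is to create the inference thread as a daemon and let the main thread block on the recorder:

process_thread = threading.Thread(target=process_audio, daemon=True)  # exits together with the main thread
process_thread.start()
record_thread.join()  # wait here until recording is stopped with Ctrl+C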
