feat: Qwen2-Audio语音模型API服务初始版本

This commit is contained in:
2026-04-21 18:04:59 +08:00
commit 8d45553ea6
4 changed files with 499 additions and 0 deletions

131
README.md Normal file
View File

@@ -0,0 +1,131 @@
# Qwen2-Audio 语音模型服务
语音交互模型 API 服务,基于 Qwen2-Audio-7B-Instruct。
## 端口
- **服务端口**: 19018
## 功能
- 语音识别 + 对话生成
- 多轮对话支持
- 文本对话接口(可选)
- 对话历史管理
## 部署步骤
### 1. 安装依赖
```bash
pip install -r requirements.txt
```
### 2. 启动服务
```bash
# 方式一:直接启动
python server.py
# 方式二:使用脚本(可指定端口)
PORT=19018 ./start.sh
```
### 3. 验证服务
```bash
curl http://localhost:19018/
```
返回:
```json
{"status": "ok", "model": "Qwen/Qwen2-Audio-7B-Instruct", "conversations": 0}
```
## API 接口
### 语音推理
```bash
POST /api/voice/inference
Content-Type: multipart/form-data
参数:
- audio: 音频文件 (WAV/MP3/FLAC)
- conversation_id: 对话ID(可选,不传则创建新对话)
- max_length: 最大生成长度(默认 256)
返回:
{
"reply": "你好,有什么可以帮助你的吗?",
"conversation_id": "xxx-xxx-xxx",
"timestamp": "2026-04-21T18:00:00"
}
```
**示例**
```bash
curl -X POST http://localhost:19018/api/voice/inference \
-F "audio=@test.wav"
```
### 文本推理(测试用)
```bash
POST /api/voice/text
Content-Type: multipart/form-data
参数:
- text: 文本消息
- conversation_id: 对话ID可选
返回: 同上
```
### 获取对话历史
```bash
GET /api/voice/conversation/{conversation_id}
```
### 删除对话
```bash
DELETE /api/voice/conversation/{conversation_id}
```
## 多轮对话
第一轮:
```bash
curl -X POST http://localhost:19018/api/voice/inference \
-F "audio=@audio1.wav"
# 返回 conversation_id: "abc-123"
```
第二轮:
```bash
curl -X POST http://localhost:19018/api/voice/inference \
-F "audio=@audio2.wav" \
-F "conversation_id=abc-123"
```
## 环境变量
| 变量 | 说明 | 默认值 |
|------|------|--------|
| PORT | 服务端口 | 19018 |
| MODEL_NAME | 模型名称 | Qwen/Qwen2-Audio-7B-Instruct |
| MAX_HISTORY_TURNS | 最大历史轮数 | 10 |
## 硬件要求
- **GPU**: 推荐 NVIDIA GPU显存 ≥ 16GB
- **CPU**: 可运行,但速度较慢
- **内存**: ≥ 32GB
## 注意事项
1. 模型首次加载需要下载约 15GB,请确保网络畅通
2. 音频会自动转换为 16kHz 单声道格式
3. 对话历史存储在内存中,重启服务会丢失

8
requirements.txt Normal file
View File

@@ -0,0 +1,8 @@
fastapi==0.110.0
uvicorn==0.27.1
python-multipart==0.0.9
librosa==0.10.1
soundfile==0.12.1
torch==2.2.0
transformers==4.38.0
modelscope==1.12.0

342
server.py Normal file
View File

@@ -0,0 +1,342 @@
"""
Qwen2-Audio 模型服务
语音转文字 + 多轮对话
"""
import os
import io
import uuid
import tempfile
import logging
from typing import Optional, List, Dict, Any
from datetime import datetime
import librosa
import soundfile as sf
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# Model handles (lazy-loaded on first use; see load_model)
model = None
processor = None
# Conversation history store (in-memory; could be swapped for Redis).
# NOTE(review): state is lost on restart and is per-process only.
conversations: Dict[str, List[Dict]] = {}
# Configuration (overridable via environment variables)
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2-Audio-7B-Instruct")
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "10"))  # keep at most 10 dialogue turns
SAMPLE_RATE = 16000  # sample rate expected by Qwen2-Audio
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(
    title="Qwen2-Audio Voice Service",
    description="语音交互模型服务",
    version="1.0.0"
)
# CORS: allow-all is a development setting; tighten for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class VoiceResponse(BaseModel):
    """Response payload returned by the voice and text inference endpoints."""
    reply: str              # model-generated reply text
    conversation_id: str    # id to pass back for multi-turn conversation
    timestamp: str          # ISO-8601 timestamp of when the reply was produced
class ConversationHistory(BaseModel):
    """Full stored history of one conversation."""
    conversation_id: str            # conversation being returned
    history: List[Dict[str, Any]]   # alternating user/assistant turn dicts
def load_model():
    """Lazily initialize and return the global (model, processor) pair.

    The heavyweight modelscope import and checkpoint load happen only on
    the first call; subsequent calls return the cached singletons.
    """
    global model, processor
    if model is not None:
        return model, processor
    logger.info(f"Loading model: {MODEL_NAME}")
    from modelscope import Qwen2AudioForConditionalGeneration, AutoProcessor
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        device_map="auto"
    )
    logger.info("Model loaded successfully")
    return model, processor
def process_audio(audio_bytes: bytes) -> tuple:
    """Decode raw uploaded audio bytes into a mono 16 kHz waveform.

    Args:
        audio_bytes: Raw file contents (any format soundfile can read,
            e.g. WAV/FLAC).

    Returns:
        tuple: (audio_array, SAMPLE_RATE) — a 1-D float waveform
        resampled to SAMPLE_RATE.
    """
    # Decode via soundfile straight from memory.
    audio, sr = sf.read(io.BytesIO(audio_bytes))
    # Down-mix multi-channel audio to mono by averaging channels.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # FIX: resample with librosa, which is already imported and listed in
    # requirements.txt. The original imported resampy here, a dependency
    # that is NOT declared in requirements.txt and would crash at runtime
    # on any non-16kHz input.
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio, SAMPLE_RATE
def build_conversation(
    history: List[Dict],
    audio_array: Optional[Any] = None
) -> List[Dict]:
    """Convert stored history turns into the chat-template message format.

    User turns become content lists (an audio entry when the turn carries
    an ``audio`` value, otherwise a text entry); assistant turns keep their
    plain-string content. When *audio_array* is provided, it is appended
    as the newest user message.
    """
    messages = []
    for turn in history:
        if turn["role"] != "user":
            # Assistant turns keep their raw string content.
            messages.append({
                "role": "assistant",
                "content": turn["content"]
            })
        elif turn.get("audio"):
            # Audio message referenced by URL.
            messages.append({
                "role": "user",
                "content": [{"type": "audio", "audio_url": turn["audio"]}]
            })
        else:
            # Plain text message.
            messages.append({
                "role": "user",
                "content": [{"type": "text", "text": turn["content"]}]
            })
    if audio_array is not None:
        # Current audio goes in as an inline array.
        messages.append({
            "role": "user",
            "content": [{"type": "audio", "audio": audio_array}]
        })
    return messages
@app.on_event("startup")
async def startup():
    """Preload the model at process start so the first request isn't slow."""
    logger.info("Preloading model...")
    load_model()
    logger.info("Server ready!")
@app.get("/")
async def root():
    """Health check: report service status, model name and live conversation count."""
    payload = {
        "status": "ok",
        "model": MODEL_NAME,
        "conversations": len(conversations),
    }
    return payload
@app.post("/api/voice/inference", response_model=VoiceResponse)
async def inference(
    audio: UploadFile = File(..., description="音频文件"),
    conversation_id: Optional[str] = Form(None, description="对话ID不传则创建新对话"),
    max_length: int = Form(256, description="最大生成长度")
):
    """
    Voice inference endpoint.

    Accepts an uploaded audio file, runs it through Qwen2-Audio together
    with the stored conversation history, and returns the reply text.
    Pass the returned conversation_id on later calls for multi-turn chat.

    Raises:
        HTTPException(500): on any processing or inference failure.
    """
    try:
        model, processor = load_model()
        # Decode the upload to a mono 16 kHz waveform.
        audio_bytes = await audio.read()
        audio_array, sr = process_audio(audio_bytes)
        # Get or create the conversation.
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            conversations[conversation_id] = []
        history = conversations.get(conversation_id, [])
        # Build the chat-template conversation: prior turns plus the new
        # audio turn (the waveform itself is passed separately via audios).
        conversation = list(history)
        conversation.append({
            "role": "user",
            "content": [{"type": "audio"}]
        })
        text = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )
        # Collect stored historical audio arrays, then the current one.
        # (History turns are saved with audio_array=None to bound memory,
        # so in practice only the current audio is sent.)
        audios = [
            turn["audio_array"]
            for turn in history
            if turn.get("audio_array") is not None
        ]
        audios.append(audio_array)
        inputs = processor(
            text=text,
            audios=audios,
            return_tensors="pt",
            padding=True
        )
        # FIX: the original moved only input_ids, and always to "cuda" —
        # that crashes on CPU-only hosts (which the README claims to
        # support) and leaves input_features/attention_mask behind on CPU.
        # Move every tensor to whatever device the model was placed on.
        inputs = inputs.to(model.device)
        # FIX: use max_new_tokens so max_length bounds the *generated*
        # tokens; generate(max_length=...) also counts the (long) audio
        # prompt and could produce an empty reply.
        generate_ids = model.generate(**inputs, max_new_tokens=max_length)
        # Strip the prompt tokens, keep only the newly generated ones.
        generate_ids = generate_ids[:, inputs.input_ids.size(1):]
        reply = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        # Persist this turn (audio array dropped to keep memory bounded).
        history.append({
            "role": "user",
            "content": "[audio]",
            "audio_array": None
        })
        history.append({
            "role": "assistant",
            "content": reply
        })
        # Cap history at MAX_HISTORY_TURNS user/assistant pairs.
        if len(history) > MAX_HISTORY_TURNS * 2:
            history = history[-MAX_HISTORY_TURNS * 2:]
        conversations[conversation_id] = history
        logger.info(f"Conversation {conversation_id}: {len(history)//2} turns")
        return VoiceResponse(
            reply=reply,
            conversation_id=conversation_id,
            timestamp=datetime.now().isoformat()
        )
    except HTTPException:
        # Don't wrap deliberate HTTP errors in a 500.
        raise
    except Exception as e:
        logger.error(f"Inference error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/voice/text", response_model=VoiceResponse)
async def text_inference(
    text: str = Form(..., description="文本消息"),
    conversation_id: Optional[str] = Form(None, description="对话ID")
):
    """
    Text-only inference endpoint (for testing / pure text chat).

    Mirrors /api/voice/inference but takes a text message instead of audio.

    Raises:
        HTTPException(500): on any processing or inference failure.
    """
    try:
        model, processor = load_model()
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            conversations[conversation_id] = []
        history = conversations.get(conversation_id, [])
        # Prior turns plus the new text message.
        conversation = list(history)
        conversation.append({
            "role": "user",
            "content": [{"type": "text", "text": text}]
        })
        prompt = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )
        inputs = processor(text=prompt, return_tensors="pt", padding=True)
        # FIX: move all tensors to the model's device instead of
        # hardcoding "cuda" (the service is documented to run on CPU too).
        inputs = inputs.to(model.device)
        # FIX: bound the generated tokens rather than prompt+generation.
        generate_ids = model.generate(**inputs, max_new_tokens=256)
        generate_ids = generate_ids[:, inputs.input_ids.size(1):]
        reply = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            # FIX: the original passed a misspelled kwarg
            # 'clean_up_tokenization_tokens'; the real parameter is
            # 'clean_up_tokenization_spaces' (cf. the audio endpoint).
            clean_up_tokenization_spaces=False
        )[0]
        # Persist this turn.
        history.append({"role": "user", "content": text})
        history.append({"role": "assistant", "content": reply})
        # FIX: cap history length, consistent with the audio endpoint —
        # the original let text conversations grow without bound.
        if len(history) > MAX_HISTORY_TURNS * 2:
            history = history[-MAX_HISTORY_TURNS * 2:]
        conversations[conversation_id] = history
        return VoiceResponse(
            reply=reply,
            conversation_id=conversation_id,
            timestamp=datetime.now().isoformat()
        )
    except HTTPException:
        # Don't wrap deliberate HTTP errors in a 500.
        raise
    except Exception as e:
        logger.error(f"Text inference error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/voice/conversation/{conversation_id}", response_model=ConversationHistory)
async def get_conversation(conversation_id: str):
    """Return the stored history for a conversation; 404 when unknown."""
    history = conversations.get(conversation_id)
    if history is None:
        raise HTTPException(status_code=404, detail="Conversation not found")
    return ConversationHistory(
        conversation_id=conversation_id,
        history=history
    )
@app.delete("/api/voice/conversation/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """Delete a conversation's history; report whether it existed."""
    if conversation_id not in conversations:
        return {"status": "not_found"}
    del conversations[conversation_id]
    return {"status": "deleted"}
if __name__ == "__main__":
    import uvicorn
    # FIX: honor the PORT env var documented in the README and set by
    # start.sh; the original hardcoded 19018 so PORT was silently ignored.
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "19018")))

18
start.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Qwen2-Audio model service startup script.
set -euo pipefail

cd "$(dirname "$0")"

# Default port (override with: PORT=xxxx ./start.sh)
PORT=${PORT:-19018}
# FIX: export PORT so the Python process can read it — the original set
# the variable but never exported or passed it, so the advertised
# `PORT=xxxx ./start.sh` usage had no effect.
export PORT

# Report whether a GPU is available (informational only; the model
# itself selects its device via device_map="auto").
if command -v nvidia-smi &> /dev/null; then
    echo "GPU detected, using CUDA"
else
    echo "No GPU detected, using CPU (slower)"
fi

echo "Starting Qwen2-Audio server on port $PORT..."
python server.py