feat: Initial version of the Qwen2-Audio voice model API service

README.md (new file, 131 lines)
@@ -0,0 +1,131 @@
# Qwen2-Audio Voice Model Service

Voice-interaction model API service, based on Qwen2-Audio-7B-Instruct.

## Port

- **Service port**: 19018

## Features

- Speech recognition + dialogue generation
- Multi-turn conversation support
- Text chat endpoint (optional)
- Conversation history management

## Deployment

### 1. Install dependencies

```bash
pip install -r requirements.txt
```

### 2. Start the service

```bash
# Option 1: run directly
python server.py

# Option 2: use the start script (port configurable)
PORT=19018 ./start.sh
```

### 3. Verify the service

```bash
curl http://localhost:19018/
```

Response:

```json
{"status": "ok", "model": "Qwen/Qwen2-Audio-7B-Instruct", "conversations": 0}
```
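
A minimal Python health check, assuming the service is running locally on port 19018 and the `requests` package is installed:

```python
import requests

# Query the health-check endpoint; a healthy service returns HTTP 200
resp = requests.get("http://localhost:19018/", timeout=10)
resp.raise_for_status()
info = resp.json()
print(info["status"], info["model"], info["conversations"])
```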

## API Endpoints

### Voice inference

```bash
POST /api/voice/inference
Content-Type: multipart/form-data

Parameters:
- audio: audio file (WAV/MP3/FLAC)
- conversation_id: conversation ID (optional; a new conversation is created if omitted)
- max_length: maximum generation length (default 256)

Response:
{
    "reply": "Hello, how can I help you?",
    "conversation_id": "xxx-xxx-xxx",
    "timestamp": "2026-04-21T18:00:00"
}
```

**Example**:

```bash
curl -X POST http://localhost:19018/api/voice/inference \
  -F "audio=@test.wav"
```
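
The same call from Python, as a minimal client sketch (assumes the `requests` package and a local `test.wav`; field names follow the parameter list above):

```python
import requests

BASE_URL = "http://localhost:19018"

# Send one audio file; omitting conversation_id starts a new conversation
with open("test.wav", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/api/voice/inference",
        files={"audio": ("test.wav", f, "audio/wav")},
        data={"max_length": 256},
        timeout=300,
    )
resp.raise_for_status()
result = resp.json()
print(result["reply"], result["conversation_id"])
```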

### Text inference (for testing)

```bash
POST /api/voice/text
Content-Type: multipart/form-data

Parameters:
- text: text message
- conversation_id: conversation ID (optional)

Response: same as above
```
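
A matching sketch for the text endpoint (same assumptions as above; the message text is only an example):

```python
import requests

# Plain-text request; useful for testing without audio files
resp = requests.post(
    "http://localhost:19018/api/voice/text",
    data={"text": "Hello, please introduce yourself."},
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["reply"])
```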

### Get conversation history

```bash
GET /api/voice/conversation/{conversation_id}
```

### Delete a conversation

```bash
DELETE /api/voice/conversation/{conversation_id}
```
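
A short sketch for inspecting and then cleaning up a conversation (the `conversation_id` value is a placeholder; use the ID returned by an earlier inference call):

```python
import requests

BASE_URL = "http://localhost:19018"
conversation_id = "abc-123"  # placeholder returned by /api/voice/inference

# Fetch the stored history for this conversation
history = requests.get(f"{BASE_URL}/api/voice/conversation/{conversation_id}").json()
print(history["history"])

# Delete the conversation once it is no longer needed
requests.delete(f"{BASE_URL}/api/voice/conversation/{conversation_id}")
```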

## Multi-turn conversation

First turn:

```bash
curl -X POST http://localhost:19018/api/voice/inference \
  -F "audio=@audio1.wav"

# Returns conversation_id: "abc-123"
```

Second turn:

```bash
curl -X POST http://localhost:19018/api/voice/inference \
  -F "audio=@audio2.wav" \
  -F "conversation_id=abc-123"
```
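
The same flow in Python, reusing the returned `conversation_id` so the server keeps context across turns (a sketch; the audio file names are placeholders):

```python
import requests

BASE_URL = "http://localhost:19018"
conversation_id = None

# Send several audio turns; the first response supplies the conversation_id
for path in ["audio1.wav", "audio2.wav"]:
    data = {"conversation_id": conversation_id} if conversation_id else {}
    with open(path, "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/api/voice/inference",
            files={"audio": f},
            data=data,
            timeout=300,
        )
    result = resp.json()
    conversation_id = result["conversation_id"]
    print(f"{path}: {result['reply']}")
```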

## Environment variables

| Variable | Description | Default |
|------|------|--------|
| MODEL_NAME | Model name | Qwen/Qwen2-Audio-7B-Instruct |
| MAX_HISTORY_TURNS | Maximum history turns | 10 |

## Hardware requirements

- **GPU**: NVIDIA GPU recommended, ≥ 16GB VRAM
- **CPU**: works, but noticeably slower
- **RAM**: ≥ 32GB

## Notes

1. The first model load downloads roughly 15GB; make sure the network connection is stable
2. Audio is automatically converted to 16kHz mono
3. Conversation history is stored in memory and is lost when the service restarts

requirements.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
fastapi==0.110.0
uvicorn==0.27.1
python-multipart==0.0.9
librosa==0.10.1
soundfile==0.12.1
torch==2.2.0
transformers==4.38.0
modelscope==1.12.0

server.py (new file, 342 lines)
@@ -0,0 +1,342 @@
"""
|
||||
Qwen2-Audio 模型服务
|
||||
语音转文字 + 多轮对话
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import uuid
|
||||
import tempfile
|
||||
import logging
|
||||
from typing import Optional, List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
|
||||
# 模型加载(延迟加载)
|
||||
model = None
|
||||
processor = None
|
||||
|
||||
# 对话历史存储(内存,可换成 Redis)
|
||||
conversations: Dict[str, List[Dict]] = {}
|
||||
|
||||
# 配置
|
||||
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2-Audio-7B-Instruct")
|
||||
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "10")) # 最多保留10轮对话
|
||||
SAMPLE_RATE = 16000 # Qwen2-Audio 采样率
|
||||
|
||||
# 日志
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = FastAPI(
|
||||
title="Qwen2-Audio Voice Service",
|
||||
description="语音交互模型服务",
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||

class VoiceResponse(BaseModel):
    """Voice inference response"""
    reply: str
    conversation_id: str
    timestamp: str


class ConversationHistory(BaseModel):
    """Conversation history"""
    conversation_id: str
    history: List[Dict[str, Any]]


def load_model():
    """Load the model (lazily, on first use)"""
    global model, processor
    if model is None:
        logger.info(f"Loading model: {MODEL_NAME}")
        from modelscope import Qwen2AudioForConditionalGeneration, AutoProcessor
        processor = AutoProcessor.from_pretrained(MODEL_NAME)
        model = Qwen2AudioForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            device_map="auto"
        )
        logger.info("Model loaded successfully")
    return model, processor


def process_audio(audio_bytes: bytes) -> tuple:
    """
    Process an uploaded audio file.
    Returns: (audio_array, sample_rate)
    """
    # Read the audio with soundfile
    audio_io = io.BytesIO(audio_bytes)
    audio, sr = sf.read(audio_io)

    # Convert to mono
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)

    # Resample to 16kHz (librosa is already a declared dependency)
    if sr != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)

    return audio, SAMPLE_RATE


def build_conversation(
    history: List[Dict],
    audio_array: Optional[Any] = None
) -> List[Dict]:
    """
    Build the chat-template conversation format.
    """
    conversation = []

    for turn in history:
        if turn["role"] == "user" and turn.get("audio"):
            # Audio message
            conversation.append({
                "role": "user",
                "content": [{"type": "audio", "audio_url": turn["audio"]}]
            })
        elif turn["role"] == "user":
            # Text message
            conversation.append({
                "role": "user",
                "content": [{"type": "text", "text": turn["content"]}]
            })
        else:
            conversation.append({
                "role": "assistant",
                "content": turn["content"]
            })

    # Append the current audio
    if audio_array is not None:
        conversation.append({
            "role": "user",
            "content": [{"type": "audio", "audio": audio_array}]
        })

    return conversation


@app.on_event("startup")
async def startup():
    """Preload the model at startup"""
    logger.info("Preloading model...")
    load_model()
    logger.info("Server ready!")


@app.get("/")
async def root():
    """Health check"""
    return {
        "status": "ok",
        "model": MODEL_NAME,
        "conversations": len(conversations)
    }


@app.post("/api/voice/inference", response_model=VoiceResponse)
async def inference(
    audio: UploadFile = File(..., description="Audio file"),
    conversation_id: Optional[str] = Form(None, description="Conversation ID; a new conversation is created if omitted"),
    max_length: int = Form(256, description="Maximum generation length")
):
    """
    Voice inference endpoint

    - Accepts an audio file
    - Returns the model's text reply
    - Supports multi-turn conversations
    """
    try:
        # Load the model
        model, processor = load_model()

        # Read the audio
        audio_bytes = await audio.read()
        audio_array, sr = process_audio(audio_bytes)

        # Get or create the conversation
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            conversations[conversation_id] = []

        history = conversations.get(conversation_id, [])

        # Build the conversation
        conversation = []
        for turn in history:
            conversation.append(turn)

        # Append the current audio turn
        conversation.append({
            "role": "user",
            "content": [{"type": "audio"}]
        })

        # Apply the chat template
        text = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )

        # Collect audio from history (if any)
        audios = []
        for turn in history:
            if turn.get("audio_array") is not None:
                audios.append(turn["audio_array"])
        audios.append(audio_array)

        # Inference
        inputs = processor(
            text=text,
            audios=audios if audios else None,
            return_tensors="pt",
            padding=True
        )
        inputs = inputs.to(model.device)  # move all tensors to the model's device (GPU or CPU)

        generate_ids = model.generate(**inputs, max_length=max_length)
        generate_ids = generate_ids[:, inputs.input_ids.size(1):]

        reply = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Save conversation history (audio arrays are not kept; too large)
        history.append({
            "role": "user",
            "content": "[audio]",
            "audio_array": None  # not stored; too much memory
        })
        history.append({
            "role": "assistant",
            "content": reply
        })

        # Limit history length
        if len(history) > MAX_HISTORY_TURNS * 2:
            history = history[-MAX_HISTORY_TURNS * 2:]

        conversations[conversation_id] = history

        logger.info(f"Conversation {conversation_id}: {len(history)//2} turns")

        return VoiceResponse(
            reply=reply,
            conversation_id=conversation_id,
            timestamp=datetime.now().isoformat()
        )

    except Exception as e:
        logger.error(f"Inference error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/voice/text", response_model=VoiceResponse)
async def text_inference(
    text: str = Form(..., description="Text message"),
    conversation_id: Optional[str] = Form(None, description="Conversation ID")
):
    """
    Text inference endpoint (optional; for testing or text-only chat)
    """
    try:
        model, processor = load_model()

        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            conversations[conversation_id] = []

        history = conversations.get(conversation_id, [])

        # Build the conversation
        conversation = []
        for turn in history:
            conversation.append(turn)
        conversation.append({
            "role": "user",
            "content": [{"type": "text", "text": text}]
        })

        # Apply the chat template
        prompt = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False
        )

        inputs = processor(text=prompt, return_tensors="pt", padding=True)
        inputs = inputs.to(model.device)  # move all tensors to the model's device (GPU or CPU)

        generate_ids = model.generate(**inputs, max_length=256)
        generate_ids = generate_ids[:, inputs.input_ids.size(1):]

        reply = processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Save history
        history.append({"role": "user", "content": text})
        history.append({"role": "assistant", "content": reply})
        conversations[conversation_id] = history

        return VoiceResponse(
            reply=reply,
            conversation_id=conversation_id,
            timestamp=datetime.now().isoformat()
        )

    except Exception as e:
        logger.error(f"Text inference error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/api/voice/conversation/{conversation_id}", response_model=ConversationHistory)
async def get_conversation(conversation_id: str):
    """Get the conversation history"""
    if conversation_id not in conversations:
        raise HTTPException(status_code=404, detail="Conversation not found")

    return ConversationHistory(
        conversation_id=conversation_id,
        history=conversations[conversation_id]
    )


@app.delete("/api/voice/conversation/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """Delete a conversation"""
    if conversation_id in conversations:
        del conversations[conversation_id]
        return {"status": "deleted"}
    return {"status": "not_found"}


if __name__ == "__main__":
    import uvicorn
    # Port is configurable via the PORT environment variable (default 19018)
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "19018")))

start.sh (new executable file, 18 lines)
@@ -0,0 +1,18 @@
#!/bin/bash
# Qwen2-Audio model service startup script

cd "$(dirname "$0")"

# Default port; exported so server.py can read it from the environment
export PORT="${PORT:-19018}"

# Check whether a GPU is available
if command -v nvidia-smi &> /dev/null; then
    echo "GPU detected, using CUDA"
else
    echo "No GPU detected, using CPU (slower)"
fi

# Start the service
echo "Starting Qwen2-Audio server on port $PORT..."
python server.py