From 8d45553ea648145cce647b502293af0e6d8c08d9 Mon Sep 17 00:00:00 2001 From: hubian <908234780@qq.com> Date: Tue, 21 Apr 2026 18:04:59 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20Qwen2-Audio=E8=AF=AD=E9=9F=B3=E6=A8=A1?= =?UTF-8?q?=E5=9E=8BAPI=E6=9C=8D=E5=8A=A1=E5=88=9D=E5=A7=8B=E7=89=88?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 131 ++++++++++++++++++ requirements.txt | 8 ++ server.py | 342 +++++++++++++++++++++++++++++++++++++++++++++++ start.sh | 18 +++ 4 files changed, 499 insertions(+) create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 server.py create mode 100755 start.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..ecbc0d2 --- /dev/null +++ b/README.md @@ -0,0 +1,131 @@ +# Qwen2-Audio 语音模型服务 + +语音交互模型 API 服务,基于 Qwen2-Audio-7B-Instruct。 + +## 端口 + +- **服务端口**: 19018 + +## 功能 + +- 语音识别 + 对话生成 +- 多轮对话支持 +- 文本对话接口(可选) +- 对话历史管理 + +## 部署步骤 + +### 1. 安装依赖 + +```bash +pip install -r requirements.txt +``` + +### 2. 启动服务 + +```bash +# 方式一:直接启动 +python server.py + +# 方式二:使用脚本(可指定端口) +PORT=19018 ./start.sh +``` + +### 3. 
验证服务 + +```bash +curl http://localhost:19018/ +``` + +返回: +```json +{"status": "ok", "model": "Qwen/Qwen2-Audio-7B-Instruct", "conversations": 0} +``` + +## API 接口 + +### 语音推理 + +```bash +POST /api/voice/inference +Content-Type: multipart/form-data + +参数: +- audio: 音频文件 (WAV/MP3/FLAC) +- conversation_id: 对话ID(可选,不传则创建新对话) +- max_length: 最大生成长度(默认256) + +返回: +{ + "reply": "你好,有什么可以帮助你的吗?", + "conversation_id": "xxx-xxx-xxx", + "timestamp": "2026-04-21T18:00:00" +} +``` + +**示例**: +```bash +curl -X POST http://localhost:19018/api/voice/inference \ + -F "audio=@test.wav" +``` + +### 文本推理(测试用) + +```bash +POST /api/voice/text +Content-Type: multipart/form-data + +参数: +- text: 文本消息 +- conversation_id: 对话ID(可选) + +返回: 同上 +``` + +### 获取对话历史 + +```bash +GET /api/voice/conversation/{conversation_id} +``` + +### 删除对话 + +```bash +DELETE /api/voice/conversation/{conversation_id} +``` + +## 多轮对话 + +第一轮: +```bash +curl -X POST http://localhost:19018/api/voice/inference \ + -F "audio=@audio1.wav" + +# 返回 conversation_id: "abc-123" +``` + +第二轮: +```bash +curl -X POST http://localhost:19018/api/voice/inference \ + -F "audio=@audio2.wav" \ + -F "conversation_id=abc-123" +``` + +## 环境变量 + +| 变量 | 说明 | 默认值 | +|------|------|--------| +| MODEL_NAME | 模型名称 | Qwen/Qwen2-Audio-7B-Instruct | +| MAX_HISTORY_TURNS | 最大历史轮数 | 10 | + +## 硬件要求 + +- **GPU**: 推荐 NVIDIA GPU,显存 ≥ 16GB +- **CPU**: 可运行,但速度较慢 +- **内存**: ≥ 32GB + +## 注意事项 + +1. 模型首次加载需要下载约 15GB,请确保网络畅通 +2. 音频会自动转换为 16kHz 单声道格式 +3. 
对话历史存储在内存中,重启服务会丢失
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cbc0bab
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+fastapi==0.110.0
+uvicorn==0.27.1
+python-multipart==0.0.9
+librosa==0.10.1
+soundfile==0.12.1
+torch==2.2.0
+transformers==4.38.0
+modelscope==1.12.0
\ No newline at end of file
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..89d78f8
--- /dev/null
+++ b/server.py
@@ -0,0 +1,342 @@
+"""
+Qwen2-Audio model service.
+
+Speech recognition + multi-turn dialogue API (FastAPI).
+"""
+
+import os
+import io
+import uuid
+import tempfile
+import logging
+from typing import Optional, List, Dict, Any
+from datetime import datetime
+
+import librosa
+import soundfile as sf
+from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+
+# Model/processor are loaded lazily by load_model().
+model = None
+processor = None
+
+# Conversation history store (in memory; swap for Redis if persistence is needed).
+conversations: Dict[str, List[Dict]] = {}
+
+# Configuration
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2-Audio-7B-Instruct")
+MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "10"))  # max turns kept per conversation
+SAMPLE_RATE = 16000  # Qwen2-Audio expects 16 kHz mono input
+
+# Logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(
+    title="Qwen2-Audio Voice Service",
+    description="语音交互模型服务",
+    version="1.0.0"
+)
+
+# CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+class VoiceResponse(BaseModel):
+    """Response payload for voice and text inference."""
+    reply: str
+    conversation_id: str
+    timestamp: str
+
+
+class ConversationHistory(BaseModel):
+    """Full stored history of one conversation."""
+    conversation_id: str
+    history: List[Dict[str, Any]]
+
+
+def load_model():
+    """
+    Load the model and processor once (lazy singleton).
+
+    Returns:
+        (model, processor)
+    """
+    global model, processor
+    if model is None:
+        logger.info("Loading model: %s", MODEL_NAME)
+        from modelscope import Qwen2AudioForConditionalGeneration, AutoProcessor
+        processor = AutoProcessor.from_pretrained(MODEL_NAME)
+        model = Qwen2AudioForConditionalGeneration.from_pretrained(
+            MODEL_NAME,
+            device_map="auto"
+        )
+        logger.info("Model loaded successfully")
+    return model, processor
+
+
+def process_audio(audio_bytes: bytes) -> tuple:
+    """
+    Decode uploaded audio into a 16 kHz mono float array.
+
+    librosa.load does decoding, channel down-mixing and resampling in one
+    call. (The previous version imported `resampy`, which is not listed in
+    requirements.txt and would raise ImportError at runtime.)
+
+    Returns:
+        (audio_array, sample_rate)
+    """
+    audio, _ = librosa.load(io.BytesIO(audio_bytes), sr=SAMPLE_RATE, mono=True)
+    return audio, SAMPLE_RATE
+
+
+def build_conversation(
+    history: List[Dict],
+    audio_array: Optional[Any] = None
+) -> List[Dict]:
+    """
+    Build a chat-template conversation from stored history.
+
+    NOTE(review): currently unused by the endpoints; kept for future use.
+    """
+    conversation = []
+
+    for turn in history:
+        if turn["role"] == "user" and turn.get("audio"):
+            # Audio message
+            conversation.append({
+                "role": "user",
+                "content": [{"type": "audio", "audio_url": turn["audio"]}]
+            })
+        elif turn["role"] == "user":
+            # Text message
+            conversation.append({
+                "role": "user",
+                "content": [{"type": "text", "text": turn["content"]}]
+            })
+        else:
+            conversation.append({
+                "role": "assistant",
+                "content": turn["content"]
+            })
+
+    # Append the current audio, if any
+    if audio_array is not None:
+        conversation.append({
+            "role": "user",
+            "content": [{"type": "audio", "audio": audio_array}]
+        })
+
+    return conversation
+
+
+def _generate_reply(prompt: str, audios: Optional[List[Any]], max_new_tokens: int) -> str:
+    """
+    Run generation on a templated prompt and decode the reply.
+
+    Shared by both endpoints. Fixes over the original inline code:
+    - the whole batch is moved to the model's device (previously only
+      input_ids went to a hard-coded "cuda", crashing on CPU-only hosts
+      and leaving input_features behind on GPU runs);
+    - `max_new_tokens` replaces `max_length`, so the documented "maximum
+      generation length" holds regardless of prompt size;
+    - uses the correct decode kwarg `clean_up_tokenization_spaces` (the
+      text endpoint passed a nonexistent `clean_up_tokenization_tokens`).
+    """
+    mdl, proc = load_model()
+
+    inputs = proc(
+        text=prompt,
+        audios=audios if audios else None,
+        sampling_rate=SAMPLE_RATE,
+        return_tensors="pt",
+        padding=True
+    )
+    inputs = inputs.to(mdl.device)
+
+    generate_ids = mdl.generate(**inputs, max_new_tokens=max_new_tokens)
+    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
+
+    return proc.batch_decode(
+        generate_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )[0]
+
+
+@app.on_event("startup")
+async def startup():
+    """Preload the model so the first request is not slow."""
+    logger.info("Preloading model...")
+    load_model()
+    logger.info("Server ready!")
+
+
+@app.get("/")
+async def root():
+    """Health check."""
+    return {
+        "status": "ok",
+        "model": MODEL_NAME,
+        "conversations": len(conversations)
+    }
+
+
+@app.post("/api/voice/inference", response_model=VoiceResponse)
+async def inference(
+    audio: UploadFile = File(..., description="音频文件"),
+    conversation_id: Optional[str] = Form(None, description="对话ID,不传则创建新对话"),
+    max_length: int = Form(256, description="最大生成长度")
+):
+    """
+    Voice inference endpoint.
+
+    Accepts an audio file, returns the model reply, and supports
+    multi-turn dialogue via `conversation_id`.
+    """
+    try:
+        _, proc = load_model()
+
+        # Decode + resample the upload
+        audio_bytes = await audio.read()
+        audio_array, _ = process_audio(audio_bytes)
+
+        # Fetch or create the conversation
+        if conversation_id is None:
+            conversation_id = str(uuid.uuid4())
+            conversations[conversation_id] = []
+        history = conversations.get(conversation_id, [])
+
+        # History plus the current audio turn
+        conversation = list(history)
+        conversation.append({
+            "role": "user",
+            "content": [{"type": "audio"}]
+        })
+
+        prompt = proc.apply_chat_template(
+            conversation,
+            add_generation_prompt=True,
+            tokenize=False
+        )
+
+        # Audio arrays in turn order (history audio is not persisted,
+        # so in practice this is just the current clip)
+        audios = [
+            turn["audio_array"]
+            for turn in history
+            if turn.get("audio_array") is not None
+        ]
+        audios.append(audio_array)
+
+        reply = _generate_reply(prompt, audios, max_length)
+
+        # Persist the turn (audio arrays dropped to save memory)
+        history.append({
+            "role": "user",
+            "content": "[audio]",
+            "audio_array": None
+        })
+        history.append({
+            "role": "assistant",
+            "content": reply
+        })
+
+        # Cap history length
+        if len(history) > MAX_HISTORY_TURNS * 2:
+            history = history[-MAX_HISTORY_TURNS * 2:]
+        conversations[conversation_id] = history
+
+        logger.info("Conversation %s: %d turns", conversation_id, len(history) // 2)
+
+        return VoiceResponse(
+            reply=reply,
+            conversation_id=conversation_id,
+            timestamp=datetime.now().isoformat()
+        )
+
+    except Exception as e:
+        logger.error("Inference error: %s", e, exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/api/voice/text", response_model=VoiceResponse)
+async def text_inference(
+    text: str = Form(..., description="文本消息"),
+    conversation_id: Optional[str] = Form(None, description="对话ID")
+):
+    """Text inference endpoint (for testing or pure text chat)."""
+    try:
+        _, proc = load_model()
+
+        if conversation_id is None:
+            conversation_id = str(uuid.uuid4())
+            conversations[conversation_id] = []
+        history = conversations.get(conversation_id, [])
+
+        # History plus the current text turn
+        conversation = list(history)
+        conversation.append({
+            "role": "user",
+            "content": [{"type": "text", "text": text}]
+        })
+
+        prompt = proc.apply_chat_template(
+            conversation,
+            add_generation_prompt=True,
+            tokenize=False
+        )
+
+        reply = _generate_reply(prompt, None, 256)
+
+        # Persist the turn and cap history (same policy as the voice endpoint;
+        # the original never capped here, so memory grew without bound)
+        history.append({"role": "user", "content": text})
+        history.append({"role": "assistant", "content": reply})
+        if len(history) > MAX_HISTORY_TURNS * 2:
+            history = history[-MAX_HISTORY_TURNS * 2:]
+        conversations[conversation_id] = history
+
+        return VoiceResponse(
+            reply=reply,
+            conversation_id=conversation_id,
+            timestamp=datetime.now().isoformat()
+        )
+
+    except Exception as e:
+        logger.error("Text inference error: %s", e, exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/api/voice/conversation/{conversation_id}", response_model=ConversationHistory)
+async def get_conversation(conversation_id: str):
+    """Return the stored history of one conversation."""
+    if conversation_id not in conversations:
+        raise HTTPException(status_code=404, detail="Conversation not found")
+
+    return ConversationHistory(
+        conversation_id=conversation_id,
+        history=conversations[conversation_id]
+    )
+
+
+@app.delete("/api/voice/conversation/{conversation_id}")
+async def delete_conversation(conversation_id: str):
+    """Delete one conversation."""
+    if conversation_id in conversations:
+        del conversations[conversation_id]
+        return {"status": "deleted"}
+    return {"status": "not_found"}
+
+
+if __name__ == "__main__":
+    import uvicorn
+    # Honour PORT so `PORT=xxxx ./start.sh` actually changes the listen port.
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "19018")))
\ No newline at end of file
diff --git a/start.sh b/start.sh
new file mode 100755
index 0000000..001453d
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Qwen2-Audio model service startup script
+
+cd "$(dirname "$0")"
+
+# Default port — exported so server.py can read it (it was silently ignored before)
+export PORT=${PORT:-19018}
+
+# Detect GPU
+if command -v nvidia-smi &> /dev/null; then
+    echo "GPU detected, using CUDA"
+else
+    echo "No GPU detected, using CPU (slower)"
+fi
+
+# Start the service
+echo "Starting Qwen2-Audio server on port $PORT..."
+python server.py
\ No newline at end of file