Compare commits

...

1 Commit

Author SHA1 Message Date
2dca775911 feat: 多模态支持 - 图片数据传递给视觉模型 2026-04-14 09:21:36 +08:00
2 changed files with 27 additions and 2 deletions

View File

@@ -969,7 +969,8 @@ async def websocket_endpoint(websocket: WebSocket, user_id: str):
messages=history,
provider_config=agent_config['provider'],
agent_config=agent_config['agent'],
enable_thinking=enable_thinking
enable_thinking=enable_thinking,
images=image_contents # 传递图片数据给多模态模型
)
logger.info(f"LLM响应: response长度={len(response)}, thinking长度={len(thinking_content) if thinking_content else 0}")

View File

@@ -98,11 +98,19 @@ class LLMService:
messages: List[Dict],
provider_config: dict,
agent_config: dict,
enable_thinking: bool = True
enable_thinking: bool = True,
images: List[Dict] = None # 图片数据列表 [{'name', 'type', 'data': base64}]
) -> Tuple[str, Optional[str]]:
"""
调用AI模型进行对话
Args:
messages: 对话历史
provider_config: LLM Provider配置
agent_config: Agent配置
enable_thinking: 是否启用思考
images: 图片数据列表(用于多模态模型)
Returns:
Tuple[str, Optional[str]]: (回复内容, 思考过程)
"""
@@ -123,6 +131,22 @@ class LLMService:
if final_messages and final_messages[0]['role'] != 'system':
final_messages.insert(0, {"role": "system", "content": system_prompt})
# 如果有图片,构建多模态消息(只修改最后一条用户消息)
if images and len(images) > 0:
# 找到最后一条用户消息
for i in range(len(final_messages) - 1, -1, -1):
if final_messages[i]['role'] == 'user':
original_text = final_messages[i]['content']
# 构建多模态内容
multimodal_content = [{"type": "text", "text": original_text if original_text else "请描述这张图片"}]
for img in images:
multimodal_content.append({
"type": "image_url",
"image_url": {"url": img['data']} # base64 data URL
})
final_messages[i]['content'] = multimodal_content
break
thinking_content = None
# 处理思考功能