feat: 多模态支持 - 图片数据传递给视觉模型

This commit is contained in:
2026-04-14 09:21:36 +08:00
parent a34bef50ae
commit 2dca775911
2 changed files with 27 additions and 2 deletions

View File

@@ -98,11 +98,19 @@ class LLMService:
messages: List[Dict],
provider_config: dict,
agent_config: dict,
enable_thinking: bool = True
enable_thinking: bool = True,
images: List[Dict] = None # 图片数据列表 [{'name', 'type', 'data': base64}]
) -> Tuple[str, Optional[str]]:
"""
调用AI模型进行对话
Args:
messages: 对话历史
provider_config: LLM Provider配置
agent_config: Agent配置
enable_thinking: 是否启用思考
images: 图片数据列表(用于多模态模型)
Returns:
Tuple[str, Optional[str]]: (回复内容, 思考过程)
"""
@@ -123,6 +131,22 @@ class LLMService:
if final_messages and final_messages[0]['role'] != 'system':
final_messages.insert(0, {"role": "system", "content": system_prompt})
# 如果有图片,构建多模态消息(只修改最后一条用户消息)
if images and len(images) > 0:
# 找到最后一条用户消息
for i in range(len(final_messages) - 1, -1, -1):
if final_messages[i]['role'] == 'user':
original_text = final_messages[i]['content']
# 构建多模态内容
multimodal_content = [{"type": "text", "text": original_text if original_text else "请描述这张图片"}]
for img in images:
multimodal_content.append({
"type": "image_url",
"image_url": {"url": img['data']} # base64 data URL
})
final_messages[i]['content'] = multimodal_content
break
thinking_content = None
# 处理思考功能