feat: 扫描版PDF支持视觉模型OCR识别

- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
This commit is contained in:
2026-04-16 23:02:59 +08:00
parent e524938276
commit 17a442b144

View File

@@ -7,10 +7,20 @@ import json
import time
import hashlib
import threading
import base64
import io
from datetime import datetime, timedelta
from pypdf import PdfReader
from openai import OpenAI
from flask import current_app
from PIL import Image
# pdf2image 用于将PDF转为图像
try:
from pdf2image import convert_from_path
PDF_TO_IMAGE_AVAILABLE = True
except ImportError:
PDF_TO_IMAGE_AVAILABLE = False
# ==================== LLM客户端 ====================
class TranslationService:
@@ -99,6 +109,115 @@ class TranslationService:
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
return text.strip()
def is_vision_model(self):
"""检查是否是视觉模型"""
model = self.llm_config.get('model', '')
# 常见视觉模型名称
vision_models = ['vision', 'vlm', 'glm-4v', 'glm-4.6v', 'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3']
return any(v in model.lower() for v in vision_models)
def pdf_to_images(self, pdf_path, max_pages=None):
"""将PDF页面转换为图像"""
if not PDF_TO_IMAGE_AVAILABLE:
return None, "pdf2image未安装无法处理扫描版PDF。请安装: pip install pdf2image"
try:
# 获取PDF页数
reader = PdfReader(pdf_path)
total_pages = len(reader.pages)
if max_pages:
pages_to_convert = min(max_pages, total_pages)
else:
pages_to_convert = total_pages
# 转换PDF为图像
images = convert_from_path(
pdf_path,
first_page=1,
last_page=pages_to_convert,
dpi=200, # 适当的DPI
fmt='jpeg'
)
return images, None
except Exception as e:
return None, f"PDF转图像失败: {str(e)}"
def extract_text_from_image(self, image):
"""使用视觉模型从图像中提取文字"""
if not self.is_vision_model():
return None, "当前模型不是视觉模型,无法识别图像文字"
try:
# 将图像转为base64
buffered = io.BytesIO()
image.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
# 构建多模态请求
response = self.client.chat.completions.create(
model=self.llm_config['model'],
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字,不要添加任何解释或说明。保持原有的段落和格式。"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{img_base64}"
}
}
]
}
],
max_tokens=self.llm_config['max_tokens'],
temperature=0.1,
timeout=self.llm_config['timeout'],
)
content = response.choices[0].message.content
return content.strip() if content else '', None
except Exception as e:
return '', f"视觉模型识别失败: {str(e)}"
def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
"""从扫描版PDF提取文字使用视觉模型OCR"""
images, error = self.pdf_to_images(pdf_path)
if error:
return [], error
pages_text = []
total = len(images)
for i, image in enumerate(images):
if progress_callback:
progress_callback(int((i+1)/total*50), total, f"OCR识别第{i+1}页...")
text, err = self.extract_text_from_image(image)
if err:
pages_text.append({
'page': i + 1,
'text': '',
'error': err
})
else:
pages_text.append({
'page': i + 1,
'text': text or '',
'error': None
})
return pages_text, None
def chunk_text(self, text, max_size=2000):
"""分块"""
paragraphs = text.split('\n\n')
@@ -131,19 +250,38 @@ class TranslationService:
Returns:
翻译统计信息
"""
# 先尝试常规提取
pages = self.extract_pdf_text(pdf_path)
total_pages = len(pages)
# 检查是否有可翻译内容
total_text = sum(len(p['text']) for p in pages)
# 如果无法提取文本尝试使用视觉模型OCR
if total_pages == 0 or total_text < 10:
error_msg = "PDF无法提取文本内容。可能原因\n1. PDF是扫描版图像形式需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。"
if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
if progress_callback:
progress_callback(0, 0, "检测到扫描版PDF使用视觉模型OCR...")
pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
if error:
raise ValueError(error)
total_pages = len(pages)
total_text = sum(len(p['text']) for p in pages)
if total_text < 10:
raise ValueError("视觉模型OCR未能提取到有效文字内容")
if progress_callback:
progress_callback(50, total_pages, "OCR完成开始翻译...")
else:
error_msg = "PDF无法提取文本内容。可能原因\n1. PDF是扫描版图像形式\n2. 当前大模型不是视觉模型,无法识别图像文字\n\n如需处理扫描版PDF请配置视觉大模型如 glm-4.6v、gpt-4-vision"
if progress_callback:
progress_callback(0, 0, error_msg)
raise ValueError(error_msg)
if progress_callback:
progress_callback(0, total_pages, "开始翻译...")
progress_callback(50, total_pages, "开始翻译...")
translated_pages = []
total_chunks = 0
@@ -158,8 +296,10 @@ class TranslationService:
translated_chunks.append(translated)
if progress_callback:
progress = int((i + 1) / len(chunks) * 100 / total_pages)
progress_callback(progress, total_pages, f"翻译第{page_data['page']}")
# OCR占50%翻译占50%
page_progress = (i + 1) / len(chunks)
overall_progress = 50 + int(page_progress * 50 / total_pages)
progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}")
translated_pages.append({
'page': page_data['page'],