feat: 扫描版PDF支持视觉模型OCR识别

- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
This commit is contained in:
2026-04-16 23:02:59 +08:00
parent e524938276
commit 17a442b144

View File

@@ -7,10 +7,20 @@ import json
import time import time
import hashlib import hashlib
import threading import threading
import base64
import io
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pypdf import PdfReader from pypdf import PdfReader
from openai import OpenAI from openai import OpenAI
from flask import current_app from flask import current_app
from PIL import Image
# pdf2image 用于将PDF转为图像
try:
from pdf2image import convert_from_path
PDF_TO_IMAGE_AVAILABLE = True
except ImportError:
PDF_TO_IMAGE_AVAILABLE = False
# ==================== LLM客户端 ==================== # ==================== LLM客户端 ====================
class TranslationService: class TranslationService:
@@ -99,6 +109,115 @@ class TranslationService:
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text) text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
return text.strip() return text.strip()
def is_vision_model(self):
    """Return True when the configured model name looks like a vision model.

    This is a naming heuristic: the value of ``self.llm_config['model']`` is
    lower-cased and checked for any of a fixed set of substrings. A missing,
    ``None`` or non-string model name yields ``False`` instead of raising.

    Returns:
        bool: ``True`` if any known vision-model marker occurs in the name.
    """
    # ``.get('model', '')`` would still hand back None when the key exists
    # with a None value, so normalise falsy values explicitly.
    model = self.llm_config.get('model') or ''
    if not isinstance(model, str):
        return False

    # Substrings commonly found in vision-capable model names.
    vision_markers = (
        'vision', 'vlm', 'glm-4v', 'glm-4.6v',
        'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3',
    )
    model_name = model.lower()  # lower-case once rather than once per marker
    return any(marker in model_name for marker in vision_markers)
def pdf_to_images(self, pdf_path, max_pages=None):
    """Render the leading pages of a PDF as JPEG images.

    Args:
        pdf_path: Path of the PDF file to render.
        max_pages: Upper bound on the number of pages to render. A falsy
            value (``None`` or ``0``) means every page is rendered.

    Returns:
        tuple: ``(images, None)`` on success, where ``images`` is whatever
        ``convert_from_path`` returns (presumably a list of PIL images --
        confirm against pdf2image), or ``(None, message)`` when pdf2image
        is unavailable or rendering fails.
    """
    # Guard: the optional pdf2image dependency may not have been importable.
    if not PDF_TO_IMAGE_AVAILABLE:
        return None, "pdf2image未安装无法处理扫描版PDF。请安装: pip install pdf2image"

    try:
        # Count the pages first so the requested limit never exceeds them.
        page_count = len(PdfReader(pdf_path).pages)
        last_page = min(max_pages, page_count) if max_pages else page_count

        rendered = convert_from_path(
            pdf_path,
            first_page=1,
            last_page=last_page,
            dpi=200,
            fmt='jpeg',
        )
    except Exception as exc:
        return None, f"PDF转图像失败: {exc!s}"

    return rendered, None
def extract_text_from_image(self, image):
    """Extract the text shown in an image by asking the configured vision model.

    The image is JPEG-encoded, base64-embedded in a ``data:`` URL and sent as
    a multimodal chat-completion request through ``self.client``.

    Args:
        image: Image object exposing ``mode``, ``convert()`` and ``save()``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).

    Returns:
        tuple: ``(text, error)``.
            * success: ``(stripped_text, None)``; ``text`` is ``''`` when the
              model returned no content.
            * configured model is not a vision model: ``(None, message)``.
            * any exception while encoding or calling the model:
              ``('', message)``.
    """
    if not self.is_vision_model():
        return None, "当前模型不是视觉模型,无法识别图像文字"

    try:
        # JPEG can only store L, RGB and CMYK data; saving e.g. an RGBA or
        # palette image raises, so convert those to RGB first.
        if getattr(image, 'mode', 'RGB') not in ('L', 'RGB', 'CMYK'):
            image = image.convert('RGB')

        # Encode the image as base64 JPEG for the data URL.
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

        # Build the multimodal request: one instruction part, one image part.
        response = self.client.chat.completions.create(
            model=self.llm_config['model'],
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字,不要添加任何解释或说明。保持原有的段落和格式。"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=self.llm_config['max_tokens'],
            temperature=0.1,
            timeout=self.llm_config['timeout'],
        )

        content = response.choices[0].message.content
        return content.strip() if content else '', None
    except Exception as e:
        # Failures are reported through the error slot rather than raised,
        # so a caller can record a per-page error and carry on.
        return '', f"视觉模型识别失败: {str(e)}"
def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
    """OCR every page of a scanned PDF with the vision model.

    Args:
        pdf_path: Path of the PDF to process.
        progress_callback: Optional callable invoked before each page is
            recognised as ``progress_callback(percent, total_pages, message)``,
            where ``percent`` is scaled to the 0-50 range.

    Returns:
        tuple: ``(pages, error)``. If the PDF cannot be rendered to images
        the result is ``([], message)``. Otherwise ``pages`` is a list of
        ``{'page', 'text', 'error'}`` dicts (1-based page numbers) and
        ``error`` is ``None``; a page that failed recognition carries an
        empty ``text`` and its own ``error`` message.
    """
    images, error = self.pdf_to_images(pdf_path)
    if error:
        return [], error

    page_count = len(images)
    results = []
    for page_no, page_image in enumerate(images, start=1):
        if progress_callback:
            percent = int(page_no / page_count * 50)
            progress_callback(percent, page_count, f"OCR识别第{page_no}页...")

        recognized, failure = self.extract_text_from_image(page_image)

        # One failed page does not abort the run; the failure is stored
        # on that page's entry instead.
        entry = {'page': page_no}
        if failure:
            entry['text'] = ''
            entry['error'] = failure
        else:
            entry['text'] = recognized or ''
            entry['error'] = None
        results.append(entry)

    return results, None
def chunk_text(self, text, max_size=2000): def chunk_text(self, text, max_size=2000):
"""分块""" """分块"""
paragraphs = text.split('\n\n') paragraphs = text.split('\n\n')
@@ -131,19 +250,38 @@ class TranslationService:
Returns: Returns:
翻译统计信息 翻译统计信息
""" """
# 先尝试常规提取
pages = self.extract_pdf_text(pdf_path) pages = self.extract_pdf_text(pdf_path)
total_pages = len(pages) total_pages = len(pages)
# 检查是否有可翻译内容
total_text = sum(len(p['text']) for p in pages) total_text = sum(len(p['text']) for p in pages)
# 如果无法提取文本尝试使用视觉模型OCR
if total_pages == 0 or total_text < 10: if total_pages == 0 or total_text < 10:
error_msg = "PDF无法提取文本内容。可能原因\n1. PDF是扫描版图像形式需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。" if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
if progress_callback: if progress_callback:
progress_callback(0, 0, error_msg) progress_callback(0, 0, "检测到扫描版PDF使用视觉模型OCR...")
raise ValueError(error_msg)
pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
if error:
raise ValueError(error)
total_pages = len(pages)
total_text = sum(len(p['text']) for p in pages)
if total_text < 10:
raise ValueError("视觉模型OCR未能提取到有效文字内容")
if progress_callback:
progress_callback(50, total_pages, "OCR完成开始翻译...")
else:
error_msg = "PDF无法提取文本内容。可能原因\n1. PDF是扫描版图像形式\n2. 当前大模型不是视觉模型,无法识别图像文字\n\n如需处理扫描版PDF请配置视觉大模型如 glm-4.6v、gpt-4-vision"
if progress_callback:
progress_callback(0, 0, error_msg)
raise ValueError(error_msg)
if progress_callback: if progress_callback:
progress_callback(0, total_pages, "开始翻译...") progress_callback(50, total_pages, "开始翻译...")
translated_pages = [] translated_pages = []
total_chunks = 0 total_chunks = 0
@@ -158,8 +296,10 @@ class TranslationService:
translated_chunks.append(translated) translated_chunks.append(translated)
if progress_callback: if progress_callback:
progress = int((i + 1) / len(chunks) * 100 / total_pages) # OCR占50%翻译占50%
progress_callback(progress, total_pages, f"翻译第{page_data['page']}") page_progress = (i + 1) / len(chunks)
overall_progress = 50 + int(page_progress * 50 / total_pages)
progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}")
translated_pages.append({ translated_pages.append({
'page': page_data['page'], 'page': page_data['page'],