feat: 扫描版PDF支持视觉模型OCR识别
- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
This commit is contained in:
158
services.py
158
services.py
@@ -7,10 +7,20 @@ import json
|
|||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import threading
|
import threading
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from flask import current_app
|
from flask import current_app
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# pdf2image 用于将PDF转为图像
|
||||||
|
try:
|
||||||
|
from pdf2image import convert_from_path
|
||||||
|
PDF_TO_IMAGE_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
PDF_TO_IMAGE_AVAILABLE = False
|
||||||
|
|
||||||
# ==================== LLM客户端 ====================
|
# ==================== LLM客户端 ====================
|
||||||
class TranslationService:
|
class TranslationService:
|
||||||
@@ -99,6 +109,115 @@ class TranslationService:
|
|||||||
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
|
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
def is_vision_model(self):
    """Return True when the configured model name looks like a vision model.

    The check is a case-insensitive substring match of the ``model`` entry
    of ``self.llm_config`` against a list of known vision-model markers.

    Returns:
        bool: True if any marker occurs in the model name, otherwise False.
        A missing, ``None`` or empty model name yields False.
    """
    # ``.get('model', '')`` still returns None when the key exists with a
    # null value, so coerce before lower-casing.
    model_name = str(self.llm_config.get('model') or '').lower()

    # Common vision-model name fragments.
    vision_markers = (
        'vision', 'vlm', 'glm-4v', 'glm-4.6v',
        'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3',
    )
    return any(marker in model_name for marker in vision_markers)
|
||||||
|
|
||||||
|
def pdf_to_images(self, pdf_path, max_pages=None):
    """Render the pages of a PDF file as JPEG images.

    Args:
        pdf_path: Path of the PDF file to render.
        max_pages: Optional cap on the number of leading pages to render;
            a falsy value renders every page.

    Returns:
        ``(images, error)``: the list produced by ``convert_from_path`` and
        ``None`` on success, or ``None`` and an error message on failure or
        when pdf2image is not installed.
    """
    if not PDF_TO_IMAGE_AVAILABLE:
        return None, "pdf2image未安装,无法处理扫描版PDF。请安装: pip install pdf2image"

    try:
        # Page count is read with pypdf so the cap can be clamped to it.
        page_count = len(PdfReader(pdf_path).pages)
        last_page = min(max_pages, page_count) if max_pages else page_count

        # Render pages 1..last_page at 200 DPI.
        rendered = convert_from_path(
            pdf_path,
            first_page=1,
            last_page=last_page,
            dpi=200,
            fmt='jpeg',
        )
    except Exception as exc:
        return None, f"PDF转图像失败: {exc!s}"

    return rendered, None
|
||||||
|
|
||||||
|
def extract_text_from_image(self, image):
    """Extract the text of one image by sending it to the vision model.

    The image is JPEG-encoded, embedded as a base64 data URL, and sent in
    a single multimodal chat-completion request.

    Args:
        image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

    Returns:
        ``(text, error)``. On success ``text`` is the stripped model reply
        (``''`` when the reply is empty) and ``error`` is ``None``. When the
        configured model is not a vision model the result is
        ``(None, message)``; when encoding or the request fails it is
        ``('', message)``.
    """
    if not self.is_vision_model():
        return None, "当前模型不是视觉模型,无法识别图像文字"

    try:
        # JPEG cannot store alpha or palette data; saving such an image
        # raises, so normalise to RGB first.
        if getattr(image, 'mode', 'RGB') not in ('RGB', 'L'):
            image = image.convert('RGB')

        # Encode the image as base64 JPEG for the data URL.
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

        # Build the multimodal request: one text instruction plus the image.
        response = self.client.chat.completions.create(
            model=self.llm_config['model'],
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字,不要添加任何解释或说明。保持原有的段落和格式。",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}",
                            },
                        },
                    ],
                }
            ],
            max_tokens=self.llm_config['max_tokens'],
            temperature=0.1,
            timeout=self.llm_config['timeout'],
        )

        content = response.choices[0].message.content
        return content.strip() if content else '', None

    except Exception as e:
        # Best-effort: report the failure to the caller instead of raising.
        return '', f"视觉模型识别失败: {str(e)}"
|
||||||
|
|
||||||
|
def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
    """Extract text from a scanned PDF by OCR-ing each page with the vision model.

    Args:
        pdf_path: Path of the PDF file.
        progress_callback: Optional callable invoked before each page as
            ``progress_callback(percent, total_pages, message)``; ``percent``
            is scaled to at most 50.

    Returns:
        ``(pages, error)``. ``pages`` is a list of dicts with keys ``page``
        (1-based page number), ``text`` and ``error`` (per-page message or
        ``None``). If rendering fails, or if every page fails OCR, the
        result is ``([], message)`` so the caller sees the real cause
        instead of an empty document.
    """
    images, error = self.pdf_to_images(pdf_path)
    if error:
        return [], error

    images = images or []
    total = len(images)
    pages_text = []
    first_error = None
    failed = 0

    for page_no, image in enumerate(images, start=1):
        if progress_callback:
            progress_callback(int(page_no / total * 50), total, f"OCR识别第{page_no}页...")

        text, err = self.extract_text_from_image(image)

        if err:
            failed += 1
            if first_error is None:
                first_error = err

        # A failed page is kept as an empty entry so page numbering stays
        # aligned with the source document.
        pages_text.append({
            'page': page_no,
            'text': '' if err else (text or ''),
            'error': err or None,
        })

    # Every page failing means a systemic problem (wrong model, bad
    # credentials, timeout); report it rather than returning empty pages.
    if pages_text and failed == len(pages_text):
        return [], first_error

    return pages_text, None
|
||||||
|
|
||||||
def chunk_text(self, text, max_size=2000):
|
def chunk_text(self, text, max_size=2000):
|
||||||
"""分块"""
|
"""分块"""
|
||||||
paragraphs = text.split('\n\n')
|
paragraphs = text.split('\n\n')
|
||||||
@@ -131,19 +250,38 @@ class TranslationService:
|
|||||||
Returns:
|
Returns:
|
||||||
翻译统计信息
|
翻译统计信息
|
||||||
"""
|
"""
|
||||||
|
# 先尝试常规提取
|
||||||
pages = self.extract_pdf_text(pdf_path)
|
pages = self.extract_pdf_text(pdf_path)
|
||||||
total_pages = len(pages)
|
total_pages = len(pages)
|
||||||
|
|
||||||
# 检查是否有可翻译内容
|
|
||||||
total_text = sum(len(p['text']) for p in pages)
|
total_text = sum(len(p['text']) for p in pages)
|
||||||
|
|
||||||
|
# 如果无法提取文本,尝试使用视觉模型OCR
|
||||||
if total_pages == 0 or total_text < 10:
|
if total_pages == 0 or total_text < 10:
|
||||||
error_msg = "PDF无法提取文本内容。可能原因:\n1. PDF是扫描版(图像形式),需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。"
|
if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(0, 0, error_msg)
|
progress_callback(0, 0, "检测到扫描版PDF,使用视觉模型OCR...")
|
||||||
raise ValueError(error_msg)
|
|
||||||
|
pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
|
||||||
|
|
||||||
|
if error:
|
||||||
|
raise ValueError(error)
|
||||||
|
|
||||||
|
total_pages = len(pages)
|
||||||
|
total_text = sum(len(p['text']) for p in pages)
|
||||||
|
|
||||||
|
if total_text < 10:
|
||||||
|
raise ValueError("视觉模型OCR未能提取到有效文字内容")
|
||||||
|
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(50, total_pages, "OCR完成,开始翻译...")
|
||||||
|
else:
|
||||||
|
error_msg = "PDF无法提取文本内容。可能原因:\n1. PDF是扫描版(图像形式)\n2. 当前大模型不是视觉模型,无法识别图像文字\n\n如需处理扫描版PDF,请配置视觉大模型(如 glm-4.6v、gpt-4-vision)"
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(0, 0, error_msg)
|
||||||
|
raise ValueError(error_msg)
|
||||||
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(0, total_pages, "开始翻译...")
|
progress_callback(50, total_pages, "开始翻译...")
|
||||||
|
|
||||||
translated_pages = []
|
translated_pages = []
|
||||||
total_chunks = 0
|
total_chunks = 0
|
||||||
@@ -158,8 +296,10 @@ class TranslationService:
|
|||||||
translated_chunks.append(translated)
|
translated_chunks.append(translated)
|
||||||
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress = int((i + 1) / len(chunks) * 100 / total_pages)
|
# OCR占50%,翻译占50%
|
||||||
progress_callback(progress, total_pages, f"翻译第{page_data['page']}页")
|
page_progress = (i + 1) / len(chunks)
|
||||||
|
overall_progress = 50 + int(page_progress * 50 / total_pages)
|
||||||
|
progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}页")
|
||||||
|
|
||||||
translated_pages.append({
|
translated_pages.append({
|
||||||
'page': page_data['page'],
|
'page': page_data['page'],
|
||||||
|
|||||||
Reference in New Issue
Block a user