feat: 扫描版PDF支持视觉模型OCR识别

- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
2026-04-16 23:02:59 +08:00
parent e524938276
commit 17a442b144
1 changed files with 149 additions and 9 deletions
--- a/services.py
+++ b/services.py
@@ -7,10 +7,20 @@ import json
 import time
 import hashlib
 import threading
+import base64
+import io
 from datetime import datetime, timedelta
 from pypdf import PdfReader
 from openai import OpenAI
 from flask import current_app
+from PIL import Image
+
+# pdf2image 用于将PDF转为图像
+try:
+    from pdf2image import convert_from_path
+    PDF_TO_IMAGE_AVAILABLE = True
+except ImportError:
+    PDF_TO_IMAGE_AVAILABLE = False

 # ==================== LLM客户端 ====================
 class TranslationService:
@@ -99,6 +109,115 @@ class TranslationService:
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
        return text.strip()
    
+    def is_vision_model(self):
+        """Return True if the configured model name contains a known vision-model marker."""
+        model = self.llm_config.get('model', '')
+        # Substrings of common vision-capable model names; matched case-insensitively below
+        vision_models = ['vision', 'vlm', 'glm-4v', 'glm-4.6v', 'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3']
+        return any(v in model.lower() for v in vision_models)
+    
+    def pdf_to_images(self, pdf_path, max_pages=None):
+        """Convert PDF pages to images; returns (images, None) on success or (None, error message) on failure."""
+        if not PDF_TO_IMAGE_AVAILABLE:
+            return None, "pdf2image未安装，无法处理扫描版PDF。请安装: pip install pdf2image"
+        
+        try:
+            # Count the pages so that max_pages (when given) is capped at the document length
+            reader = PdfReader(pdf_path)
+            total_pages = len(reader.pages)
+            
+            if max_pages:  # a falsy max_pages (None or 0) means convert every page
+                pages_to_convert = min(max_pages, total_pages)
+            else:
+                pages_to_convert = total_pages
+            
+            # Render pages 1..pages_to_convert of the PDF as images
+            images = convert_from_path(
+                pdf_path,
+                first_page=1,
+                last_page=pages_to_convert,
+                dpi=200,  # a moderate DPI
+                fmt='jpeg'
+            )
+            
+            return images, None
+            
+        except Exception as e:
+            return None, f"PDF转图像失败: {str(e)}"
+    
+    def extract_text_from_image(self, image):
+        """Extract text from an image with the vision model; returns (text, None), ('', error) on failure, or (None, error) for a non-vision model."""
+        if not self.is_vision_model():
+            return None, "当前模型不是视觉模型，无法识别图像文字"
+        
+        try:
+            # Serialize the image as JPEG in memory and base64-encode it for the data URL below
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            
+            # Build the multimodal request: one user message with a text part and an image part
+            response = self.client.chat.completions.create(
+                model=self.llm_config['model'],
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字，不要添加任何解释或说明。保持原有的段落和格式。"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{img_base64}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                max_tokens=self.llm_config['max_tokens'],
+                temperature=0.1,
+                timeout=self.llm_config['timeout'],
+            )
+            
+            content = response.choices[0].message.content
+            return content.strip() if content else '', None  # empty or None content yields ''
+            
+        except Exception as e:
+            return '', f"视觉模型识别失败: {str(e)}"
+    
+    def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
+        """Extract text from a scanned PDF page by page via vision-model OCR; returns (pages_text, None) or ([], error)."""
+        images, error = self.pdf_to_images(pdf_path)
+        
+        if error:
+            return [], error
+        
+        pages_text = []
+        total = len(images)
+        
+        for i, image in enumerate(images):
+            if progress_callback:
+                progress_callback(int((i+1)/total*50), total, f"OCR识别第{i+1}页...")  # OCR is reported on a 0-50 scale
+            
+            text, err = self.extract_text_from_image(image)
+            
+            if err:  # a failed page does not abort the run; it is recorded with empty text and its error
+                pages_text.append({
+                    'page': i + 1,
+                    'text': '',
+                    'error': err
+                })
+            else:
+                pages_text.append({
+                    'page': i + 1,
+                    'text': text or '',
+                    'error': None
+                })
+        
+        return pages_text, None
+    
    def chunk_text(self, text, max_size=2000):
        """分块"""
        paragraphs = text.split('\n\n')
@@ -131,19 +250,38 @@ class TranslationService:
        Returns:
            翻译统计信息
        """
+        # 先尝试常规提取
        pages = self.extract_pdf_text(pdf_path)
        total_pages = len(pages)
-        
-        # 检查是否有可翻译内容
        total_text = sum(len(p['text']) for p in pages)
+        
+        # 如果无法提取文本，尝试使用视觉模型OCR
        if total_pages == 0 or total_text < 10:
-            error_msg = "PDF无法提取文本内容。可能原因：\n1. PDF是扫描版（图像形式），需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。"
-            if progress_callback:
-                progress_callback(0, 0, error_msg)
-            raise ValueError(error_msg)
+            if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
+                if progress_callback:
+                    progress_callback(0, 0, "检测到扫描版PDF，使用视觉模型OCR...")
+                
+                pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
+                
+                if error:
+                    raise ValueError(error)
+                
+                total_pages = len(pages)
+                total_text = sum(len(p['text']) for p in pages)
+                
+                if total_text < 10:
+                    raise ValueError("视觉模型OCR未能提取到有效文字内容")
+                
+                if progress_callback:
+                    progress_callback(50, total_pages, "OCR完成，开始翻译...")
+            else:
+                error_msg = "PDF无法提取文本内容。可能原因：\n1. PDF是扫描版（图像形式）\n2. 当前大模型不是视觉模型，无法识别图像文字\n\n如需处理扫描版PDF，请配置视觉大模型（如 glm-4.6v、gpt-4-vision）"
+                if progress_callback:
+                    progress_callback(0, 0, error_msg)
+                raise ValueError(error_msg)
        
        if progress_callback:
-            progress_callback(0, total_pages, "开始翻译...")
+            progress_callback(50, total_pages, "开始翻译...")
        
        translated_pages = []
        total_chunks = 0
@@ -158,8 +296,10 @@ class TranslationService:
                translated_chunks.append(translated)
                
                if progress_callback:
-                    progress = int((i + 1) / len(chunks) * 100 / total_pages)
-                    progress_callback(progress, total_pages, f"翻译第{page_data['page']}页")
+                    # OCR占50%，翻译占50%
+                    page_progress = (i + 1) / len(chunks)
+                    overall_progress = 50 + int(page_progress * 50 / total_pages)
+                    progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}页")
            
            translated_pages.append({
                'page': page_data['page'],