From 17a442b144ddd1761e83c7e1ee9f116a8186661b Mon Sep 17 00:00:00 2001
From: coder
Date: Thu, 16 Apr 2026 23:02:59 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=89=AB=E6=8F=8F=E7=89=88PDF=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E8=A7=86=E8=A7=89=E6=A8=A1=E5=9E=8BOCR=E8=AF=86?=
 =?UTF-8?q?=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
---
Reviewer notes (ignored by git-am, not part of the commit message):
- This copy was rebuilt from a whitespace-collapsed patch; line breaks and
  indentation are reconstructed, so check the context lines against
  services.py before applying. The post-image blob id on the index line
  predates the review changes listed below.
- is_vision_model tolerates a missing/None model name.
- pdf_to_images ignores a non-positive max_pages instead of passing it on.
- extract_text_from_image converts non-RGB images before the JPEG save and
  returns '' (not None) as the text on every failure path.
- extract_text_from_scanned_pdf reports completed pages as progress and
  returns the first OCR error when every page failed.

 services.py | 197 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 188 insertions(+), 9 deletions(-)

diff --git a/services.py b/services.py
index 376d6b8..a8d792c 100644
--- a/services.py
+++ b/services.py
@@ -7,10 +7,20 @@ import json
 import time
 import hashlib
 import threading
+import base64
+import io
 from datetime import datetime, timedelta
 from pypdf import PdfReader
 from openai import OpenAI
 from flask import current_app
+from PIL import Image
+
+# pdf2image renders PDF pages to images; it is an optional dependency
+try:
+    from pdf2image import convert_from_path
+    PDF_TO_IMAGE_AVAILABLE = True
+except ImportError:
+    PDF_TO_IMAGE_AVAILABLE = False
 
 # ==================== LLM客户端 ====================
 class TranslationService:
@@ -99,6 +109,154 @@ class TranslationService:
         text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
         return text.strip()
 
+    def is_vision_model(self):
+        """Return True when the configured LLM model name looks vision-capable.
+
+        The check is a case-insensitive substring match of
+        ``self.llm_config['model']`` against a list of known markers.
+        """
+        # ``or ''`` guards against a config that stores ``model: None``.
+        model = (self.llm_config.get('model') or '').lower()
+        # Substrings that identify common vision models
+        vision_models = ('vision', 'vlm', 'glm-4v', 'glm-4.6v', 'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3')
+        return any(v in model for v in vision_models)
+
+    def pdf_to_images(self, pdf_path, max_pages=None):
+        """Render the pages of a PDF file to images.
+
+        Args:
+            pdf_path: Path of the PDF file to render.
+            max_pages: Optional cap on the number of leading pages to
+                render; ``None`` (or any non-positive value) renders all.
+
+        Returns:
+            ``(images, None)`` on success, where ``images`` is the list
+            returned by ``convert_from_path``; ``(None, message)`` when
+            pdf2image is not installed or the conversion fails.
+        """
+        if not PDF_TO_IMAGE_AVAILABLE:
+            return None, "pdf2image未安装,无法处理扫描版PDF。请安装: pip install pdf2image"
+
+        try:
+            # Count the pages so that last_page never exceeds the document
+            reader = PdfReader(pdf_path)
+            total_pages = len(reader.pages)
+
+            if max_pages is not None and max_pages > 0:
+                pages_to_convert = min(max_pages, total_pages)
+            else:
+                pages_to_convert = total_pages
+
+            # Render the selected page range
+            images = convert_from_path(
+                pdf_path,
+                first_page=1,
+                last_page=pages_to_convert,
+                dpi=200,  # a reasonable DPI
+                fmt='jpeg'
+            )
+
+            return images, None
+
+        except Exception as e:
+            return None, f"PDF转图像失败: {str(e)}"
+
+    def extract_text_from_image(self, image):
+        """Extract the text in an image with the configured vision model.
+
+        Args:
+            image: A PIL image (for example one page from ``pdf_to_images``).
+
+        Returns:
+            ``(text, None)`` on success, ``('', message)`` when the model
+            is not a vision model or the request fails.
+        """
+        if not self.is_vision_model():
+            return '', "当前模型不是视觉模型,无法识别图像文字"
+
+        try:
+            # JPEG cannot store alpha/palette modes, so normalize to RGB first
+            if image.mode not in ('RGB', 'L'):
+                image = image.convert('RGB')
+
+            # Encode the image as base64 JPEG for the data URL below
+            buffered = io.BytesIO()
+            image.save(buffered, format="JPEG")
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+            # Build the multimodal (text + image) chat request
+            response = self.client.chat.completions.create(
+                model=self.llm_config['model'],
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字,不要添加任何解释或说明。保持原有的段落和格式。"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{img_base64}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                max_tokens=self.llm_config['max_tokens'],
+                temperature=0.1,
+                timeout=self.llm_config['timeout'],
+            )
+
+            content = response.choices[0].message.content
+            return content.strip() if content else '', None
+
+        except Exception as e:
+            return '', f"视觉模型识别失败: {str(e)}"
+
+    def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
+        """Extract text from a scanned PDF by OCR-ing every page image.
+
+        Args:
+            pdf_path: Path of the scanned PDF file.
+            progress_callback: Optional callable invoked before each page
+                as ``progress_callback(percent, total_pages, message)``;
+                OCR is reported on a 0-50 scale.
+
+        Returns:
+            ``(pages, None)`` where each page is a dict with the keys
+            ``page`` (1-based), ``text`` and ``error``; ``([], message)``
+            when the PDF cannot be rendered or OCR failed on every page.
+        """
+        images, error = self.pdf_to_images(pdf_path)
+
+        if error:
+            return [], error
+
+        pages_text = []
+        total = len(images)
+
+        for i, image in enumerate(images):
+            if progress_callback:
+                # Report the pages finished so far, not the one just started
+                progress_callback(int(i / total * 50), total, f"OCR识别第{i+1}页...")
+
+            text, err = self.extract_text_from_image(image)
+
+            pages_text.append({
+                'page': i + 1,
+                'text': '' if err else (text or ''),
+                'error': err
+            })
+
+        # When no page succeeded, surface the underlying error instead of
+        # returning empty pages that hide why OCR failed
+        if pages_text and all(p['error'] for p in pages_text):
+            return [], pages_text[0]['error']
+
+        return pages_text, None
+
     def chunk_text(self, text, max_size=2000):
         """分块"""
         paragraphs = text.split('\n\n')
@@ -131,19 +289,38 @@ class TranslationService:
         Returns:
             翻译统计信息
         """
+        # Try regular text extraction first
         pages = self.extract_pdf_text(pdf_path)
         total_pages = len(pages)
-
-        # 检查是否有可翻译内容
         total_text = sum(len(p['text']) for p in pages)
+
+        # Fall back to vision-model OCR when no text could be extracted
         if total_pages == 0 or total_text < 10:
-            error_msg = "PDF无法提取文本内容。可能原因:\n1. PDF是扫描版(图像形式),需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。"
-            if progress_callback:
-                progress_callback(0, 0, error_msg)
-            raise ValueError(error_msg)
+            if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
+                if progress_callback:
+                    progress_callback(0, 0, "检测到扫描版PDF,使用视觉模型OCR...")
+
+                pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
+
+                if error:
+                    raise ValueError(error)
+
+                total_pages = len(pages)
+                total_text = sum(len(p['text']) for p in pages)
+
+                if total_text < 10:
+                    raise ValueError("视觉模型OCR未能提取到有效文字内容")
+
+                if progress_callback:
+                    progress_callback(50, total_pages, "OCR完成,开始翻译...")
+            else:
+                error_msg = "PDF无法提取文本内容。可能原因:\n1. PDF是扫描版(图像形式)\n2. 当前大模型不是视觉模型,无法识别图像文字\n\n如需处理扫描版PDF,请配置视觉大模型(如 glm-4.6v、gpt-4-vision)"
+                if progress_callback:
+                    progress_callback(0, 0, error_msg)
+                raise ValueError(error_msg)
 
         if progress_callback:
-            progress_callback(0, total_pages, "开始翻译...")
+            progress_callback(50, total_pages, "开始翻译...")
 
         translated_pages = []
         total_chunks = 0
@@ -158,8 +335,10 @@ class TranslationService:
                 translated_chunks.append(translated)
 
                 if progress_callback:
-                    progress = int((i + 1) / len(chunks) * 100 / total_pages)
-                    progress_callback(progress, total_pages, f"翻译第{page_data['page']}页")
+                    # OCR accounts for the first 50%, translation for the rest
+                    page_progress = (i + 1) / len(chunks)
+                    overall_progress = 50 + int(page_progress * 50 / total_pages)
+                    progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}页")
 
             translated_pages.append({
                 'page': page_data['page'],