feat: 扫描版PDF支持视觉模型OCR识别

- 添加 pdf_to_images 将PDF页面转为图像
- 添加 extract_text_from_image 使用视觉模型OCR识别图像文字
- 检测扫描版PDF自动切换OCR模式
- glm-4.6v 等视觉模型可识别图像中的文字
- 进度提示显示OCR识别过程
fix: 添加PDF文本提取检测，提示扫描版PDF无法翻译
2026-04-16 23:02:59 +08:00 · 2026-04-16 22:06:03 +08:00 · 2026-04-16 21:50:24 +08:00
3 changed files with 496 additions and 5 deletions
--- a/app.py
+++ b/app.py
@@ -445,6 +445,36 @@ def upload_pdf():
    })
@app.route('/api/config')
def api_config():
    """Return the system configuration as JSON.

    The response contains ``site_name``, ``max_file_size`` and
    ``cache_expire_days`` taken from the site configuration, plus the LLM
    configuration under ``llm_config``.
    """
    from admin import get_llm_config, get_site_config

    # Read the site configuration once so all three fields come from the
    # same snapshot instead of three separate lookups.
    site_config = get_site_config()

    # NOTE(review): no auth check is visible on this route and the LLM
    # configuration is returned unfiltered. If get_llm_config() includes
    # credentials (e.g. an API key) they would be exposed here -- confirm
    # and redact if so.
    return jsonify({
        'site_name': site_config.get('site_name'),
        'max_file_size': site_config.get('max_file_size'),
        'cache_expire_days': site_config.get('cache_expire_days'),
        'llm_config': get_llm_config()
    })
@app.route('/api/translations')
def api_translations_list():
    """Return the current user's translation records as JSON.

    A logged-in user gets up to 20 of their records, newest first.
    When there is no current user the list is empty.
    """
    current = get_current_user()
    if not current:
        # Visitors have no history to show.
        return jsonify({'translations': []})

    own_records = Translation.query.filter_by(user_id=current.id)
    newest_first = own_records.order_by(Translation.created_at.desc())
    records = newest_first.limit(20).all()
    return jsonify({
        'translations': [record.to_dict() for record in records]
    })
@app.route('/api/status/<int:translation_id>')
 def translation_status(translation_id):
    """获取翻译状态"""
@@ -462,8 +492,12 @@ def translation_status(translation_id):
        'id': translation.id,
        'status': translation.status,
        'progress': translation.progress,
        'filename': translation.original_filename,
        'pages': translation.page_count,
        'from_cache': translation.from_cache,
-        'error': translation.error_message
+        'error': translation.error_message,
        'created_at': translation.created_at.isoformat() if translation.created_at else None,
        'completed_at': translation.completed_at.isoformat() if translation.completed_at else None,
    })
--- a/pdf_translate.py
+++ b/pdf_translate.py
@@ -0,0 +1,307 @@
 #!/usr/bin/env python3
 """
 PDF翻译命令行工具
 使用方法:
    pdf_translate translate <pdf_file> [--instruction "翻译要求"] [--user <username>] [--password <password>]
    pdf_translate list [--user <username>] [--password <password>]
    pdf_translate status <translation_id> [--user <username>] [--password <password>]
    pdf_translate download <translation_id> <output_file> [--user <username>] [--password <password>]
    pdf_translate config
 """
 import argparse
 import sys
 import os
 import json
 import requests
 from pathlib import Path
 # Base URL of the API; used as the default ``api_base`` of every request helper below
 API_BASE = "http://localhost:19000"
 def login(username, password, api_base=API_BASE):
     """Log in and return the session cookies.

     Posts the credentials as JSON to ``{api_base}/api/login``. Returns the
     response's cookie jar when the reply reports ``success``; otherwise
     prints the reason and returns ``None``. Any exception raised along the
     way is printed and also results in ``None``.
     """
     credentials = {"username": username, "password": password}
     try:
         reply = requests.post(f"{api_base}/api/login", json=credentials, timeout=10)
         payload = reply.json()
         if not payload.get('success'):
             print(f"❌ 登录失败: {payload.get('error', '未知错误')}")
             return None
         print(f"✅ 登录成功: {username}")
         return reply.cookies
     except Exception as exc:
         print(f"❌ 登录请求失败: {exc}")
         return None
 def get_user_info(cookies, api_base=API_BASE):
     """Fetch the current user's info from ``{api_base}/api/user/info``.

     Args:
         cookies: Session cookies sent with the request.
         api_base: Base URL of the API.

     Returns:
         The value of the ``user`` field of the JSON reply, or ``None`` when
         the request or the decoding fails.
     """
     url = f"{api_base}/api/user/info"
     try:
         response = requests.get(url, cookies=cookies, timeout=10)
         result = response.json()
         return result.get('user')
     except Exception:
         # Best-effort lookup: failures still yield None, but a bare
         # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
         return None
 def translate_pdf(pdf_file, instruction=None, cookies=None, api_base=API_BASE):
     """Upload a PDF for translation and wait for it to finish.

     The file is posted to ``{api_base}/api/upload`` together with the
     optional ``instruction``. Unless the server reports a cached result,
     the translation status is polled afterwards.

     Returns:
         The translation id from the server's reply, or ``None`` when the
         file is missing, the upload is rejected, or an exception occurs.
     """
     if not os.path.exists(pdf_file):
         print(f"❌ 文件不存在: {pdf_file}")
         return None

     endpoint = f"{api_base}/api/upload"
     print(f"📤 上传文件: {pdf_file}")
     try:
         form_fields = {'instruction': instruction} if instruction else {}
         with open(pdf_file, 'rb') as handle:
             reply = requests.post(
                 endpoint,
                 files={'file': handle},
                 data=form_fields,
                 cookies=cookies,
                 timeout=60,
             )
             payload = reply.json()

         if reply.status_code != 200:
             print(f"❌ 上传失败: {payload.get('error', '未知错误')}")
             return None

         translation_id = payload.get('translation_id')
         task_id = payload.get('task_id')
         cached = payload.get('from_cache', False)
         print("✅ 上传成功!")
         print(f"   翻译ID: {translation_id}")
         print(f"   任务ID: {task_id}")
         print(f"   使用缓存: {'是' if cached else '否'}")

         # A cached result is already complete; only fresh jobs need polling.
         if not cached:
             print("\n🔄 等待翻译完成...")
             poll_translation_status(translation_id, cookies, api_base)
         return translation_id
     except Exception as exc:
         print(f"❌ 上传请求失败: {exc}")
         return None
 def poll_translation_status(translation_id, cookies=None, api_base=API_BASE,
                             max_wait=300, interval=5):
     """Poll the status endpoint until the translation finishes or time runs out.

     Args:
         translation_id: Id of the translation to watch.
         cookies: Session cookies sent with every request.
         api_base: Base URL of the API.
         max_wait: Maximum total time to wait, in seconds (default 5 minutes).
         interval: Pause between two polls, in seconds.

     Returns:
         ``True`` when the status becomes ``completed``; ``False`` when it
         becomes ``failed`` or the wait times out.
     """
     import time

     url = f"{api_base}/api/status/{translation_id}"
     # Use a wall-clock deadline: counting only the sleeps would ignore the
     # time spent inside (possibly timing-out) requests and overshoot max_wait.
     deadline = time.monotonic() + max_wait
     while time.monotonic() < deadline:
         try:
             response = requests.get(url, cookies=cookies, timeout=10)
             result = response.json()
             status = result.get('status', 'unknown')
             progress = result.get('progress', 0)
             print(f"   状态: {status}, 进度: {progress}%")
             if status == 'completed':
                 print("✅ 翻译完成!")
                 return True
             if status == 'failed':
                 print(f"❌ 翻译失败: {result.get('error', '未知错误')}")
                 return False
         except Exception as e:
             # A failed poll is not fatal; report it and retry after the pause.
             print(f"⚠️ 状态查询失败: {e}")
         time.sleep(interval)
     print("⚠️ 等待超时，请在网页查看结果")
     return False
 def list_translations(cookies=None, api_base=API_BASE):
     """Print a table of the translation records returned by the server.

     Fetches ``{api_base}/api/translations`` and prints one row per record
     (id, file name, status, progress, creation time). Prints a notice when
     there are no records, and an error line when the request fails.
     """
     url = f"{api_base}/api/translations"
     try:
         response = requests.get(url, cookies=cookies, timeout=10)
         result = response.json()
         translations = result.get('translations', [])
         if not translations:
             print("暂无翻译记录")
             return
         print(f"\n📋 翻译记录 (共{len(translations)}条):\n")
         print("ID    | 文件名           | 状态      | 进度 | 时间")
         print("-" * 60)
         for t in translations:
             # ``dict.get(key, default)`` only covers a missing key. A key
             # that is present with a null value would break the slicing and
             # the width format specs below, so fall back explicitly.
             record_id = t.get('id')
             id_str = ('?' if record_id is None else str(record_id))[:4]
             filename = str(t.get('filename') or '?')[:15]
             status = str(t.get('status') or '?')
             progress = t.get('progress') or 0
             created = t.get('created_at') or '?'
             if created != '?':
                 # Keep "MM-DD HH:MM"; drop the ISO 8601 "T" separator.
                 created = str(created)[5:16].replace('T', ' ')
             print(f"{id_str:4} | {filename:15} | {status:8} | {progress:3}% | {created}")
     except Exception as e:
         print(f"❌ 获取列表失败: {e}")
 def get_translation_status(translation_id, cookies=None, api_base=API_BASE):
     """Print the status details of one translation.

     Fetches ``{api_base}/api/status/{translation_id}`` and prints status,
     progress, file name, page count, cache flag and, when present, the
     error text. A failed request is reported with an error line.
     """
     endpoint = f"{api_base}/api/status/{translation_id}"
     try:
         info = requests.get(endpoint, cookies=cookies, timeout=10).json()
         print(f"\n📊 翻译状态 (ID: {translation_id}):\n")
         print(f"   状态: {info.get('status', '未知')}")
         print(f"   进度: {info.get('progress', 0)}%")
         print(f"   文件: {info.get('filename', '未知')}")
         print(f"   页数: {info.get('pages', '?')}")
         cached_label = '是' if info.get('from_cache') else '否'
         print(f"   缓存: {cached_label}")
         failure = info.get('error')
         if failure:
             print(f"   错误: {failure}")
     except Exception as exc:
         print(f"❌ 获取状态失败: {exc}")
 def download_translation(translation_id, output_file, cookies=None, api_base=API_BASE):
     """Download a translation result and write it to ``output_file``.

     Returns:
         ``True`` when the server answered with status 200 and the body was
         written; ``False`` on any other status or on an exception.
     """
     endpoint = f"{api_base}/api/download/{translation_id}"
     try:
         reply = requests.get(endpoint, cookies=cookies, timeout=30)
         if reply.status_code == 200:
             with open(output_file, 'wb') as target:
                 target.write(reply.content)
             print(f"✅ 下载成功: {output_file}")
             return True
         print(f"❌ 下载失败: {reply.status_code}")
         return False
     except Exception as exc:
         print(f"❌ 下载失败: {exc}")
         return False
 def show_config(api_base=API_BASE):
     """Print the configuration reported by ``{api_base}/api/config``.

     Shows the site name, maximum file size, cache lifetime, and the
     ``api_base`` and ``model`` entries of the LLM configuration. A failed
     request is reported with an error line.
     """
     endpoint = f"{api_base}/api/config"
     try:
         settings = requests.get(endpoint, timeout=10).json()
         print("\n⚙️ 当前配置:\n")
         print(f"   网站名称: {settings.get('site_name', '未知')}")
         print(f"   最大文件: {settings.get('max_file_size', '?')}MB")
         print(f"   缓存天数: {settings.get('cache_expire_days', '?')}天")
         llm_settings = settings.get('llm_config', {})
         print(f"   大模型API: {llm_settings.get('api_base', '未设置')}")
         print(f"   模型: {llm_settings.get('model', '未设置')}")
     except Exception as exc:
         print(f"❌ 获取配置失败: {exc}")
 def main():
     """Parse the command line and run the selected sub-command.

     ``--user``, ``--password`` and ``--api`` are accepted both before and
     after the sub-command, so the invocations shown in the help epilog
     (options placed after the sub-command) are valid. When both a user name
     and a password are given, a login is performed first and its cookies
     are passed to the command.
     """
     parser = argparse.ArgumentParser(
         description="PDF翻译命令行工具",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 示例:
  pdf_translate translate test.pdf --user admin --password admin123
  pdf_translate translate test.pdf --instruction "保持专业术语原文"
  pdf_translate list --user admin --password admin123
  pdf_translate status 1
  pdf_translate download 1 result.md
  pdf_translate config
        """
     )
     parser.add_argument('--user', '-u', help='用户名')
     parser.add_argument('--password', '-p', help='密码')
     parser.add_argument('--api', default=API_BASE, help=f'API地址 (默认: {API_BASE})')

     # The same options again, attached to every sub-command. SUPPRESS keeps
     # a sub-parser from writing its own default over a value that was
     # already given before the sub-command.
     common = argparse.ArgumentParser(add_help=False)
     common.add_argument('--user', '-u', default=argparse.SUPPRESS, help='用户名')
     common.add_argument('--password', '-p', default=argparse.SUPPRESS, help='密码')
     common.add_argument('--api', default=argparse.SUPPRESS, help=f'API地址 (默认: {API_BASE})')

     subparsers = parser.add_subparsers(dest='command', help='命令')
     # translate command
     trans_parser = subparsers.add_parser('translate', parents=[common], help='翻译PDF文件')
     trans_parser.add_argument('file', help='PDF文件路径')
     trans_parser.add_argument('--instruction', '-i', help='翻译要求')
     # list command
     subparsers.add_parser('list', parents=[common], help='列出翻译记录')
     # status command
     status_parser = subparsers.add_parser('status', parents=[common], help='查看翻译状态')
     status_parser.add_argument('id', type=int, help='翻译ID')
     # download command
     download_parser = subparsers.add_parser('download', parents=[common], help='下载翻译结果')
     download_parser.add_argument('id', type=int, help='翻译ID')
     download_parser.add_argument('output', help='输出文件路径')
     # config command
     subparsers.add_parser('config', parents=[common], help='显示当前配置')

     args = parser.parse_args()
     # API address to use (the default unless the user overrode it)
     api_base_arg = args.api
     if not args.command:
         parser.print_help()
         return

     # Log in only when both user name and password were supplied
     cookies = None
     if args.user and args.password:
         cookies = login(args.user, args.password, api_base_arg)
         if not cookies:
             print("请检查用户名和密码")
             return
         user = get_user_info(cookies, api_base_arg)
         if user:
             print(f"👤 用户: {user.get('username')} ({user.get('user_type')})")

     # Run the selected command
     if args.command == 'translate':
         translation_id = translate_pdf(args.file, args.instruction, cookies, api_base_arg)
         if translation_id:
             print(f"\n💡 在网页查看结果: {api_base_arg}/history")
     elif args.command == 'list':
         list_translations(cookies, api_base_arg)
     elif args.command == 'status':
         get_translation_status(args.id, cookies, api_base_arg)
     elif args.command == 'download':
         download_translation(args.id, args.output, cookies, api_base_arg)
     elif args.command == 'config':
         show_config(api_base_arg)
 if __name__ == '__main__':
     main()
--- a/services.py
+++ b/services.py
@@ -7,10 +7,20 @@ import json
 import time
 import hashlib
 import threading
 import base64
 import io
 from datetime import datetime, timedelta
 from pypdf import PdfReader
 from openai import OpenAI
 from flask import current_app
 from PIL import Image
 # pdf2image 用于将PDF转为图像
 try:
    from pdf2image import convert_from_path
    PDF_TO_IMAGE_AVAILABLE = True
 except ImportError:
    PDF_TO_IMAGE_AVAILABLE = False
 # ==================== LLM客户端 ====================
 class TranslationService:
@@ -99,6 +109,115 @@ class TranslationService:
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
        return text.strip()
    def is_vision_model(self):
        """Report whether the configured model name looks like a vision model.

        The check is a case-insensitive substring match of
        ``self.llm_config['model']`` against a list of known vision-model
        name fragments. A missing or null model name yields ``False``.
        """
        # ``or ''`` also covers a key that is present but set to None, which
        # would otherwise fail on ``.lower()``.
        model = (self.llm_config.get('model') or '').lower()
        # Name fragments of common vision models
        vision_models = ['vision', 'vlm', 'glm-4v', 'glm-4.6v', 'gpt-4-vision', 'gpt-4o', 'qwen-vl', 'claude-3']
        return any(v in model for v in vision_models)
    def pdf_to_images(self, pdf_path, max_pages=None):
        """Render the pages of a PDF as images.

        Args:
            pdf_path: Path of the PDF file.
            max_pages: Optional cap on the number of pages rendered, counted
                from the first page. A falsy value renders every page.

        Returns:
            ``(images, None)`` on success, or ``(None, message)`` when
            pdf2image is unavailable or the conversion raises.
        """
        if not PDF_TO_IMAGE_AVAILABLE:
            return None, "pdf2image未安装，无法处理扫描版PDF。请安装: pip install pdf2image"

        try:
            # The page count decides where the rendered range ends.
            page_total = len(PdfReader(pdf_path).pages)
            last_page = min(max_pages, page_total) if max_pages else page_total
            rendered = convert_from_path(
                pdf_path,
                first_page=1,
                last_page=last_page,
                dpi=200,
                fmt='jpeg'
            )
            return rendered, None
        except Exception as exc:
            return None, f"PDF转图像失败: {str(exc)}"
    def extract_text_from_image(self, image):
        """Extract the text shown in an image using the configured vision model.

        The image is JPEG-encoded, base64-embedded in a data URL and sent
        with an OCR prompt as a single multimodal chat completion.

        Args:
            image: An image object providing ``mode``, ``convert`` and
                ``save`` (presumably a PIL ``Image`` -- TODO confirm against
                callers).

        Returns:
            ``(text, None)`` on success, with ``text`` stripped (``''`` when
            the model returned no content); ``(None, message)`` when the
            configured model is not a vision model; ``('', message)`` when
            encoding or the request raises.
        """
        if not self.is_vision_model():
            return None, "当前模型不是视觉模型，无法识别图像文字"
        try:
            # JPEG cannot store alpha or palette images (e.g. RGBA, P), so
            # normalise anything other than RGB/greyscale before encoding.
            if image.mode not in ('RGB', 'L'):
                image = image.convert('RGB')
            # Encode the image as base64 JPEG
            buffered = io.BytesIO()
            image.save(buffered, format="JPEG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
            # Build the multimodal request: OCR prompt plus inline image
            response = self.client.chat.completions.create(
                model=self.llm_config['model'],
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "请识别并提取这张图片中的所有文字内容。只输出提取的文字，不要添加任何解释或说明。保持原有的段落和格式。"
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{img_base64}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=self.llm_config['max_tokens'],
                temperature=0.1,
                timeout=self.llm_config['timeout'],
            )
            content = response.choices[0].message.content
            return content.strip() if content else '', None
        except Exception as e:
            return '', f"视觉模型识别失败: {str(e)}"
    def extract_text_from_scanned_pdf(self, pdf_path, progress_callback=None):
        """Extract text from a scanned PDF by running vision-model OCR per page.

        Args:
            pdf_path: Path of the PDF file.
            progress_callback: Optional callable invoked before each page as
                ``(percent, total_pages, message)``; ``percent`` ranges up
                to 50.

        Returns:
            ``(pages, None)`` where each entry is a dict with ``page``
            (1-based), ``text`` and ``error``; or ``([], message)`` when the
            PDF could not be turned into images. A page whose OCR failed
            keeps an empty ``text`` and carries the failure in ``error``.
        """
        images, error = self.pdf_to_images(pdf_path)
        if error:
            return [], error

        page_count = len(images)
        pages_text = []
        for page_no, image in enumerate(images, start=1):
            if progress_callback:
                percent = int(page_no / page_count * 50)
                progress_callback(percent, page_count, f"OCR识别第{page_no}页...")
            text, err = self.extract_text_from_image(image)
            pages_text.append({
                'page': page_no,
                'text': '' if err else (text or ''),
                'error': err if err else None
            })
        return pages_text, None
    def chunk_text(self, text, max_size=2000):
        """分块"""
        paragraphs = text.split('\n\n')
@@ -131,11 +250,38 @@ class TranslationService:
        Returns:
            翻译统计信息
        """
        # 先尝试常规提取
        pages = self.extract_pdf_text(pdf_path)
        total_pages = len(pages)
        total_text = sum(len(p['text']) for p in pages)
        # 如果无法提取文本，尝试使用视觉模型OCR
        if total_pages == 0 or total_text < 10:
            if self.is_vision_model() and PDF_TO_IMAGE_AVAILABLE:
                if progress_callback:
                    progress_callback(0, 0, "检测到扫描版PDF，使用视觉模型OCR...")
                pages, error = self.extract_text_from_scanned_pdf(pdf_path, progress_callback)
                if error:
                    raise ValueError(error)
                total_pages = len(pages)
                total_text = sum(len(p['text']) for p in pages)
                if total_text < 10:
                    raise ValueError("视觉模型OCR未能提取到有效文字内容")
                if progress_callback:
                    progress_callback(50, total_pages, "OCR完成，开始翻译...")
            else:
                error_msg = "PDF无法提取文本内容。可能原因：\n1. PDF是扫描版（图像形式）\n2. 当前大模型不是视觉模型，无法识别图像文字\n\n如需处理扫描版PDF，请配置视觉大模型（如 glm-4.6v、gpt-4-vision）"
                if progress_callback:
                    progress_callback(0, 0, error_msg)
                raise ValueError(error_msg)
        if progress_callback:
-            progress_callback(0, total_pages, "开始翻译...")
+            progress_callback(50, total_pages, "开始翻译...")
        translated_pages = []
        total_chunks = 0
@@ -150,8 +296,10 @@ class TranslationService:
                translated_chunks.append(translated)
                if progress_callback:
-                    progress = int((i + 1) / len(chunks) * 100 / total_pages)
+                    # OCR占50%，翻译占50%
-                    progress_callback(progress, total_pages, f"翻译第{page_data['page']}页")
+                    page_progress = (i + 1) / len(chunks)
                    overall_progress = 50 + int(page_progress * 50 / total_pages)
                    progress_callback(overall_progress, total_pages, f"翻译第{page_data['page']}页")
            translated_pages.append({
                'page': page_data['page'],
@@ -279,12 +427,14 @@ class TranslationTask:
                with app.app_context():
                    from admin import get_llm_config
                    llm_config = get_llm_config()
-                    config['LLM_CONFIG'] = llm_config
+                    config = {'LLM_CONFIG': llm_config}
            service = TranslationService(config)
            task['status'] = 'processing'
            task['started_at'] = datetime.now().isoformat()
            print(f"[翻译任务] 开始翻译，使用配置: {config.get('LLM_CONFIG', {}).get('api_base', '未知')}")
            # 更新数据库状态为 processing
            if app and translation_id:
                with app.app_context():