fix: 实现对比查看功能

- 读取翻译结果文件内容
- 尝试用 pypdf 从原PDF提取原文(仅读取PDF内嵌的文本层,例如此前OCR识别后写入的文字;不会重新执行OCR)
- 按页面分块显示对比
- 前端支持分页对比展示
This commit is contained in:
2026-04-16 23:45:57 +08:00
parent e5c9ea322a
commit 3479cbd04c
2 changed files with 102 additions and 16 deletions

59
app.py
View File

@@ -609,16 +609,65 @@ def compare_view(translation_id):
return jsonify({'error': '请登录后使用此功能'}), 401
translation = Translation.query.get(translation_id)
if not translation or translation.user_id != user.id:
if not translation or (translation.user_id != user.id and user.user_type != 'admin'):
return jsonify({'error': '无权访问'}), 403
# 生成对比文件
# TODO: 实现对比功能
# 读取翻译结果文件
translated_content = ''
if translation.output_path and os.path.exists(translation.output_path):
try:
with open(translation.output_path, 'r', encoding='utf-8') as f:
translated_content = f.read()
except Exception as e:
translated_content = f'读取失败: {str(e)}'
# 从翻译结果中提取各页内容
# 翻译结果格式是Markdown包含"## 第 X 页"分隔
original_pages = []
translated_pages = []
if translated_content:
# 解析翻译结果的页面结构
import re
page_pattern = r'## 第 (\d+) 页\n\n(.*?)\n\n---'
matches = re.findall(page_pattern, translated_content, re.DOTALL)
for page_num, content in matches:
translated_pages.append({
'page': int(page_num),
'content': content.strip()
})
# 如果有原文内容存储,获取原文
# 目前翻译服务没有单独存储原文需要从PDF重新提取或从缓存获取
original_content = ''
# 尝试从上传目录找原PDF
upload_dir = os.path.dirname(translation.output_path.replace('outputs', 'uploads').replace('_translated.md', ''))
possible_paths = [
translation.output_path.replace('outputs', 'uploads').replace('_translated.md', ''),
os.path.join(upload_dir, translation.original_filename),
]
for pdf_path in possible_paths:
if os.path.exists(pdf_path) and pdf_path.endswith('.pdf'):
try:
from pypdf import PdfReader
reader = PdfReader(pdf_path)
for page in reader.pages:
text = page.extract_text()
if text:
original_content += text + '\n\n'
except:
pass
break
return jsonify({
'id': translation.id,
'original': '原文内容',
'translated': '文内容'
'filename': translation.original_filename,
'original': original_content or '文内容从扫描版PDF提取',
'translated': translated_content,
'pages': translated_pages
})