fix: 添加PDF文本提取检测,提示扫描版PDF无法翻译
- 检查PDF是否能提取文本内容
- 如果是扫描版PDF(图像形式),抛出明确错误提示
- 需要OCR处理的PDF会提示用户使用文本版PDF
This commit is contained in:
@@ -134,6 +134,14 @@ class TranslationService:
|
|||||||
pages = self.extract_pdf_text(pdf_path)
|
pages = self.extract_pdf_text(pdf_path)
|
||||||
total_pages = len(pages)
|
total_pages = len(pages)
|
||||||
|
|
||||||
|
# 检查是否有可翻译内容
|
||||||
|
total_text = sum(len(p['text']) for p in pages)
|
||||||
|
if total_pages == 0 or total_text < 10:
|
||||||
|
error_msg = "PDF无法提取文本内容。可能原因:\n1. PDF是扫描版(图像形式),需要OCR处理\n2. PDF为空或加密\n请使用包含可提取文本的PDF文件。"
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(0, 0, error_msg)
|
||||||
|
raise ValueError(error_msg)
|
||||||
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(0, total_pages, "开始翻译...")
|
progress_callback(0, total_pages, "开始翻译...")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user