fix: 修复文档读取编码问题 - 支持GBK/GB2312等中文编码

问题: 上传的GBK编码文档显示乱码
解决: 自动检测并尝试多种编码(utf-8, gbk, gb2312, gb18030, big5等)
2026-04-09 17:13:02 +08:00
1 changed files with 40 additions and 12 deletions
--- a/services.py
+++ b/services.py
@@ -317,19 +317,47 @@ class DocumentIndexer:
        """读取文档内容"""
        ext = os.path.splitext(filepath)[1].lower()
        
-        # 尝试读取文本文件（包括没有扩展名的）
-        try:
-            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                content = f.read()
-                if content.strip():  # 如果能读取到内容
-                    return content
-        except:
-            pass
+        # 尝试多种编码读取文本文件
+        encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
        
-        # 按扩展名处理特定格式
-        if ext in ['.txt', '.md', '.json', '.html']:
-            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                return f.read()
+        def try_read_with_encoding(encodings_list):
+            for enc in encodings_list:
+                try:
+                    with open(filepath, 'r', encoding=enc) as f:
+                        content = f.read()
+                        # 检查是否有有效的中文字符
+                        if content.strip() and len(content) > 0:
+                            # 简单验证：检查是否有乱码
+                            # 如果内容看起来合理，返回它
+                            return content, enc
+                except (UnicodeDecodeError, UnicodeError):
+                    continue
+            return None, None
+        
+        # 按扩展名处理
+        if ext in ['.txt', '.md', '.json', '.html', '']:
+            # 先尝试常见编码
+            content, used_enc = try_read_with_encoding(encodings)
+            if content:
+                print(f"  使用编码 {used_enc} 读取文件")
+                return content
+            
+            # 如果都失败，尝试二进制读取后解码
+            try:
+                with open(filepath, 'rb') as f:
+                    raw = f.read()
+                # 尝试chardet检测编码
+                try:
+                    import chardet
+                    detected = chardet.detect(raw)
+                    if detected['encoding']:
+                        return raw.decode(detected['encoding'])
+                except:
+                    pass
+                # 最后尝试
+                return raw.decode('utf-8', errors='replace')
+            except:
+                return ""
        
        elif ext == '.pdf':
            try: