1 Commits

Author SHA1 Message Date
bdbfa2a176 fix: 修复文档读取编码问题 - 支持GBK/GB2312等中文编码
问题: 上传的GBK编码文档显示乱码
解决: 自动检测并尝试多种编码(utf-8, gbk, gb2312, gb18030, big5等)
2026-04-09 17:13:02 +08:00

View File

@@ -317,19 +317,47 @@ class DocumentIndexer:
"""读取文档内容"""
ext = os.path.splitext(filepath)[1].lower()
# 尝试读取文本文件(包括没有扩展名的)
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
if content.strip(): # 如果能读取到内容
return content
except:
pass
# 尝试多种编码读取文本文件
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
# 按扩展名处理特定格式
if ext in ['.txt', '.md', '.json', '.html']:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def try_read_with_encoding(encodings_list):
    """Decode the file at ``filepath`` with the first encoding that fits.

    ``filepath`` is not a parameter; it is read from the enclosing scope.

    Args:
        encodings_list: Codec names to try, in priority order.

    Returns:
        ``(content, encoding)`` for the first codec that decodes the whole
        file strictly and yields non-blank text, otherwise ``(None, None)``.
        A file whose decoded text is empty or whitespace-only also yields
        ``(None, None)``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import io  # local import so the helper does not depend on file-level imports

    # The bytes do not change between attempts, so read them from disk once
    # instead of reopening the file for every candidate encoding.
    with open(filepath, 'rb') as f:
        raw = f.read()

    for enc in encodings_list:
        try:
            # TextIOWrapper gives the same strict decoding, BOM handling and
            # universal-newline translation as open(filepath, 'r', encoding=enc).
            with io.TextIOWrapper(io.BytesIO(raw), encoding=enc) as reader:
                content = reader.read()
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this encoding.
            # LookupError: the codec name is unknown to this interpreter.
            continue
        if not content.strip():
            # A clean decode that is blank means the file has no text.
            # Trying further encodings cannot recover anything and may
            # fabricate characters (e.g. UTF-16 turns b"  \n\n" into two
            # non-blank code points), so stop here.
            return None, None
        return content, enc
    return None, None
# 按扩展名处理
if ext in ['.txt', '.md', '.json', '.html', '']:
# 先尝试常见编码
content, used_enc = try_read_with_encoding(encodings)
if content:
print(f" 使用编码 {used_enc} 读取文件")
return content
# 如果都失败,尝试二进制读取后解码
try:
with open(filepath, 'rb') as f:
raw = f.read()
# 尝试chardet检测编码
try:
import chardet
detected = chardet.detect(raw)
if detected['encoding']:
return raw.decode(detected['encoding'])
except:
pass
# 最后尝试
return raw.decode('utf-8', errors='replace')
except:
return ""
elif ext == '.pdf':
try: