Fix: 修复无扩展名文件无法读取的问题

- 增加多种编码尝试
- 支持无扩展名文件识别
- 增强文件读取容错性
This commit is contained in:
2026-04-08 00:10:58 +08:00
parent cdaadef10c
commit 3c862314c7
4 changed files with 26559 additions and 0 deletions

View File

@@ -272,6 +272,16 @@ class DocumentIndexer:
"""读取文档内容"""
ext = os.path.splitext(filepath)[1].lower()
# 尝试读取文本文件(包括没有扩展名的)
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
if content.strip(): # 如果能读取到内容
return content
except:
pass
# 按扩展名处理特定格式
if ext in ['.txt', '.md', '.json', '.html']:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
@@ -295,6 +305,19 @@ class DocumentIndexer:
except:
pass
# 最后尝试以二进制方式读取并解码
try:
with open(filepath, 'rb') as f:
content = f.read()
# 尝试多种编码
for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
try:
return content.decode(encoding)
except:
continue
except:
pass
return None
def _split_content(self, content):