Fix: 修复无扩展名文件无法读取的问题
- 增加多种编码尝试
- 支持无扩展名文件识别
- 增强文件读取容错性
This commit is contained in:
23
services.py
23
services.py
@@ -272,6 +272,16 @@ class DocumentIndexer:
|
||||
"""读取文档内容"""
|
||||
ext = os.path.splitext(filepath)[1].lower()
|
||||
|
||||
# 尝试读取文本文件(包括没有扩展名的)
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
if content.strip(): # 如果能读取到内容
|
||||
return content
|
||||
except:
|
||||
pass
|
||||
|
||||
# 按扩展名处理特定格式
|
||||
if ext in ['.txt', '.md', '.json', '.html']:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
return f.read()
|
||||
@@ -295,6 +305,19 @@ class DocumentIndexer:
|
||||
except:
|
||||
pass
|
||||
|
||||
# 最后尝试以二进制方式读取并解码
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
content = f.read()
|
||||
# 尝试多种编码
|
||||
for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
|
||||
try:
|
||||
return content.decode(encoding)
|
||||
except:
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _split_content(self, content):
|
||||
|
||||
Reference in New Issue
Block a user