Fix: 修复无扩展名文件无法读取的问题
- 增加多种编码尝试
- 支持无扩展名文件识别
- 增强文件读取容错性
This commit is contained in:
9226
documents/txt
Normal file
9226
documents/txt
Normal file
File diff suppressed because it is too large
Load Diff
9226
documents/txt_20260407235930
Normal file
9226
documents/txt_20260407235930
Normal file
File diff suppressed because it is too large
Load Diff
8084
documents/txt_20260407235940
Normal file
8084
documents/txt_20260407235940
Normal file
File diff suppressed because it is too large
Load Diff
23
services.py
23
services.py
@@ -272,6 +272,16 @@ class DocumentIndexer:
|
|||||||
"""读取文档内容"""
|
"""读取文档内容"""
|
||||||
ext = os.path.splitext(filepath)[1].lower()
|
ext = os.path.splitext(filepath)[1].lower()
|
||||||
|
|
||||||
|
# 尝试读取文本文件(包括没有扩展名的)
|
||||||
|
try:
|
||||||
|
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
|
content = f.read()
|
||||||
|
if content.strip(): # 如果能读取到内容
|
||||||
|
return content
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 按扩展名处理特定格式
|
||||||
if ext in ['.txt', '.md', '.json', '.html']:
|
if ext in ['.txt', '.md', '.json', '.html']:
|
||||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
@@ -295,6 +305,19 @@ class DocumentIndexer:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 最后尝试以二进制方式读取并解码
|
||||||
|
try:
|
||||||
|
with open(filepath, 'rb') as f:
|
||||||
|
content = f.read()
|
||||||
|
# 尝试多种编码
|
||||||
|
for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
|
||||||
|
try:
|
||||||
|
return content.decode(encoding)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _split_content(self, content):
|
def _split_content(self, content):
|
||||||
|
|||||||
Reference in New Issue
Block a user