Compare commits
1 Commit
| Author | SHA1 | Date | |
|---|---|---|---|
| | bdbfa2a176 | | |
52
services.py
52
services.py
@@ -317,19 +317,47 @@ class DocumentIndexer:
|
||||
"""读取文档内容"""
|
||||
ext = os.path.splitext(filepath)[1].lower()
|
||||
|
||||
# 尝试读取文本文件(包括没有扩展名的)
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
if content.strip(): # 如果能读取到内容
|
||||
return content
|
||||
except:
|
||||
pass
|
||||
# 尝试多种编码读取文本文件
|
||||
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
|
||||
|
||||
# 按扩展名处理特定格式
|
||||
if ext in ['.txt', '.md', '.json', '.html']:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
return f.read()
|
||||
def try_read_with_encoding(encodings_list):
|
||||
for enc in encodings_list:
|
||||
try:
|
||||
with open(filepath, 'r', encoding=enc) as f:
|
||||
content = f.read()
|
||||
# 检查是否有有效的中文字符
|
||||
if content.strip() and len(content) > 0:
|
||||
# 简单验证:检查是否有乱码
|
||||
# 如果内容看起来合理,返回它
|
||||
return content, enc
|
||||
except (UnicodeDecodeError, UnicodeError):
|
||||
continue
|
||||
return None, None
|
||||
|
||||
# 按扩展名处理
|
||||
if ext in ['.txt', '.md', '.json', '.html', '']:
|
||||
# 先尝试常见编码
|
||||
content, used_enc = try_read_with_encoding(encodings)
|
||||
if content:
|
||||
print(f" 使用编码 {used_enc} 读取文件")
|
||||
return content
|
||||
|
||||
# 如果都失败,尝试二进制读取后解码
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
raw = f.read()
|
||||
# 尝试chardet检测编码
|
||||
try:
|
||||
import chardet
|
||||
detected = chardet.detect(raw)
|
||||
if detected['encoding']:
|
||||
return raw.decode(detected['encoding'])
|
||||
except:
|
||||
pass
|
||||
# 最后尝试
|
||||
return raw.decode('utf-8', errors='replace')
|
||||
except:
|
||||
return ""
|
||||
|
||||
elif ext == '.pdf':
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user