Fix: 修复无扩展名文件无法读取的问题

- 增加多种编码尝试
- 支持无扩展名文件识别
- 增强文件读取容错性
2026-04-08 00:10:58 +08:00
parent cdaadef10c
commit 3c862314c7
4 changed files with 26559 additions and 0 deletions
--- a/documents/txt
+++ b/documents/txt
--- a/documents/txt_20260407235930
+++ b/documents/txt_20260407235930
--- a/documents/txt_20260407235940
+++ b/documents/txt_20260407235940
--- a/services.py
+++ b/services.py
@@ -272,6 +272,16 @@ class DocumentIndexer:
        """读取文档内容"""
        ext = os.path.splitext(filepath)[1].lower()
        # 尝试读取文本文件（包括没有扩展名的）
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                if content.strip():  # 如果能读取到内容
                    return content
        except:
            pass
        # 按扩展名处理特定格式
        if ext in ['.txt', '.md', '.json', '.html']:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
@@ -295,6 +305,19 @@ class DocumentIndexer:
            except:
                pass
        # 最后尝试以二进制方式读取并解码
        try:
            with open(filepath, 'rb') as f:
                content = f.read()
                # 尝试多种编码
                for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
                    try:
                        return content.decode(encoding)
                    except:
                        continue
        except:
            pass
        return None
    def _split_content(self, content):