2 Commits
v1.1.2 ... main

Author SHA1 Message Date
ab6469b9dd feat: 添加索引模式开关 - 支持简单索引模式(不使用LLM)
新增配置:
- USE_LLM_INDEX = False (默认关闭LLM增强索引)

简单索引模式:
- 不调用LLM,速度快
- 使用词频统计提取关键词
- 适合快速建立索引

LLM增强模式(USE_LLM_INDEX=True):
- 使用LLM分析文档,提取摘要、关键词、实体
- 索引质量更高,但速度慢、需要LLM服务
2026-04-09 17:28:37 +08:00
bdbfa2a176 fix: 修复文档读取编码问题 - 支持GBK/GB2312等中文编码
问题: 上传的GBK编码文档显示乱码
解决: 自动检测并尝试多种编码(utf-8, gbk, gb2312, gb18030, big5等)
2026-04-09 17:13:02 +08:00
2 changed files with 92 additions and 25 deletions

View File

@@ -33,6 +33,12 @@ DOC_CONFIG = {
"max_summary_length": 500, # 摘要最大长度
}
# ==================== Index mode ====================
# USE_LLM_INDEX: whether to use LLM-enhanced indexing.
# True  - use the LLM to analyze each document and extract keywords,
#         summaries, etc.; higher-quality index, but slower and requires
#         a running LLM service.
# False - use simple tokenization + term-frequency statistics; fast and
#         needs no LLM service.
USE_LLM_INDEX = False # LLM-enhanced indexing is off by default
# ==================== 索引配置 ====================
INDEX_CONFIG = {
# BM25参数

View File

@@ -12,7 +12,7 @@ from collections import Counter
from openai import OpenAI
from flask import current_app
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG, USE_LLM_INDEX
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
@@ -252,15 +252,25 @@ class DocumentIndexer:
doc.content = content
doc.word_count = len(content)
# 使用LLM分析整个文档
print(f" 正在分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
# 检查是否使用LLM增强索引
use_llm = USE_LLM_INDEX
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
if use_llm:
# 使用LLM分析整个文档
print(f" 正在使用LLM分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
else:
# 简单模式不使用LLM
print(f" 正在索引文档(简单模式): {doc.filename}")
# 从内容中提取简单关键词
simple_keywords = self._extract_simple_keywords(content)
doc.set_keywords(simple_keywords[:20])
# 分块处理
chunks = self._split_content(content)
@@ -279,10 +289,14 @@ class DocumentIndexer:
end_char=len(chunk_content)
)
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
if use_llm:
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
else:
# 简单模式:从分块提取关键词
chunk.set_keywords(self._extract_simple_keywords(chunk_content)[:10])
# 计算词频
term_freq = self._compute_term_freq(chunk_content)
@@ -313,23 +327,70 @@ class DocumentIndexer:
print(f" ✗ 索引失败: {e}")
return False
def _extract_simple_keywords(self, content):
"""简单提取关键词不使用LLM"""
import re
from collections import Counter
# 提取中文词组2-4字
chinese_words = re.findall(r'[\u4e00-\u9fff]{2,4}', content)
# 过滤常见无意义词
stopwords = {'', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '',
'一个', '一些', '这种', '那种', '什么', '怎么', '如何', '为什么'}
# 统计词频
word_freq = Counter(w for w in chinese_words if w not in stopwords)
# 返回高频词
return [w for w, _ in word_freq.most_common(30)]
def _read_document(self, filepath):
"""读取文档内容"""
ext = os.path.splitext(filepath)[1].lower()
# 尝试读取文本文件(包括没有扩展名的)
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
if content.strip(): # 如果能读取到内容
return content
except:
pass
# 尝试多种编码读取文本文件
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
# 按扩展名处理特定格式
if ext in ['.txt', '.md', '.json', '.html']:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def try_read_with_encoding(encodings_list):
for enc in encodings_list:
try:
with open(filepath, 'r', encoding=enc) as f:
content = f.read()
# 检查是否有有效的中文字符
if content.strip() and len(content) > 0:
# 简单验证:检查是否有乱码
# 如果内容看起来合理,返回它
return content, enc
except (UnicodeDecodeError, UnicodeError):
continue
return None, None
# 按扩展名处理
if ext in ['.txt', '.md', '.json', '.html', '']:
# 先尝试常见编码
content, used_enc = try_read_with_encoding(encodings)
if content:
print(f" 使用编码 {used_enc} 读取文件")
return content
# 如果都失败,尝试二进制读取后解码
try:
with open(filepath, 'rb') as f:
raw = f.read()
# 尝试chardet检测编码
try:
import chardet
detected = chardet.detect(raw)
if detected['encoding']:
return raw.decode(detected['encoding'])
except:
pass
# 最后尝试
return raw.decode('utf-8', errors='replace')
except:
return ""
elif ext == '.pdf':
try: