2 Commits
v1.1.2 ... main

Author SHA1 Message Date
ab6469b9dd feat: 添加索引模式开关 - 支持简单索引模式(不使用LLM)
新增配置:
- USE_LLM_INDEX = False (默认关闭LLM增强索引)

简单索引模式:
- 不调用LLM,速度快
- 使用词频统计提取关键词
- 适合快速建立索引

LLM增强模式(USE_LLM_INDEX=True):
- 使用LLM分析文档,提取摘要、关键词、实体
- 索引质量更高,但速度慢、需要LLM服务
2026-04-09 17:28:37 +08:00
bdbfa2a176 fix: 修复文档读取编码问题 - 支持GBK/GB2312等中文编码
问题: 上传的GBK编码文档显示乱码
解决: 自动检测并尝试多种编码(utf-8, gbk, gb2312, gb18030, big5等)
2026-04-09 17:13:02 +08:00
2 changed files with 92 additions and 25 deletions

View File

@@ -33,6 +33,12 @@ DOC_CONFIG = {
"max_summary_length": 500, # 摘要最大长度
}
# ==================== Index mode ====================
# USE_LLM_INDEX: whether to use LLM-enhanced indexing.
# True  - use the LLM to analyze each document and extract keywords,
#         summaries, etc.; higher-quality index, but slower and requires
#         a running LLM service.
# False - use simple tokenization + term-frequency statistics; fast and
#         needs no LLM service.
USE_LLM_INDEX = False # LLM-enhanced indexing is off by default
# ==================== 索引配置 ====================
INDEX_CONFIG = {
# BM25参数

View File

@@ -12,7 +12,7 @@ from collections import Counter
from openai import OpenAI
from flask import current_app
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG, USE_LLM_INDEX
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
@@ -252,15 +252,25 @@ class DocumentIndexer:
doc.content = content
doc.word_count = len(content)
# 使用LLM分析整个文档
print(f" 正在分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
# 检查是否使用LLM增强索引
use_llm = USE_LLM_INDEX
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
if use_llm:
# 使用LLM分析整个文档
print(f" 正在使用LLM分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
else:
# 简单模式不使用LLM
print(f" 正在索引文档(简单模式): {doc.filename}")
# 从内容中提取简单关键词
simple_keywords = self._extract_simple_keywords(content)
doc.set_keywords(simple_keywords[:20])
# 分块处理
chunks = self._split_content(content)
@@ -279,10 +289,14 @@ class DocumentIndexer:
end_char=len(chunk_content)
)
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
if use_llm:
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
else:
# 简单模式:从分块提取关键词
chunk.set_keywords(self._extract_simple_keywords(chunk_content)[:10])
# 计算词频
term_freq = self._compute_term_freq(chunk_content)
@@ -313,23 +327,70 @@ class DocumentIndexer:
print(f" ✗ 索引失败: {e}")
return False
def _extract_simple_keywords(self, content):
"""简单提取关键词不使用LLM"""
import re
from collections import Counter
# 提取中文词组2-4字
chinese_words = re.findall(r'[\u4e00-\u9fff]{2,4}', content)
# 过滤常见无意义词
stopwords = {'', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '',
'一个', '一些', '这种', '那种', '什么', '怎么', '如何', '为什么'}
# 统计词频
word_freq = Counter(w for w in chinese_words if w not in stopwords)
# 返回高频词
return [w for w, _ in word_freq.most_common(30)]
def _read_document(self, filepath):
"""读取文档内容"""
ext = os.path.splitext(filepath)[1].lower()
# 尝试读取文本文件(包括没有扩展名的)
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
if content.strip(): # 如果能读取到内容
return content
except:
pass
# 尝试多种编码读取文本文件
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
# 按扩展名处理特定格式
if ext in ['.txt', '.md', '.json', '.html']:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def try_read_with_encoding(encodings_list):
for enc in encodings_list:
try:
with open(filepath, 'r', encoding=enc) as f:
content = f.read()
# 检查是否有有效的中文字符
if content.strip() and len(content) > 0:
# 简单验证:检查是否有乱码
# 如果内容看起来合理,返回它
return content, enc
except (UnicodeDecodeError, UnicodeError):
continue
return None, None
# 按扩展名处理
if ext in ['.txt', '.md', '.json', '.html', '']:
# 先尝试常见编码
content, used_enc = try_read_with_encoding(encodings)
if content:
print(f" 使用编码 {used_enc} 读取文件")
return content
# 如果都失败,尝试二进制读取后解码
try:
with open(filepath, 'rb') as f:
raw = f.read()
# 尝试chardet检测编码
try:
import chardet
detected = chardet.detect(raw)
if detected['encoding']:
return raw.decode(detected['encoding'])
except:
pass
# 最后尝试
return raw.decode('utf-8', errors='replace')
except:
return ""
elif ext == '.pdf':
try: