Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ab6469b9dd |
@@ -33,6 +33,12 @@ DOC_CONFIG = {
|
|||||||
"max_summary_length": 500, # 摘要最大长度
|
"max_summary_length": 500, # 摘要最大长度
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ==================== 索引模式 ====================
|
||||||
|
# USE_LLM_INDEX: whether to use LLM-enhanced indexing
|
||||||
|
# True - 使用LLM分析文档,提取关键词、摘要等(需要LLM服务,速度慢)
|
||||||
|
# False - 使用简单分词和词频统计(速度快,无需LLM)
|
||||||
|
USE_LLM_INDEX = False # 默认关闭LLM索引
|
||||||
|
|
||||||
# ==================== 索引配置 ====================
|
# ==================== 索引配置 ====================
|
||||||
INDEX_CONFIG = {
|
INDEX_CONFIG = {
|
||||||
# BM25参数
|
# BM25参数
|
||||||
|
|||||||
37
services.py
37
services.py
@@ -12,7 +12,7 @@ from collections import Counter
|
|||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from flask import current_app
|
from flask import current_app
|
||||||
|
|
||||||
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
|
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG, USE_LLM_INDEX
|
||||||
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
|
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
|
||||||
|
|
||||||
|
|
||||||
@@ -252,8 +252,12 @@ class DocumentIndexer:
|
|||||||
doc.content = content
|
doc.content = content
|
||||||
doc.word_count = len(content)
|
doc.word_count = len(content)
|
||||||
|
|
||||||
|
# 检查是否使用LLM增强索引
|
||||||
|
use_llm = USE_LLM_INDEX
|
||||||
|
|
||||||
|
if use_llm:
|
||||||
# 使用LLM分析整个文档
|
# 使用LLM分析整个文档
|
||||||
print(f" 正在分析文档: {doc.filename}")
|
print(f" 正在使用LLM分析文档: {doc.filename}")
|
||||||
analysis = self.llm.analyze_document(content, doc.title)
|
analysis = self.llm.analyze_document(content, doc.title)
|
||||||
|
|
||||||
doc.summary = analysis.get('summary', '')
|
doc.summary = analysis.get('summary', '')
|
||||||
@@ -261,6 +265,12 @@ class DocumentIndexer:
|
|||||||
doc.set_topics(analysis.get('topics', []))
|
doc.set_topics(analysis.get('topics', []))
|
||||||
doc.category = analysis.get('category', '')
|
doc.category = analysis.get('category', '')
|
||||||
doc.set_entities(analysis.get('entities', {}))
|
doc.set_entities(analysis.get('entities', {}))
|
||||||
|
else:
|
||||||
|
# 简单模式:不使用LLM
|
||||||
|
print(f" 正在索引文档(简单模式): {doc.filename}")
|
||||||
|
# 从内容中提取简单关键词
|
||||||
|
simple_keywords = self._extract_simple_keywords(content)
|
||||||
|
doc.set_keywords(simple_keywords[:20])
|
||||||
|
|
||||||
# 分块处理
|
# 分块处理
|
||||||
chunks = self._split_content(content)
|
chunks = self._split_content(content)
|
||||||
@@ -279,10 +289,14 @@ class DocumentIndexer:
|
|||||||
end_char=len(chunk_content)
|
end_char=len(chunk_content)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if use_llm:
|
||||||
# LLM分析分块
|
# LLM分析分块
|
||||||
chunk_analysis = self.llm.analyze_chunk(chunk_content)
|
chunk_analysis = self.llm.analyze_chunk(chunk_content)
|
||||||
chunk.summary = chunk_analysis.get('summary', '')
|
chunk.summary = chunk_analysis.get('summary', '')
|
||||||
chunk.set_keywords(chunk_analysis.get('keywords', []))
|
chunk.set_keywords(chunk_analysis.get('keywords', []))
|
||||||
|
else:
|
||||||
|
# 简单模式:从分块提取关键词
|
||||||
|
chunk.set_keywords(self._extract_simple_keywords(chunk_content)[:10])
|
||||||
|
|
||||||
# 计算词频
|
# 计算词频
|
||||||
term_freq = self._compute_term_freq(chunk_content)
|
term_freq = self._compute_term_freq(chunk_content)
|
||||||
@@ -313,6 +327,25 @@ class DocumentIndexer:
|
|||||||
print(f" ✗ 索引失败: {e}")
|
print(f" ✗ 索引失败: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _extract_simple_keywords(self, content):
|
||||||
|
"""简单提取关键词(不使用LLM)"""
|
||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
# 提取中文词组(2-4字)
|
||||||
|
chinese_words = re.findall(r'[\u4e00-\u9fff]{2,4}', content)
|
||||||
|
|
||||||
|
# 过滤常见无意义词
|
||||||
|
stopwords = {'的', '是', '在', '了', '和', '与', '或', '等', '这', '那',
|
||||||
|
'有', '为', '对', '也', '被', '把', '给', '向', '从', '到',
|
||||||
|
'一个', '一些', '这种', '那种', '什么', '怎么', '如何', '为什么'}
|
||||||
|
|
||||||
|
# 统计词频
|
||||||
|
word_freq = Counter(w for w in chinese_words if w not in stopwords)
|
||||||
|
|
||||||
|
# 返回高频词
|
||||||
|
return [w for w, _ in word_freq.most_common(30)]
|
||||||
|
|
||||||
def _read_document(self, filepath):
|
def _read_document(self, filepath):
|
||||||
"""读取文档内容"""
|
"""读取文档内容"""
|
||||||
ext = os.path.splitext(filepath)[1].lower()
|
ext = os.path.splitext(filepath)[1].lower()
|
||||||
|
|||||||
Reference in New Issue
Block a user