4 Commits
v1.1.0 ... main

Author SHA1 Message Date
ab6469b9dd feat: 添加索引模式开关 - 支持简单索引模式(不使用LLM)
新增配置:
- USE_LLM_INDEX = False (默认关闭LLM增强索引)

简单索引模式:
- 不调用LLM,速度快
- 使用词频统计提取关键词
- 适合快速建立索引

LLM增强模式(USE_LLM_INDEX=True):
- 使用LLM分析文档,提取摘要、关键词、实体
- 索引质量更高,但速度慢、需要LLM服务
2026-04-09 17:28:37 +08:00
bdbfa2a176 fix: 修复文档读取编码问题 - 支持GBK/GB2312等中文编码
问题: 上传的GBK编码文档显示乱码
解决: 自动检测并尝试多种编码(utf-8, gbk, gb2312, gb18030, big5等)
2026-04-09 17:13:02 +08:00
8baecc520a fix: 修复文档列表显示问题和添加文档详情页面
修复:
- Document.to_dict() 添加 file_type 和 file_size 字段
- 文档列表正确显示类型和大小

新增:
- /documents/<id> 文档详情页面
- document_detail.html 模板(显示文档信息、分块内容、关键词等)
2026-04-09 17:02:17 +08:00
d54a105e55 fix: 修复搜索API报错 - 添加缺失的QueryLog导入 2026-04-09 16:52:54 +08:00
5 changed files with 316 additions and 27 deletions

10
app.py
View File

@@ -177,7 +177,7 @@ def api_upload_document():
@app.route('/api/documents/<int:doc_id>', methods=['GET'])
def api_get_document(doc_id):
"""获取文档详情"""
"""获取文档详情API"""
doc = Document.query.get_or_404(doc_id)
chunks = DocumentChunk.query.filter_by(document_id=doc_id).all()
@@ -188,6 +188,14 @@ def api_get_document(doc_id):
})
@app.route('/documents/<int:doc_id>')
def document_detail_page(doc_id):
    """Render the HTML detail page for a single document.

    Looks the document up with ``get_or_404`` and hands it, together with its
    chunks sorted by ``chunk_index``, to the ``document_detail.html`` template.
    """
    document = Document.query.get_or_404(doc_id)
    chunk_query = DocumentChunk.query.filter_by(document_id=doc_id)
    ordered_chunks = chunk_query.order_by(DocumentChunk.chunk_index).all()
    return render_template(
        'document_detail.html',
        doc=document,
        chunks=ordered_chunks,
    )
@app.route('/api/documents/<int:doc_id>', methods=['DELETE'])
def api_delete_document(doc_id):
"""删除文档"""

View File

@@ -33,6 +33,12 @@ DOC_CONFIG = {
"max_summary_length": 500, # 摘要最大长度
}
# ==================== Index mode ====================
# USE_LLM_INDEX: whether to build the index with LLM enhancement.
#   True  - analyse documents with the LLM to extract keywords, summaries, etc.
#           (requires an LLM service; slow)
#   False - simple tokenisation and term-frequency statistics
#           (fast; no LLM needed)
USE_LLM_INDEX = False # LLM-enhanced indexing is off by default
# ==================== 索引配置 ====================
INDEX_CONFIG = {
# BM25参数

View File

@@ -92,6 +92,8 @@ class Document(db.Model):
return {
'id': self.id,
'filename': self.filename,
'file_type': self.file_type,
'file_size': self.file_size,
'title': self.title,
'status': self.status,
'summary': self.summary,

View File

@@ -12,8 +12,8 @@ from collections import Counter
from openai import OpenAI
from flask import current_app
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG, USE_LLM_INDEX
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
def get_llm_config():
@@ -252,15 +252,25 @@ class DocumentIndexer:
doc.content = content
doc.word_count = len(content)
# 使用LLM分析整个文档
print(f" 正在分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
# 检查是否使用LLM增强索引
use_llm = USE_LLM_INDEX
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
if use_llm:
# 使用LLM分析整个文档
print(f" 正在使用LLM分析文档: {doc.filename}")
analysis = self.llm.analyze_document(content, doc.title)
doc.summary = analysis.get('summary', '')
doc.set_keywords(analysis.get('keywords', []))
doc.set_topics(analysis.get('topics', []))
doc.category = analysis.get('category', '')
doc.set_entities(analysis.get('entities', {}))
else:
# 简单模式:不使用LLM
print(f" 正在索引文档(简单模式): {doc.filename}")
# 从内容中提取简单关键词
simple_keywords = self._extract_simple_keywords(content)
doc.set_keywords(simple_keywords[:20])
# 分块处理
chunks = self._split_content(content)
@@ -279,10 +289,14 @@ class DocumentIndexer:
end_char=len(chunk_content)
)
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
if use_llm:
# LLM分析分块
chunk_analysis = self.llm.analyze_chunk(chunk_content)
chunk.summary = chunk_analysis.get('summary', '')
chunk.set_keywords(chunk_analysis.get('keywords', []))
else:
# 简单模式:从分块提取关键词
chunk.set_keywords(self._extract_simple_keywords(chunk_content)[:10])
# 计算词频
term_freq = self._compute_term_freq(chunk_content)
@@ -313,23 +327,70 @@ class DocumentIndexer:
print(f" ✗ 索引失败: {e}")
return False
def _extract_simple_keywords(self, content):
"""简单提取关键词不使用LLM"""
import re
from collections import Counter
# 提取中文词组2-4字
chinese_words = re.findall(r'[\u4e00-\u9fff]{2,4}', content)
# 过滤常见无意义词
stopwords = {'', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '',
'一个', '一些', '这种', '那种', '什么', '怎么', '如何', '为什么'}
# 统计词频
word_freq = Counter(w for w in chinese_words if w not in stopwords)
# 返回高频词
return [w for w, _ in word_freq.most_common(30)]
def _read_document(self, filepath):
"""读取文档内容"""
ext = os.path.splitext(filepath)[1].lower()
# 尝试读取文本文件(包括没有扩展名的)
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
if content.strip(): # 如果能读取到内容
return content
except:
pass
# 尝试多种编码读取文本文件
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
# 按扩展名处理特定格式
if ext in ['.txt', '.md', '.json', '.html']:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def try_read_with_encoding(encodings_list):
    """Return ``(text, encoding)`` for the first encoding that decodes the file.

    Each candidate in ``encodings_list`` is tried in order against
    ``filepath`` (taken from the enclosing scope). A candidate is accepted
    when decoding raises no Unicode error and the decoded text is not blank;
    no further validation of the text is performed. ``(None, None)`` is
    returned when no candidate is accepted.
    """
    for candidate in encodings_list:
        try:
            with open(filepath, 'r', encoding=candidate) as handle:
                text = handle.read()
        except (UnicodeDecodeError, UnicodeError):
            # This encoding cannot decode the file; move on to the next one.
            continue
        if not text.strip():
            # Blank result: treat as a miss and keep trying.
            continue
        return text, candidate
    return None, None
# 按扩展名处理
if ext in ['.txt', '.md', '.json', '.html', '']:
# 先尝试常见编码
content, used_enc = try_read_with_encoding(encodings)
if content:
print(f" 使用编码 {used_enc} 读取文件")
return content
# 如果都失败,尝试二进制读取后解码
try:
with open(filepath, 'rb') as f:
raw = f.read()
# 尝试chardet检测编码
try:
import chardet
detected = chardet.detect(raw)
if detected['encoding']:
return raw.decode(detected['encoding'])
except:
pass
# 最后尝试
return raw.decode('utf-8', errors='replace')
except:
return ""
elif ext == '.pdf':
try:

View File

@@ -0,0 +1,212 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ doc.title or doc.filename }} - 文档详情</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.0/font/bootstrap-icons.css" rel="stylesheet">
<style>
body { background-color: #f8f9fa; }
.chunk-card { border-left: 4px solid #667eea; }
</style>
</head>
<body>
<nav class="navbar navbar-expand-lg navbar-dark bg-dark">
<div class="container">
<a class="navbar-brand" href="/"><i class="bi bi-search"></i> LLM Index RAG</a>
<div class="navbar-nav ms-auto">
<a class="nav-link" href="/">首页</a>
<a class="nav-link" href="/documents">文档管理</a>
<a class="nav-link" href="/search">知识检索</a>
<a class="nav-link" href="/settings">系统设置</a>
</div>
</div>
</nav>
<div class="container py-4">
<nav aria-label="breadcrumb" class="mb-3">
<ol class="breadcrumb">
<li class="breadcrumb-item"><a href="/documents">文档管理</a></li>
<li class="breadcrumb-item active">{{ doc.title or doc.filename }}</li>
</ol>
</nav>
<!-- 文档信息 -->
<div class="card mb-4">
<div class="card-header d-flex justify-content-between align-items-center">
<h5 class="mb-0"><i class="bi bi-file-earmark-text"></i> 文档信息</h5>
<div>
{% if doc.status == 'indexed' %}
<span class="badge bg-success">已索引</span>
{% elif doc.status == 'processing' %}
<span class="badge bg-warning">处理中</span>
{% elif doc.status == 'failed' %}
<span class="badge bg-danger">失败</span>
{% else %}
<span class="badge bg-secondary">待索引</span>
{% endif %}
</div>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<table class="table table-sm">
<tr>
<th width="120">文件名</th>
<td>{{ doc.filename }}</td>
</tr>
<tr>
<th>标题</th>
<td>{{ doc.title or '-' }}</td>
</tr>
<tr>
<th>文件类型</th>
<td>{{ doc.file_type }}</td>
</tr>
<tr>
<th>文件大小</th>
<td>{{ (doc.file_size / 1024)|round(1) }} KB</td>
</tr>
</table>
</div>
<div class="col-md-6">
<table class="table table-sm">
<tr>
<th width="120">分块数</th>
<td>{{ doc.chunk_count }}</td>
</tr>
<tr>
<th>字数</th>
<td>{{ doc.word_count|default(0) }}</td>
</tr>
<tr>
<th>上传时间</th>
<td>{{ doc.created_at.strftime('%Y-%m-%d %H:%M') if doc.created_at else '-' }}</td>
</tr>
<tr>
<th>索引时间</th>
<td>{{ doc.indexed_at.strftime('%Y-%m-%d %H:%M') if doc.indexed_at else '-' }}</td>
</tr>
</table>
</div>
</div>
{% if doc.summary %}
<div class="mt-3">
<h6><i class="bi bi-card-text"></i> 文档摘要</h6>
<p class="text-muted">{{ doc.summary }}</p>
</div>
{% endif %}
{% if doc.keywords %}
<div class="mt-3">
<h6><i class="bi bi-tags"></i> 关键词</h6>
<div>
{% for kw in doc.get_keywords() %}
<span class="badge bg-primary me-1">{{ kw }}</span>
{% endfor %}
</div>
</div>
{% endif %}
{% if doc.category %}
<div class="mt-3">
<h6><i class="bi bi-folder"></i> 分类</h6>
<span class="badge bg-info">{{ doc.category }}</span>
</div>
{% endif %}
</div>
</div>
<!-- 文档分块 -->
<div class="card">
<div class="card-header">
<h5 class="mb-0"><i class="bi bi-puzzle"></i> 文档分块 ({{ chunks|length }} 个)</h5>
</div>
<div class="card-body">
{% if chunks %}
<div class="accordion" id="chunksAccordion">
{% for chunk in chunks %}
<div class="accordion-item">
<h2 class="accordion-header">
<button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#chunk{{ loop.index }}">
<strong>分块 {{ loop.index }}</strong>
<span class="ms-3 text-muted small">
{{ chunk.content[:50] }}...
</span>
</button>
</h2>
<div id="chunk{{ loop.index }}" class="accordion-collapse collapse" data-bs-parent="#chunksAccordion">
<div class="accordion-body">
{% if chunk.summary %}
<div class="alert alert-info small">
<strong>摘要:</strong> {{ chunk.summary }}
</div>
{% endif %}
{% if chunk.keywords %}
<div class="mb-2">
{% for kw in chunk.get_keywords() %}
<span class="badge bg-secondary me-1">{{ kw }}</span>
{% endfor %}
</div>
{% endif %}
<pre class="bg-light p-3 rounded" style="white-space: pre-wrap; word-wrap: break-word;">{{ chunk.content }}</pre>
</div>
</div>
</div>
{% endfor %}
</div>
{% else %}
<p class="text-muted text-center py-4">暂无分块数据,请先索引文档</p>
{% endif %}
</div>
</div>
<div class="mt-4">
<a href="/documents" class="btn btn-secondary">
<i class="bi bi-arrow-left"></i> 返回列表
</a>
{% if doc.status == 'pending' %}
<button class="btn btn-primary" onclick="indexDoc({{ doc.id }})">
<i class="bi bi-arrow-repeat"></i> 索引文档
</button>
{% endif %}
<button class="btn btn-danger" onclick="deleteDoc({{ doc.id }})">
<i class="bi bi-trash"></i> 删除文档
</button>
</div>
</div>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<script>
/**
 * Ask the backend to index one document after user confirmation.
 *
 * Sends POST /api/index/<id>, shows the outcome in an alert and reloads the
 * page on success. A rejected request (network failure, or a response body
 * that is not JSON) is reported to the user instead of failing silently.
 *
 * @param {number} id - Id of the document to index.
 */
function indexDoc(id) {
    if (!confirm('确定要索引此文档吗?')) return;
    fetch(`/api/index/${id}`, { method: 'POST' })
        .then(r => r.json())
        .then(data => {
            alert(data.success ? '索引完成' : '索引失败: ' + (data.error || '未知错误'));
            if (data.success) location.reload();
        })
        .catch(err => {
            // fetch() rejects on network errors and r.json() rejects on a
            // non-JSON body; surface both with the same failure message.
            alert('索引失败: ' + ((err && err.message) || '未知错误'));
        });
}
/**
 * Delete one document after user confirmation.
 *
 * Sends DELETE /api/documents/<id>. On success the user is told and sent
 * back to the document list; otherwise a failure alert is shown. A rejected
 * request (network failure, or a response body that is not JSON) also ends
 * in the failure alert instead of being dropped silently.
 *
 * @param {number} id - Id of the document to delete.
 */
function deleteDoc(id) {
    if (!confirm('确定要删除此文档吗?此操作不可恢复!')) return;
    fetch(`/api/documents/${id}`, { method: 'DELETE' })
        .then(r => r.json())
        .then(data => {
            if (data.success) {
                alert('删除成功');
                window.location.href = '/documents';
            } else {
                alert('删除失败');
            }
        })
        .catch(() => {
            // Request never completed or the reply was not JSON.
            alert('删除失败');
        });
}
</script>
</body>
</html>