Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ab6469b9dd | |||
| bdbfa2a176 | |||
| 8baecc520a |
10
app.py
10
app.py
@@ -177,7 +177,7 @@ def api_upload_document():
|
||||
|
||||
@app.route('/api/documents/<int:doc_id>', methods=['GET'])
|
||||
def api_get_document(doc_id):
|
||||
"""获取文档详情"""
|
||||
"""获取文档详情API"""
|
||||
doc = Document.query.get_or_404(doc_id)
|
||||
|
||||
chunks = DocumentChunk.query.filter_by(document_id=doc_id).all()
|
||||
@@ -188,6 +188,14 @@ def api_get_document(doc_id):
|
||||
})
|
||||
|
||||
|
||||
@app.route('/documents/<int:doc_id>')
def document_detail_page(doc_id):
    """Render the HTML detail page for one document.

    The document is looked up with ``get_or_404``; its chunks are loaded
    sorted by ``chunk_index`` and both are handed to the
    ``document_detail.html`` template as ``doc`` and ``chunks``.
    """
    doc = Document.query.get_or_404(doc_id)

    # Build the chunk query step by step so the ordering is easy to spot.
    chunk_query = DocumentChunk.query.filter_by(document_id=doc_id)
    ordered_chunks = chunk_query.order_by(DocumentChunk.chunk_index).all()

    return render_template('document_detail.html', doc=doc, chunks=ordered_chunks)
|
||||
|
||||
|
||||
@app.route('/api/documents/<int:doc_id>', methods=['DELETE'])
|
||||
def api_delete_document(doc_id):
|
||||
"""删除文档"""
|
||||
|
||||
@@ -33,6 +33,12 @@ DOC_CONFIG = {
|
||||
"max_summary_length": 500, # 摘要最大长度
|
||||
}
|
||||
|
||||
# ==================== 索引模式 ====================
|
||||
# use_llm_index: 是否使用LLM增强索引
|
||||
# True - 使用LLM分析文档,提取关键词、摘要等(需要LLM服务,速度慢)
|
||||
# False - 使用简单分词和词频统计(速度快,无需LLM)
|
||||
USE_LLM_INDEX = False # 默认关闭LLM索引
|
||||
|
||||
# ==================== 索引配置 ====================
|
||||
INDEX_CONFIG = {
|
||||
# BM25参数
|
||||
|
||||
@@ -92,6 +92,8 @@ class Document(db.Model):
|
||||
return {
|
||||
'id': self.id,
|
||||
'filename': self.filename,
|
||||
'file_type': self.file_type,
|
||||
'file_size': self.file_size,
|
||||
'title': self.title,
|
||||
'status': self.status,
|
||||
'summary': self.summary,
|
||||
|
||||
111
services.py
111
services.py
@@ -12,7 +12,7 @@ from collections import Counter
|
||||
from openai import OpenAI
|
||||
from flask import current_app
|
||||
|
||||
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
|
||||
from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG, USE_LLM_INDEX
|
||||
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog
|
||||
|
||||
|
||||
@@ -252,15 +252,25 @@ class DocumentIndexer:
|
||||
doc.content = content
|
||||
doc.word_count = len(content)
|
||||
|
||||
# 使用LLM分析整个文档
|
||||
print(f" 正在分析文档: {doc.filename}")
|
||||
analysis = self.llm.analyze_document(content, doc.title)
|
||||
# 检查是否使用LLM增强索引
|
||||
use_llm = USE_LLM_INDEX
|
||||
|
||||
doc.summary = analysis.get('summary', '')
|
||||
doc.set_keywords(analysis.get('keywords', []))
|
||||
doc.set_topics(analysis.get('topics', []))
|
||||
doc.category = analysis.get('category', '')
|
||||
doc.set_entities(analysis.get('entities', {}))
|
||||
if use_llm:
|
||||
# 使用LLM分析整个文档
|
||||
print(f" 正在使用LLM分析文档: {doc.filename}")
|
||||
analysis = self.llm.analyze_document(content, doc.title)
|
||||
|
||||
doc.summary = analysis.get('summary', '')
|
||||
doc.set_keywords(analysis.get('keywords', []))
|
||||
doc.set_topics(analysis.get('topics', []))
|
||||
doc.category = analysis.get('category', '')
|
||||
doc.set_entities(analysis.get('entities', {}))
|
||||
else:
|
||||
# 简单模式:不使用LLM
|
||||
print(f" 正在索引文档(简单模式): {doc.filename}")
|
||||
# 从内容中提取简单关键词
|
||||
simple_keywords = self._extract_simple_keywords(content)
|
||||
doc.set_keywords(simple_keywords[:20])
|
||||
|
||||
# 分块处理
|
||||
chunks = self._split_content(content)
|
||||
@@ -279,10 +289,14 @@ class DocumentIndexer:
|
||||
end_char=len(chunk_content)
|
||||
)
|
||||
|
||||
# LLM分析分块
|
||||
chunk_analysis = self.llm.analyze_chunk(chunk_content)
|
||||
chunk.summary = chunk_analysis.get('summary', '')
|
||||
chunk.set_keywords(chunk_analysis.get('keywords', []))
|
||||
if use_llm:
|
||||
# LLM分析分块
|
||||
chunk_analysis = self.llm.analyze_chunk(chunk_content)
|
||||
chunk.summary = chunk_analysis.get('summary', '')
|
||||
chunk.set_keywords(chunk_analysis.get('keywords', []))
|
||||
else:
|
||||
# 简单模式:从分块提取关键词
|
||||
chunk.set_keywords(self._extract_simple_keywords(chunk_content)[:10])
|
||||
|
||||
# 计算词频
|
||||
term_freq = self._compute_term_freq(chunk_content)
|
||||
@@ -313,23 +327,70 @@ class DocumentIndexer:
|
||||
print(f" ✗ 索引失败: {e}")
|
||||
return False
|
||||
|
||||
def _extract_simple_keywords(self, content):
|
||||
"""简单提取关键词(不使用LLM)"""
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# 提取中文词组(2-4字)
|
||||
chinese_words = re.findall(r'[\u4e00-\u9fff]{2,4}', content)
|
||||
|
||||
# 过滤常见无意义词
|
||||
stopwords = {'的', '是', '在', '了', '和', '与', '或', '等', '这', '那',
|
||||
'有', '为', '对', '也', '被', '把', '给', '向', '从', '到',
|
||||
'一个', '一些', '这种', '那种', '什么', '怎么', '如何', '为什么'}
|
||||
|
||||
# 统计词频
|
||||
word_freq = Counter(w for w in chinese_words if w not in stopwords)
|
||||
|
||||
# 返回高频词
|
||||
return [w for w, _ in word_freq.most_common(30)]
|
||||
|
||||
def _read_document(self, filepath):
|
||||
"""读取文档内容"""
|
||||
ext = os.path.splitext(filepath)[1].lower()
|
||||
|
||||
# 尝试读取文本文件(包括没有扩展名的)
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
if content.strip(): # 如果能读取到内容
|
||||
return content
|
||||
except:
|
||||
pass
|
||||
# 尝试多种编码读取文本文件
|
||||
encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5', 'utf-16', 'latin-1']
|
||||
|
||||
# 按扩展名处理特定格式
|
||||
if ext in ['.txt', '.md', '.json', '.html']:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
return f.read()
|
||||
def try_read_with_encoding(encodings_list):
    """Decode the file with each encoding in turn and return the first usable result.

    Reads ``filepath`` from the enclosing scope. An encoding is accepted when
    the file decodes without a Unicode error and the text is not blank.

    Returns:
        ``(text, encoding)`` for the first accepted encoding, or
        ``(None, None)`` when none of them produced non-blank text.
    """
    for candidate in encodings_list:
        try:
            with open(filepath, 'r', encoding=candidate) as handle:
                text = handle.read()
        except (UnicodeDecodeError, UnicodeError):
            # This encoding cannot decode the file; move on to the next one.
            continue

        # A non-blank strip() already implies a non-zero length.
        # No further mojibake validation is performed here.
        if text.strip():
            return text, candidate

    return None, None
|
||||
|
||||
# 按扩展名处理
|
||||
if ext in ['.txt', '.md', '.json', '.html', '']:
|
||||
# 先尝试常见编码
|
||||
content, used_enc = try_read_with_encoding(encodings)
|
||||
if content:
|
||||
print(f" 使用编码 {used_enc} 读取文件")
|
||||
return content
|
||||
|
||||
# 如果都失败,尝试二进制读取后解码
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
raw = f.read()
|
||||
# 尝试chardet检测编码
|
||||
try:
|
||||
import chardet
|
||||
detected = chardet.detect(raw)
|
||||
if detected['encoding']:
|
||||
return raw.decode(detected['encoding'])
|
||||
except:
|
||||
pass
|
||||
# 最后尝试
|
||||
return raw.decode('utf-8', errors='replace')
|
||||
except:
|
||||
return ""
|
||||
|
||||
elif ext == '.pdf':
|
||||
try:
|
||||
|
||||
212
templates/document_detail.html
Normal file
212
templates/document_detail.html
Normal file
@@ -0,0 +1,212 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{{ doc.title or doc.filename }} - 文档详情</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.0/font/bootstrap-icons.css" rel="stylesheet">
|
||||
<style>
|
||||
body { background-color: #f8f9fa; }
|
||||
.chunk-card { border-left: 4px solid #667eea; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<nav class="navbar navbar-expand-lg navbar-dark bg-dark">
|
||||
<div class="container">
|
||||
<a class="navbar-brand" href="/"><i class="bi bi-search"></i> LLM Index RAG</a>
|
||||
<div class="navbar-nav ms-auto">
|
||||
<a class="nav-link" href="/">首页</a>
|
||||
<a class="nav-link" href="/documents">文档管理</a>
|
||||
<a class="nav-link" href="/search">知识检索</a>
|
||||
<a class="nav-link" href="/settings">系统设置</a>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<div class="container py-4">
|
||||
<nav aria-label="breadcrumb" class="mb-3">
|
||||
<ol class="breadcrumb">
|
||||
<li class="breadcrumb-item"><a href="/documents">文档管理</a></li>
|
||||
<li class="breadcrumb-item active">{{ doc.title or doc.filename }}</li>
|
||||
</ol>
|
||||
</nav>
|
||||
|
||||
<!-- 文档信息 -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5 class="mb-0"><i class="bi bi-file-earmark-text"></i> 文档信息</h5>
|
||||
<div>
|
||||
{% if doc.status == 'indexed' %}
|
||||
<span class="badge bg-success">已索引</span>
|
||||
{% elif doc.status == 'processing' %}
|
||||
<span class="badge bg-warning">处理中</span>
|
||||
{% elif doc.status == 'failed' %}
|
||||
<span class="badge bg-danger">失败</span>
|
||||
{% else %}
|
||||
<span class="badge bg-secondary">待索引</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<table class="table table-sm">
|
||||
<tr>
|
||||
<th width="120">文件名</th>
|
||||
<td>{{ doc.filename }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>标题</th>
|
||||
<td>{{ doc.title or '-' }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>文件类型</th>
|
||||
<td>{{ doc.file_type }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>文件大小</th>
|
||||
<td>{{ (doc.file_size / 1024)|round(1) }} KB</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<table class="table table-sm">
|
||||
<tr>
|
||||
<th width="120">分块数</th>
|
||||
<td>{{ doc.chunk_count }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>字数</th>
|
||||
<td>{{ doc.word_count|default(0) }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>上传时间</th>
|
||||
<td>{{ doc.created_at.strftime('%Y-%m-%d %H:%M') if doc.created_at else '-' }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>索引时间</th>
|
||||
<td>{{ doc.indexed_at.strftime('%Y-%m-%d %H:%M') if doc.indexed_at else '-' }}</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if doc.summary %}
|
||||
<div class="mt-3">
|
||||
<h6><i class="bi bi-card-text"></i> 文档摘要</h6>
|
||||
<p class="text-muted">{{ doc.summary }}</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if doc.keywords %}
|
||||
<div class="mt-3">
|
||||
<h6><i class="bi bi-tags"></i> 关键词</h6>
|
||||
<div>
|
||||
{% for kw in doc.get_keywords() %}
|
||||
<span class="badge bg-primary me-1">{{ kw }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if doc.category %}
|
||||
<div class="mt-3">
|
||||
<h6><i class="bi bi-folder"></i> 分类</h6>
|
||||
<span class="badge bg-info">{{ doc.category }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 文档分块 -->
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5 class="mb-0"><i class="bi bi-puzzle"></i> 文档分块 ({{ chunks|length }} 个)</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
{% if chunks %}
|
||||
<div class="accordion" id="chunksAccordion">
|
||||
{% for chunk in chunks %}
|
||||
<div class="accordion-item">
|
||||
<h2 class="accordion-header">
|
||||
<button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#chunk{{ loop.index }}">
|
||||
<strong>分块 {{ loop.index }}</strong>
|
||||
<span class="ms-3 text-muted small">
|
||||
{{ chunk.content[:50] }}...
|
||||
</span>
|
||||
</button>
|
||||
</h2>
|
||||
<div id="chunk{{ loop.index }}" class="accordion-collapse collapse" data-bs-parent="#chunksAccordion">
|
||||
<div class="accordion-body">
|
||||
{% if chunk.summary %}
|
||||
<div class="alert alert-info small">
|
||||
<strong>摘要:</strong> {{ chunk.summary }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if chunk.keywords %}
|
||||
<div class="mb-2">
|
||||
{% for kw in chunk.get_keywords() %}
|
||||
<span class="badge bg-secondary me-1">{{ kw }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<pre class="bg-light p-3 rounded" style="white-space: pre-wrap; word-wrap: break-word;">{{ chunk.content }}</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% else %}
|
||||
<p class="text-muted text-center py-4">暂无分块数据,请先索引文档</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="mt-4">
|
||||
<a href="/documents" class="btn btn-secondary">
|
||||
<i class="bi bi-arrow-left"></i> 返回列表
|
||||
</a>
|
||||
{% if doc.status == 'pending' %}
|
||||
<button class="btn btn-primary" onclick="indexDoc({{ doc.id }})">
|
||||
<i class="bi bi-arrow-repeat"></i> 索引文档
|
||||
</button>
|
||||
{% endif %}
|
||||
<button class="btn btn-danger" onclick="deleteDoc({{ doc.id }})">
|
||||
<i class="bi bi-trash"></i> 删除文档
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<script>
|
||||
/**
 * Ask the server to index one document and report the outcome to the user.
 *
 * After a confirmation prompt, POSTs to `/api/index/<id>`, alerts the result
 * and reloads the page when the response reports success.
 *
 * @param {number} id - Document id interpolated into the request URL.
 */
function indexDoc(id) {
    if (!confirm('确定要索引此文档吗?')) return;

    fetch(`/api/index/${id}`, { method: 'POST' })
        .then(r => r.json())
        .then(data => {
            alert(data.success ? '索引完成' : '索引失败: ' + (data.error || '未知错误'));
            if (data.success) location.reload();
        })
        // A network failure or a non-JSON body rejects the chain; without
        // this handler the user would get no feedback at all.
        .catch(err => {
            alert('索引失败: ' + ((err && err.message) || '未知错误'));
        });
}
|
||||
|
||||
/**
 * Delete one document after confirmation and return to the document list.
 *
 * Sends DELETE to `/api/documents/<id>`. On success the user is alerted and
 * redirected to `/documents`; otherwise a failure alert is shown.
 *
 * @param {number} id - Document id interpolated into the request URL.
 */
function deleteDoc(id) {
    if (!confirm('确定要删除此文档吗?此操作不可恢复!')) return;

    fetch(`/api/documents/${id}`, { method: 'DELETE' })
        .then(r => r.json())
        .then(data => {
            if (data.success) {
                alert('删除成功');
                window.location.href = '/documents';
            } else {
                alert('删除失败');
            }
        })
        // A network failure or a non-JSON body rejects the chain; report it
        // instead of failing silently.
        .catch(err => {
            alert('删除失败: ' + ((err && err.message) || '未知错误'));
        });
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user