diff --git a/.gitignore b/.gitignore index d1fba09..845f5ca 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,7 @@ instance/ *.db __pycache__/ *.pyc -.envdocuments/ +.env +documents/ indexes/ +user_config.json \ No newline at end of file diff --git a/app.py b/app.py index a94eb09..45a74af 100644 --- a/app.py +++ b/app.py @@ -13,6 +13,38 @@ from config import * from models import db, Document, DocumentChunk, InvertedIndex, QueryLog, IndexStats from services import DocumentIndexer, SearchEngine, RAGGenerator +# ==================== 配置文件路径 ==================== +CONFIG_FILE = os.path.join(os.path.dirname(__file__), 'config.py') +USER_CONFIG_FILE = os.path.join(os.path.dirname(__file__), 'user_config.json') + + +def load_user_config(): + """加载用户配置""" + if os.path.exists(USER_CONFIG_FILE): + with open(USER_CONFIG_FILE, 'r', encoding='utf-8') as f: + return json.load(f) + return {} + + +def save_user_config(config_type, config_data): + """保存用户配置""" + user_config = load_user_config() + user_config[config_type] = config_data + with open(USER_CONFIG_FILE, 'w', encoding='utf-8') as f: + json.dump(user_config, f, ensure_ascii=False, indent=2) + + +def get_effective_config(): + """获取有效配置(用户配置覆盖默认配置)""" + user_config = load_user_config() + + return { + 'llm': {**LLM_CONFIG, **user_config.get('llm', {})}, + 'index': {**INDEX_CONFIG, **user_config.get('index', {})}, + 'doc': {**DOC_CONFIG, **user_config.get('doc', {})} + } + + # ==================== 创建应用 ==================== app = Flask(__name__) app.config['SECRET_KEY'] = SECRET_KEY @@ -66,6 +98,13 @@ def search_page(): return render_template('search.html') +@app.route('/settings') +def settings_page(): + """设置页""" + config = get_effective_config() + return render_template('settings.html', config=config) + + # ==================== API路由 ==================== # === 文档管理 === @@ -315,6 +354,70 @@ def api_log_feedback(log_id): return jsonify({'success': True}) +# === 配置管理 === + +@app.route('/api/config', methods=['GET']) +def api_get_config(): + """获取当前配置""" + return jsonify(get_effective_config()) + + +@app.route('/api/config/llm', methods=['POST']) +def api_save_llm_config(): + """保存LLM配置""" + data = request.json + save_user_config('llm', data) + return jsonify({'success': True}) + + +@app.route('/api/config/index', methods=['POST']) +def api_save_index_config(): + """保存索引配置""" + data = request.json + save_user_config('index', data) + return jsonify({'success': True}) + + +@app.route('/api/config/doc', methods=['POST']) +def api_save_doc_config(): + """保存文档配置""" + data = request.json + save_user_config('doc', data) + return jsonify({'success': True}) + + +@app.route('/api/config/test', methods=['POST']) +def api_test_config(): + """测试LLM连接""" + config = get_effective_config() + llm_config = config['llm'] + + try: + from openai import OpenAI + client = OpenAI( + api_key=llm_config['api_key'], + base_url=llm_config['api_base'], + ) + + # 发送简单测试请求 + response = client.chat.completions.create( + model=llm_config['model'], + messages=[{"role": "user", "content": "Hello"}], + max_tokens=10 + ) + + return jsonify({ + 'success': True, + 'model': llm_config['model'], + 'response': response.choices[0].message.content + }) + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }) + + # ==================== 启动 ==================== if __name__ == '__main__': init_app() diff --git a/services.py b/services.py index dbbc039..159e4e6 100644 --- a/services.py +++ b/services.py @@ -16,17 +16,53 @@ from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG from models import db, Document, DocumentChunk, InvertedIndex, IndexStats +def get_llm_config(): + """获取有效的LLM配置(支持动态更新)""" + user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json') + if os.path.exists(user_config_file): + with open(user_config_file, 'r', encoding='utf-8') as f: + user_config = json.load(f) + return {**LLM_CONFIG, **user_config.get('llm', {})} + return LLM_CONFIG + + +def get_doc_config(): + """获取有效的文档配置""" + user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json') + if os.path.exists(user_config_file): + with open(user_config_file, 'r', encoding='utf-8') as f: + user_config = json.load(f) + return {**DOC_CONFIG, **user_config.get('doc', {})} + return DOC_CONFIG + + +def get_index_config(): + """获取有效的索引配置""" + user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json') + if os.path.exists(user_config_file): + with open(user_config_file, 'r', encoding='utf-8') as f: + user_config = json.load(f) + return {**INDEX_CONFIG, **user_config.get('index', {})} + return INDEX_CONFIG + + class LLMService: """LLM服务封装""" def __init__(self): - self.client = OpenAI( - api_key=LLM_CONFIG['api_key'], - base_url=LLM_CONFIG['api_base'], + pass # 不再在初始化时设置配置 + + def _get_client(self): + """获取LLM客户端""" + config = get_llm_config() + return OpenAI( + api_key=config['api_key'], + base_url=config['api_base'], ) - self.model = LLM_CONFIG['model'] - self.max_tokens = LLM_CONFIG['max_tokens'] - self.temperature = LLM_CONFIG['temperature'] + + def _get_config(self): + """获取当前配置""" + return get_llm_config() def analyze_document(self, content, title=None): """ @@ -60,8 +96,10 @@ class LLMService: 只返回JSON,不要其他内容。""" try: - response = self.client.chat.completions.create( - model=self.model, + config = self._get_config() + client = self._get_client() + response = client.chat.completions.create( + model=config['model'], messages=[{"role": "user", "content": prompt}], max_tokens=1000, temperature=0.3, @@ -106,8 +144,10 @@ class LLMService: 只返回JSON。""" try: - response = self.client.chat.completions.create( - model=self.model, + config = self._get_config() + client = self._get_client() + response = client.chat.completions.create( + model=config['model'], messages=[{"role": "user", "content": prompt}], max_tokens=500, temperature=0.3, @@ -150,8 +190,10 @@ class LLMService: 只返回JSON。""" try: - response = self.client.chat.completions.create( - model=self.model, + config = self._get_config() + client = self._get_client() + response = client.chat.completions.create( + model=config['model'], messages=[{"role": "user", "content": prompt}], max_tokens=500, temperature=0.3, @@ -177,8 +219,11 @@ class DocumentIndexer: def __init__(self): self.llm = LLMService() - self.chunk_size = DOC_CONFIG['chunk_size'] - self.chunk_overlap = DOC_CONFIG['chunk_overlap'] + + def _get_chunk_config(self): + """获取分块配置""" + config = get_doc_config() + return config['chunk_size'], config['chunk_overlap'] def index_document(self, doc_id): """ @@ -330,6 +375,7 @@ class DocumentIndexer: Returns: list: 内容块列表 """ + chunk_size, _ = self._get_chunk_config() chunks = [] # 按段落分割 @@ -337,7 +383,7 @@ class DocumentIndexer: current_chunk = "" for para in paragraphs: - if len(current_chunk) + len(para) < self.chunk_size: + if len(current_chunk) + len(para) < chunk_size: current_chunk += para + '\n\n' else: if current_chunk.strip(): @@ -347,7 +393,7 @@ class DocumentIndexer: if current_chunk.strip(): chunks.append(current_chunk.strip()) - return chunks if chunks else [content[:self.chunk_size]] + return chunks if chunks else [content[:chunk_size]] def _compute_term_freq(self, content): """计算词频""" @@ -424,8 +470,11 @@ class SearchEngine: def __init__(self): self.llm = LLMService() - self.k1 = INDEX_CONFIG['bm25_k1'] - self.b = INDEX_CONFIG['bm25_b'] + + def _get_bm25_params(self): + """获取BM25参数""" + config = get_index_config() + return config['bm25_k1'], config['bm25_b'] def search(self, query, top_k=10): """ @@ -542,6 +591,7 @@ class SearchEngine: continue # BM25计算 + k1, b = self._get_bm25_params() score = 0 doc_len = doc.word_count or 1000 @@ -557,8 +607,8 @@ class SearchEngine: tf = data['terms'].get(term, 0) # BM25公式 - tf_component = (tf * (self.k1 + 1)) / ( - tf + self.k1 * (1 - self.b + self.b * doc_len / avg_doc_len) + tf_component = (tf * (k1 + 1)) / ( + tf + k1 * (1 - b + b * doc_len / avg_doc_len) ) score += idf * tf_component @@ -653,13 +703,14 @@ class RAGGenerator: 请给出准确、简洁的回答,并标注信息来源。""" try: + llm_config = get_llm_config() client = OpenAI( - api_key=LLM_CONFIG['api_key'], - base_url=LLM_CONFIG['api_base'], + api_key=llm_config['api_key'], + base_url=llm_config['api_base'], ) response = client.chat.completions.create( - model=LLM_CONFIG['model'], + model=llm_config['model'], messages=[{"role": "user", "content": prompt}], max_tokens=1000, temperature=0.5, diff --git a/static/js/main.js b/static/js/main.js new file mode 100644 index 0000000..2e9efdb --- /dev/null +++ b/static/js/main.js @@ -0,0 +1,247 @@ +/** + * LLM Index RAG - 前端交互脚本 + */ + +// 搜索表单处理 +document.getElementById('searchForm')?.addEventListener('submit', async function(e) { + e.preventDefault(); + + const query = document.getElementById('queryInput').value.trim(); + const mode = document.querySelector('input[name="mode"]:checked').value; + + if (!query) { + alert('请输入查询内容'); + return; + } + + // 显示加载状态 + const resultsSection = document.getElementById('resultsSection'); + const ragSection = document.getElementById('ragSection'); + const resultsContainer = document.getElementById('resultsContainer'); + const ragAnswer = document.getElementById('ragAnswer'); + const ragSources = document.getElementById('ragSources'); + const resultCount = document.getElementById('resultCount'); + + resultsSection.style.display = 'none'; + ragSection.style.display = 'none'; + resultsContainer.innerHTML = '

正在检索...

'; + resultsSection.style.display = 'block'; + + try { + if (mode === 'search') { + // 文档检索模式 + const response = await fetch('/api/search', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({query: query, top_k: 10}) + }); + + const data = await response.json(); + + if (data.error) { + resultsContainer.innerHTML = `
${data.error}
`; + return; + } + + resultCount.textContent = data.total; + + if (data.results && data.results.length > 0) { + resultsContainer.innerHTML = data.results.map(r => ` +
+
+ ${r.title || r.document_title || '文档'} +
+

${r.summary || r.content?.substring(0, 200) + '...' || ''}

+
+ ${r.source || '本地文档'} + ${(r.score * 100).toFixed(1)}% +
+
+ `).join(''); + } else { + resultsContainer.innerHTML = ` +
+ +

未找到相关结果

+

请尝试其他关键词,或先上传并索引文档

+
+ `; + } + } else { + // RAG问答模式 + resultsContainer.innerHTML = '

正在生成回答...

'; + + const response = await fetch('/api/rag/answer', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({query: query, top_k: 5}) + }); + + const data = await response.json(); + + if (data.error) { + resultsContainer.innerHTML = `
${data.error}
`; + return; + } + + resultsSection.style.display = 'none'; + ragSection.style.display = 'block'; + + // 显示回答 + ragAnswer.innerHTML = ` +
+
+ +
+
+
${data.answer || '抱歉,无法生成回答。'}
+
+
+ `; + + // 显示来源 + if (data.sources && data.sources.length > 0) { + ragSources.innerHTML = data.sources.map(s => ` +
+
${s.title || s.document_title || '参考文档'}
+

${s.content?.substring(0, 150) + '...' || ''}

+
+ `).join(''); + } else { + ragSources.innerHTML = '

无参考来源

'; + } + } + } catch (err) { + resultsContainer.innerHTML = `
请求失败: ${err.message}
`; + } +}); + +// 文档上传 +document.getElementById('uploadForm')?.addEventListener('submit', async function(e) { + e.preventDefault(); + + const formData = new FormData(this); + const fileInput = document.getElementById('fileInput'); + + if (!fileInput.files.length) { + alert('请选择文件'); + return; + } + + const uploadBtn = document.getElementById('uploadBtn'); + uploadBtn.disabled = true; + uploadBtn.innerHTML = ' 上传中...'; + + try { + const response = await fetch('/api/documents', { + method: 'POST', + body: formData + }); + + const data = await response.json(); + + if (data.success) { + alert('上传成功!'); + location.reload(); + } else { + alert('上传失败: ' + (data.error || '未知错误')); + } + } catch (err) { + alert('上传失败: ' + err.message); + } finally { + uploadBtn.disabled = false; + uploadBtn.innerHTML = ' 上传'; + } +}); + +// 索引文档 +async function indexDocument(docId) { + if (!confirm('确定要索引此文档吗?这可能需要一些时间。')) return; + + try { + const response = await fetch(`/api/index/${docId}`, {method: 'POST'}); + const data = await response.json(); + + if (data.success) { + alert('索引完成!'); + location.reload(); + } else { + alert('索引失败: ' + (data.error || '未知错误')); + } + } catch (err) { + alert('索引失败: ' + err.message); + } +} + +// 删除文档 +async function deleteDocument(docId) { + if (!confirm('确定要删除此文档吗?此操作不可恢复。')) return; + + try { + const response = await fetch(`/api/documents/${docId}`, {method: 'DELETE'}); + const data = await response.json(); + + if (data.success) { + alert('删除成功!'); + location.reload(); + } else { + alert('删除失败: ' + (data.error || '未知错误')); + } + } catch (err) { + alert('删除失败: ' + err.message); + } +} + +// 批量索引 +async function batchIndex() { + if (!confirm('确定要索引所有待处理文档吗?')) return; + + try { + const response = await fetch('/api/index/batch', {method: 'POST'}); + const data = await response.json(); + + alert(`索引完成!成功: ${data.success}, 失败: ${data.failed}`); + location.reload(); + } catch (err) { + alert('批量索引失败: ' + err.message); + } +} + +// 重建索引 +async function rebuildIndex() { + if (!confirm('重建索引将清除所有现有索引,确定继续吗?')) return; + + try { + const response = await fetch('/api/index/rebuild', {method: 'POST'}); + const data = await response.json(); + + alert(`重建完成!成功: ${data.success}, 失败: ${data.failed}`); + location.reload(); + } catch (err) { + alert('重建索引失败: ' + err.message); + } +} + +// 加载统计信息 +async function loadStats() { + try { + const response = await fetch('/api/stats'); + const stats = await response.json(); + + document.getElementById('statDocs').textContent = stats.total_documents || 0; + document.getElementById('statChunks').textContent = stats.total_chunks || 0; + document.getElementById('statTerms').textContent = stats.total_terms || 0; + document.getElementById('statWords').textContent = (stats.total_words || 0).toLocaleString(); + } catch (err) { + console.error('加载统计失败:', err); + } +} + +// 页面加载时刷新统计 +document.addEventListener('DOMContentLoaded', function() { + // 如果在首页,定时刷新统计 + if (document.getElementById('statDocs')) { + loadStats(); + setInterval(loadStats, 30000); // 每30秒刷新 + } +}); \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 8dc96c6..7ffd769 100644 --- a/templates/index.html +++ b/templates/index.html @@ -28,6 +28,7 @@ 首页 文档管理 知识检索 + 系统设置 diff --git a/templates/settings.html b/templates/settings.html new file mode 100644 index 0000000..605474d --- /dev/null +++ b/templates/settings.html @@ -0,0 +1,284 @@ + + + + + + 系统设置 - LLM Index RAG + + + + + + + + +
+

系统设置

+ + +
+
+
大模型配置
+
+
+
+
+
+ + + LLM API的基础URL +
+
+ + + API密钥 +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+
+ + +
+
+
+
+
+ + +
+
+
索引配置
+
+
+
+
+
+ + + 词频饱和参数(推荐1.2-2.0) +
+
+ + + 文档长度归一化(推荐0.75) +
+
+ + +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+
+ + +
+
+
文档处理配置
+
+
+
+
+
+ + + 字符数 +
+
+ + +
+
+ + +
+
+ +
+
+
+
+ + + + + \ No newline at end of file