"""
|
||
文档索引服务
|
||
使用LLM分析文档并构建索引
|
||
"""
|
||
|
||
import os
import re
import json
import math
from datetime import datetime
from collections import Counter
from openai import OpenAI
from flask import current_app

from config import LLM_CONFIG, DOC_CONFIG, INDEX_CONFIG
from models import db, Document, DocumentChunk, InvertedIndex, IndexStats, QueryLog


def get_llm_config():
    """Return the effective LLM configuration (user overrides applied dynamically)."""
    user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json')
    if os.path.exists(user_config_file):
        with open(user_config_file, 'r', encoding='utf-8') as f:
            user_config = json.load(f)
        return {**LLM_CONFIG, **user_config.get('llm', {})}
    return LLM_CONFIG


def get_doc_config():
    """Return the effective document-processing configuration."""
    user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json')
    if os.path.exists(user_config_file):
        with open(user_config_file, 'r', encoding='utf-8') as f:
            user_config = json.load(f)
        return {**DOC_CONFIG, **user_config.get('doc', {})}
    return DOC_CONFIG


def get_index_config():
    """Return the effective index configuration."""
    user_config_file = os.path.join(os.path.dirname(__file__), 'user_config.json')
    if os.path.exists(user_config_file):
        with open(user_config_file, 'r', encoding='utf-8') as f:
            user_config = json.load(f)
        return {**INDEX_CONFIG, **user_config.get('index', {})}
    return INDEX_CONFIG
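
# A minimal sketch of the optional user_config.json these getters look for. The
# shape below is an assumption for illustration (the key names match how the
# merged configs are used elsewhere in this module; the values are placeholders):
#   {
#     "llm":   {"api_key": "sk-...", "api_base": "https://api.example.com/v1", "model": "gpt-4o-mini"},
#     "doc":   {"chunk_size": 800, "chunk_overlap": 100},
#     "index": {"bm25_k1": 1.5, "bm25_b": 0.75, "keyword_weight": 2.0, "content_weight": 1.0}
#   }
# Each section overrides the matching defaults from config.py.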


class LLMService:
    """Thin wrapper around the LLM API."""

    def __init__(self):
        pass  # Configuration is resolved per call, not at construction time.

    def _get_client(self):
        """Build an OpenAI-compatible client from the current configuration."""
        config = get_llm_config()
        return OpenAI(
            api_key=config['api_key'],
            base_url=config['api_base'],
        )

    def _get_config(self):
        """Return the current LLM configuration."""
        return get_llm_config()

    def analyze_document(self, content, title=None):
        """
        Analyze a document and extract its key information.

        Returns:
            dict: {summary, keywords, topics, entities, category}
        """
        prompt = f"""Analyze the following document and extract its key information.

{'Document title: ' + title if title else ''}

Document content (first 3000 characters):
{content[:3000]}

Return the following information as JSON:
{{
    "summary": "document summary (100-200 characters)",
    "keywords": ["keyword1", "keyword2", ... up to 10],
    "topics": ["topic1", "topic2", ... up to 5],
    "category": "document category",
    "entities": {{
        "persons": ["person names"],
        "organizations": ["organization names"],
        "locations": ["locations"],
        "dates": ["dates"],
        "others": ["other entities"]
    }}
}}

Return only the JSON, nothing else."""

        try:
            config = self._get_config()
            client = self._get_client()
            response = client.chat.completions.create(
                model=config['model'],
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000,
                temperature=0.3,
            )

            result = response.choices[0].message.content.strip()
            # Strip a possible markdown code fence around the JSON
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

            return json.loads(result)

        except Exception as e:
            print(f"LLM analysis failed: {e}")
            return {
                "summary": "",
                "keywords": [],
                "topics": [],
                "category": "",
                "entities": {}
            }

    def analyze_chunk(self, content):
        """
        Analyze a document chunk and extract its keywords.

        Returns:
            dict: {summary, keywords, topics}
        """
        prompt = f"""Analyze the following text fragment and extract its key information.

Text:
{content[:1500]}

Return as JSON:
{{
    "summary": "fragment summary (within 50 characters)",
    "keywords": ["keyword", ... up to 8],
    "topics": ["topic", ... up to 3]
}}

Return only the JSON."""

        try:
            config = self._get_config()
            client = self._get_client()
            response = client.chat.completions.create(
                model=config['model'],
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.3,
            )

            result = response.choices[0].message.content.strip()
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

            return json.loads(result)

        except Exception:
            return {"summary": "", "keywords": [], "topics": []}

    def process_query(self, query):
        """
        Process a query, extracting intent and keywords.

        Returns:
            dict: {intent, keywords, expanded_terms, entities}
        """
        prompt = f"""Analyze the following query and extract the search intent and keywords.

Query: {query}

Return as JSON:
{{
    "intent": "query intent (e.g. look up information, compare, explain, list)",
    "keywords": ["main keyword", ... up to 10],
    "expanded_terms": ["synonyms or related terms", ... up to 5],
    "entities": {{
        "persons": [],
        "organizations": [],
        "locations": [],
        "dates": [],
        "others": []
    }}
}}

Return only the JSON."""

        try:
            config = self._get_config()
            client = self._get_client()
            response = client.chat.completions.create(
                model=config['model'],
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.3,
            )

            result = response.choices[0].message.content.strip()
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

            return json.loads(result)

        except Exception:
            return {
                "intent": "search",
                "keywords": query.split(),
                "expanded_terms": [],
                "entities": {}
            }
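
# Hypothetical usage sketch for LLMService (assumes a valid API key and base URL
# in LLM_CONFIG or user_config.json; `text` stands for any document string):
#   llm = LLMService()
#   info = llm.analyze_document(text, title="Quarterly report")
#   print(info["summary"], info["keywords"])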


class DocumentIndexer:
    """Builds the index for uploaded documents."""

    def __init__(self):
        self.llm = LLMService()

    def _get_chunk_config(self):
        """Return the chunking configuration (size, overlap)."""
        config = get_doc_config()
        return config['chunk_size'], config['chunk_overlap']

    def index_document(self, doc_id):
        """
        Index a single document.

        Args:
            doc_id: document ID

        Returns:
            bool: True on success
        """
        doc = Document.query.get(doc_id)
        if not doc:
            return False

        try:
            doc.status = 'processing'
            db.session.commit()

            # Read the document content
            content = self._read_document(doc.filepath)
            if not content:
                raise Exception("Unable to read document content")

            # Store the raw text
            doc.content = content
            doc.word_count = len(content)  # character count, used as document length

            # Analyze the whole document with the LLM
            print(f"  Analyzing document: {doc.filename}")
            analysis = self.llm.analyze_document(content, doc.title)

            doc.summary = analysis.get('summary', '')
            doc.set_keywords(analysis.get('keywords', []))
            doc.set_topics(analysis.get('topics', []))
            doc.category = analysis.get('category', '')
            doc.set_entities(analysis.get('entities', {}))

            # Split into chunks
            chunks = self._split_content(content)
            doc.chunk_count = len(chunks)

            # Remove chunks from any previous indexing run
            DocumentChunk.query.filter_by(document_id=doc.id).delete()

            # Index each chunk
            for i, chunk_content in enumerate(chunks):
                chunk = DocumentChunk(
                    document_id=doc.id,
                    chunk_index=i,
                    content=chunk_content,
                    start_char=0,
                    end_char=len(chunk_content)
                )

                # Analyze the chunk with the LLM
                chunk_analysis = self.llm.analyze_chunk(chunk_content)
                chunk.summary = chunk_analysis.get('summary', '')
                chunk.set_keywords(chunk_analysis.get('keywords', []))

                # Compute term frequencies
                term_freq = self._compute_term_freq(chunk_content)
                chunk.set_term_freq(term_freq)

                db.session.add(chunk)

            db.session.commit()

            # Update the inverted index
            self._update_inverted_index(doc.id)

            # Mark as done
            doc.status = 'indexed'
            doc.indexed_at = datetime.utcnow()
            db.session.commit()

            # Refresh index statistics
            IndexStats.get_stats().update_stats()

            print(f"  ✓ Document indexed: {doc.filename}")
            return True

        except Exception as e:
            doc.status = 'failed'
            doc.error_message = str(e)
            db.session.commit()
            print(f"  ✗ Indexing failed: {e}")
            return False

    def _read_document(self, filepath):
        """Read the document content from disk."""
        ext = os.path.splitext(filepath)[1].lower()

        # Try reading as plain text (this also covers files without an extension),
        # but skip formats that need a dedicated parser.
        if ext not in ('.pdf', '.docx'):
            try:
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                if content.strip():  # readable content found
                    return content
            except Exception:
                pass

        # Handle specific formats by extension
        if ext in ['.txt', '.md', '.json', '.html']:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()

        elif ext == '.pdf':
            try:
                from pypdf import PdfReader
                reader = PdfReader(filepath)
                text = ''
                for page in reader.pages:
                    text += page.extract_text() + '\n'
                return text
            except Exception:
                pass

        elif ext == '.docx':
            try:
                from docx import Document as DocxDocument
                doc = DocxDocument(filepath)
                return '\n'.join([p.text for p in doc.paragraphs])
            except Exception:
                pass

        # Last resort: read raw bytes and try several encodings
        try:
            with open(filepath, 'rb') as f:
                content = f.read()
            for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1']:
                try:
                    return content.decode(encoding)
                except UnicodeDecodeError:
                    continue
        except Exception:
            pass

        return None

    def _split_content(self, content):
        """
        Split the content into chunks.

        Args:
            content: document text

        Returns:
            list: list of content chunks
        """
        chunk_size, _ = self._get_chunk_config()  # chunk_overlap is currently unused
        chunks = []

        # Split on paragraph boundaries
        paragraphs = content.split('\n\n')

        current_chunk = ""
        for para in paragraphs:
            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += para + '\n\n'
            else:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = para + '\n\n'

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks if chunks else [content[:chunk_size]]
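
    # Worked example of the greedy paragraph packing above (illustrative numbers):
    # with chunk_size=1000 and three paragraphs of ~400 characters each, the first
    # two paragraphs are packed into one chunk (~800 chars) and the third starts a
    # new chunk, giving chunks of roughly 800 and 400 characters.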

    def _compute_term_freq(self, content):
        """Compute term frequencies for a chunk."""
        # Simple tokenization for mixed Chinese/English text:
        # Chinese runs become character bigrams plus single characters,
        # English is split on word boundaries.
        terms = []

        # Extract Chinese runs (naive bigrams; a real tokenizer such as jieba
        # could be used instead)
        chinese = re.findall(r'[\u4e00-\u9fff]+', content)
        for text in chinese:
            # Simple bigram segmentation
            if len(text) >= 2:
                for i in range(len(text) - 1):
                    terms.append(text[i:i+2])
            terms.extend(list(text))

        # Extract English words
        english = re.findall(r'[a-zA-Z]+', content.lower())
        terms.extend(english)

        # Count term frequencies
        return dict(Counter(terms))
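
    # Example of the tokenization above: _compute_term_freq("机器学习 is fun") returns
    # {'机器': 1, '器学': 1, '学习': 1, '机': 1, '器': 1, '学': 1, '习': 1, 'is': 1, 'fun': 1}.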

    def _update_inverted_index(self, doc_id):
        """Update the inverted index for one document."""
        chunks = DocumentChunk.query.filter_by(document_id=doc_id).all()

        # Collect all terms and their postings
        term_postings = {}

        index_config = get_index_config()

        for chunk in chunks:
            # Terms from the term-frequency map
            tf = chunk.get_term_freq()

            # Terms from the LLM-extracted keywords
            keywords = chunk.get_keywords()

            for kw in keywords:
                if kw not in term_postings:
                    term_postings[kw] = []
                term_postings[kw].append({
                    'doc_id': doc_id,
                    'chunk_id': chunk.id,
                    'tf': tf.get(kw, 1),
                    'weight': index_config['keyword_weight']
                })

            for term, freq in tf.items():
                if term not in term_postings:
                    term_postings[term] = []
                term_postings[term].append({
                    'doc_id': doc_id,
                    'chunk_id': chunk.id,
                    'tf': freq,
                    'weight': index_config['content_weight']
                })

        # Write back to the database
        for term, postings in term_postings.items():
            index = InvertedIndex.get_or_create(term)
            existing = index.get_postings()

            # Merge postings, dropping stale entries for this document
            existing = [p for p in existing if p['doc_id'] != doc_id]
            existing.extend(postings)

            index.set_postings(existing)

        db.session.commit()
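
# Hypothetical usage sketch for DocumentIndexer (assumes a Flask application
# context is active and a Document row with this ID already exists; `app` refers
# to whatever Flask app object the project creates):
#   with app.app_context():
#       ok = DocumentIndexer().index_document(doc_id=1)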


class SearchEngine:
    """Keyword search engine over the inverted index."""

    def __init__(self):
        self.llm = LLMService()

    def _get_bm25_params(self):
        """Return the BM25 parameters (k1, b)."""
        config = get_index_config()
        return config['bm25_k1'], config['bm25_b']

    def search(self, query, top_k=10):
        """
        Search the indexed documents.

        Args:
            query: query string
            top_k: number of results to return

        Returns:
            list: search results [{doc, score, matched_chunks, matched_terms}]
        """
        start_time = datetime.now()

        # 1. Process the query with the LLM
        print(f"Processing query: {query}")
        query_analysis = self.llm.process_query(query)

        keywords = query_analysis.get('keywords', [])
        expanded = query_analysis.get('expanded_terms', [])

        # Merge keywords and expansion terms
        all_terms = keywords + expanded

        print(f"  Keywords: {keywords}")
        print(f"  Expanded terms: {expanded}")

        # 2. Retrieve candidate documents
        results = self._retrieve(all_terms)

        # 3. Score with BM25
        scored_results = self._score_results(results, all_terms)

        # 4. Sort by score
        scored_results.sort(key=lambda x: x['score'], reverse=True)

        # 5. Keep the top_k results
        final_results = scored_results[:top_k]

        retrieval_time = (datetime.now() - start_time).total_seconds()

        # 6. Log the query
        self._log_query(query, query_analysis, final_results, retrieval_time)

        return final_results

    def _retrieve(self, terms):
        """
        Retrieve documents containing the query terms.

        Args:
            terms: list of query terms

        Returns:
            dict: {doc_id: {chunks: set, terms: dict, postings: list}}
        """
        results = {}

        for term in terms:
            # Look the term up in the inverted index (substring match)
            index = InvertedIndex.query.filter(
                InvertedIndex.term.ilike(f'%{term}%')
            ).all()

            for idx in index:
                postings = idx.get_postings()

                for p in postings:
                    doc_id = p['doc_id']

                    if doc_id not in results:
                        results[doc_id] = {
                            'chunks': set(),
                            'terms': {},
                            'postings': []
                        }

                    results[doc_id]['chunks'].add(p['chunk_id'])
                    results[doc_id]['terms'][term] = results[doc_id]['terms'].get(term, 0) + p.get('tf', 1)
                    results[doc_id]['postings'].append({
                        'term': term,
                        'chunk_id': p['chunk_id'],
                        'tf': p.get('tf', 1),
                        'weight': p.get('weight', 1.0)
                    })

        return results

    def _score_results(self, results, query_terms):
        """
        Score retrieved documents with BM25.

        Args:
            results: retrieval results from _retrieve
            query_terms: query terms

        Returns:
            list: [{doc, score, matched_chunks, matched_terms}]
        """
        scored = []

        # Average document length over the indexed collection
        total_docs = Document.query.filter_by(status='indexed').count()
        if total_docs == 0:
            return []

        avg_doc_len = db.session.query(
            db.func.avg(Document.word_count)
        ).filter(Document.status == 'indexed').scalar() or 1000

        for doc_id, data in results.items():
            doc = Document.query.get(doc_id)
            if not doc or doc.status != 'indexed':
                continue

            # BM25 scoring
            k1, b = self._get_bm25_params()
            score = 0
            doc_len = doc.word_count or 1000

            for term in query_terms:
                # Document frequency from the inverted index
                index = InvertedIndex.query.filter_by(term=term).first()
                df = index.doc_freq if index else 1

                # IDF
                idf = math.log((total_docs - df + 0.5) / (df + 0.5) + 1)

                # TF
                tf = data['terms'].get(term, 0)

                # BM25 term score
                tf_component = (tf * (k1 + 1)) / (
                    tf + k1 * (1 - b + b * doc_len / avg_doc_len)
                )

                score += idf * tf_component

            # Fetch the matched chunk contents
            chunk_ids = list(data['chunks'])[:3]  # at most 3 chunks
            chunks = DocumentChunk.query.filter(DocumentChunk.id.in_(chunk_ids)).all()

            scored.append({
                'doc': doc.to_dict(),
                'score': score,
                'matched_chunks': [c.to_dict() for c in chunks],
                'matched_terms': list(data['terms'].keys())
            })

        return scored
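
    # Worked example of the BM25 term score above (illustrative numbers): with
    # k1=1.5, b=0.75, total_docs=10, df=2, tf=3 and doc_len == avg_doc_len,
    #   idf          = ln((10 - 2 + 0.5) / (2 + 0.5) + 1) = ln(4.4) ≈ 1.48
    #   tf_component = 3 * (1.5 + 1) / (3 + 1.5 * (1 - 0.75 + 0.75 * 1)) = 7.5 / 4.5 ≈ 1.67
    # so this term contributes about 1.48 * 1.67 ≈ 2.47 to the document score.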

    def _log_query(self, query, analysis, results, retrieval_time):
        """Record the query in the query log."""
        log = QueryLog(
            original_query=query,
            processed_query=' '.join(analysis.get('keywords', [])),
            expanded_terms=json.dumps(analysis.get('expanded_terms', [])),
            intent=analysis.get('intent'),
            entities=json.dumps(analysis.get('entities', {})),
            result_count=len(results),
            top_doc_ids=json.dumps([r['doc']['id'] for r in results[:5]]),
            retrieval_time=retrieval_time,
            total_time=retrieval_time
        )
        db.session.add(log)
        db.session.commit()
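
# Hypothetical usage sketch for SearchEngine (assumes a Flask application context
# and at least one indexed document):
#   with app.app_context():
#       for hit in SearchEngine().search("machine learning", top_k=5):
#           print(hit['score'], hit['doc']['filename'], hit['matched_terms'])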


class RAGGenerator:
    """Retrieval-augmented generation (RAG) answerer."""

    def __init__(self):
        self.llm = LLMService()
        self.search_engine = SearchEngine()

    def answer(self, query, top_k=5):
        """
        Answer a question with RAG.

        Args:
            query: user query
            top_k: number of documents to retrieve

        Returns:
            dict: {answer, sources, confidence}
        """
        # 1. Retrieve relevant documents
        results = self.search_engine.search(query, top_k)

        if not results:
            return {
                'answer': 'Sorry, no relevant information was found.',
                'sources': [],
                'confidence': 0
            }

        # 2. Build the context
        context_parts = []
        sources = []

        for i, r in enumerate(results[:3]):  # use at most 3 documents
            doc = r['doc']
            chunks = r['matched_chunks']

            context_parts.append(f"[Document {i+1}] {doc.get('title', doc['filename'])}")

            for chunk in chunks[:2]:  # at most 2 chunks per document
                context_parts.append(chunk.get('content', '')[:500])

            sources.append({
                'id': doc['id'],
                'title': doc.get('title', doc['filename']),
                'score': r['score']
            })

        context = '\n\n'.join(context_parts)

        # 3. Generate the answer with the LLM
        prompt = f"""Answer the question using the reference information below. If the references do not contain relevant content, say so.

Question: {query}

Reference information:
{context}

Give an accurate, concise answer and cite the information sources."""

        try:
            llm_config = get_llm_config()
            client = OpenAI(
                api_key=llm_config['api_key'],
                base_url=llm_config['api_base'],
            )

            response = client.chat.completions.create(
                model=llm_config['model'],
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000,
                temperature=0.5,
            )

            answer = response.choices[0].message.content

        except Exception as e:
            answer = f"Error while generating the answer: {e}"

        # 4. Return the result
        return {
            'answer': answer,
            'sources': sources,
            'confidence': min(1.0, results[0]['score'] / 10) if results else 0
        }
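

# Hypothetical end-to-end sketch (assumes the Flask app and database are set up
# elsewhere in the project; `app` is that application object):
#   with app.app_context():
#       result = RAGGenerator().answer("What does the Q3 report say about revenue?")
#       print(result['answer'])
#       for src in result['sources']:
#           print(' -', src['title'], src['score'])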