commit 3094a5cb56bbe7e8a77802b1efd5d8cc9145b515 Author: hubian <908234780@qq.com> Date: Wed Apr 8 11:40:20 2026 +0800 初始化文章撰写大模型工作流系统 功能模块: - 主题选择器: 基于历史文章和热门趋势自动推荐主题 - 资料收集器: 自动搜索和收集相关资料 - 资料分析器: 深度分析资料内容,提取关键信息 - 大纲生成器: 基于资料自动生成文章大纲 - 文章撰写器: 分段撰写,支持多种文章类型 - 资料池管理: 中间产物持久化存储,建立索引 支持的文字类型: - 技术解析 - 技术文档翻译 - 项目介绍分析 - 综述文章 - 实践教程 - 问题分析 技术栈: - Python 3.x - 大模型API (OpenAI格式) - Requests库 配置的大模型: - URL: http://192.168.2.5:1234/v1 - Model: qwen3.5-4b diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e68df6e --- /dev/null +++ b/.gitignore @@ -0,0 +1,55 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +ENV/ +env/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# 数据文件(保留结构) +data/resource_pool/* +!data/resource_pool/.gitkeep +data/summaries/* +!data/summaries/.gitkeep +*.json +!data/resource_index.json +!data/article_history.json + +# 输出文件 +output/articles/*.md + +# 临时文件 +*.tmp +*.log +.DS_Store +Thumbs.db + +# 敏感配置 +.env +.env.local +*secret* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..06736c0 --- /dev/null +++ b/README.md @@ -0,0 +1,189 @@ +# 文章撰写大模型工作流系统 + +> 基于大模型的文章写作自动化工作流系统 + +## 系统概述 + +这是一个完整的文章写作工作流系统,利用大语言模型自动化完成从选题到成文的全过程。 + +### 核心特点 + +- **智能选题**: 基于历史文章和热门趋势自动推荐主题 +- **资料收集**: 自动搜索和收集相关资料 +- **深度分析**: 对每份资料进行全面分析,提取关键信息 +- **资料池**: 中间产物持久化存储,避免重复工作 +- **大纲生成**: 基于资料自动生成文章大纲 +- **智能写作**: 分段撰写,支持多种文章类型 + +## 工作流程 + +``` +1. 确定主题和类型 + ├── 分析历史文章 + ├── 搜索热门话题 + └── 推荐/选择主题 + +2. 收集相关资料 + ├── 生成搜索关键词 + ├── 执行网页搜索 + └── 获取页面内容 + +3. 分析资料内容 + ├── 逐个深度分析 + ├── 提取关键要点 + ├── 保存分析产物 + └── 建立索引 + +4. 生成文章大纲 + ├── 整合资料摘要 + ├── 规划章节结构 + └── 标注资料引用 + +5. 
撰写文章 + ├── 分章节写作 + ├── 添加代码/图表 + └── 输出Markdown文件 +``` + +## 快速开始 + +### 安装依赖 + +```bash +pip install requests +``` + +### 运行完整工作流 + +```bash +# 交互式运行(推荐首次使用) +python main.py --interactive + +# 完整自动流程 +python main.py --mode full + +# 指定主题和类型 +python main.py --mode full --topic "Flash Attention原理解析" --type "技术解析" +``` + +### 分步运行 + +```bash +# 仅选择主题 +python main.py --mode topic --interactive + +# 仅收集资料 +python main.py --mode collect --topic "你的主题" + +# 仅分析资料 +python main.py --mode analyze + +# 仅生成大纲 +python main.py --mode outline --topic "你的主题" --type "技术解析" + +# 仅写作文章 +python main.py --mode write --topic "你的主题" --type "技术解析" +``` + +## 配置说明 + +编辑 `config/settings.py` 配置大模型API: + +```python +LLM_CONFIG = { + "base_url": "http://192.168.2.5:1234/v1", + "api_key": "your-api-key", + "model": "qwen3.5-4b", + "max_tokens": 4096, + "temperature": 0.7, +} +``` + +## 文章类型 + +| 类型 | 说明 | 目标字数 | +|------|------|----------| +| 技术解析 | 深入分析技术原理、实现、应用 | 2000-3000字 | +| 技术文档翻译 | 翻译官方文档、论文、博客 | 1500-2500字 | +| 项目介绍分析 | 介绍开源项目、分析架构 | 1500-2500字 | +| 综述文章 | 汇总领域发展、对比分析 | 2000-3000字 | +| 实践教程 | 手把手教程,具体步骤 | 1500-2000字 | +| 问题分析 | 分析技术问题、解决方案 | 1000-1500字 | + +## 目录结构 + +``` +article-workflow/ +├── main.py # 主入口 +├── config/ # 配置文件 +│ └── settings.py # 系统配置 +├── src/ # 源代码 +│ ├── llm_client.py # LLM客户端 +│ ├── resource_pool.py # 资料池管理 +│ ├── topic_selector.py # 主题选择 +│ ├── resource_collector.py # 资料收集 +│ ├── resource_analyzer.py # 资料分析 +│ ├── outline_generator.py # 大纲生成 +│ └── article_writer.py # 文章写作 +├── data/ # 数据存储 +│ ├── resource_index.json # 资料索引 +│ ├── article_history.json # 文章历史 +│ ├── resource_pool/ # 资料池 +│ └── summaries/ # 分析摘要 +├── output/ # 输出目录 +│ └── articles/ # 生成的文章 +├── assets/ # 资源文件 +│ └── images/ # 图片资源 +└── templates/ # 模板文件 +``` + +## 资料池管理 + +系统维护一个持久的资料池,所有分析产物都会保存: + +- **资料内容**: 原始网页内容 +- **分析摘要**: 每份资料的深度分析 +- **关键要点**: 提取的核心信息 +- **术语表**: 专业术语解释 +- **图片资源**: 相关图片 + +可以通过关键词索引快速检索历史资料,避免重复工作。 + +## 扩展开发 + +### 添加新的文章类型 + +编辑 `config/settings.py`: + +```python 
"""Configuration for the LLM-driven article-writing workflow."""

import os

# LLM API settings (OpenAI-compatible chat-completions endpoint).
# SECURITY: the API key must not live in version control. It is read from the
# LLM_API_KEY environment variable; the placeholder below matches the README.
# Any key that was previously committed here should be rotated.
LLM_CONFIG = {
    "base_url": "http://192.168.2.5:1234/v1",
    "api_key": os.environ.get("LLM_API_KEY", "your-api-key"),
    "model": "qwen3.5-4b",
    "max_tokens": 4096,
    "temperature": 0.7,
    "timeout": 120,
}

# Article type definitions.
# NOTE: ``word_count`` must be a plain positive integer because it is
# interpolated into prompts as the target length. Writing ``2000-3000`` here is
# integer subtraction (== -1000), so the documented range is stored separately
# in ``word_count_range`` and ``word_count`` holds its midpoint.
ARTICLE_TYPES = {
    "技术解析": {
        "description": "深入分析某项技术的原理、实现、应用场景",
        "structure": ["背景介绍", "技术原理", "核心实现", "应用案例", "总结展望"],
        "word_count": 2500,
        "word_count_range": (2000, 3000),
    },
    "技术文档翻译": {
        "description": "翻译官方文档、论文、技术博客等",
        "structure": ["原文概述", "核心内容翻译", "关键术语解释", "补充说明"],
        "word_count": 2000,
        "word_count_range": (1500, 2500),
    },
    "项目介绍分析": {
        "description": "介绍开源项目、分析项目架构、使用方法",
        "structure": ["项目概述", "核心功能", "架构分析", "使用指南", "对比评价"],
        "word_count": 2000,
        "word_count_range": (1500, 2500),
    },
    "综述文章": {
        "description": "汇总某一领域的技术发展、对比分析多个方案",
        "structure": ["领域背景", "主流方案", "对比分析", "发展趋势", "总结建议"],
        "word_count": 2500,
        "word_count_range": (2000, 3000),
    },
    "实践教程": {
        "description": "手把手教程,包含具体操作步骤",
        "structure": ["准备工作", "步骤详解", "常见问题", "进阶技巧"],
        "word_count": 1750,
        "word_count_range": (1500, 2000),
    },
    "问题分析": {
        "description": "分析某个技术问题、Bug、性能瓶颈等",
        "structure": ["问题描述", "原因分析", "解决方案", "经验总结"],
        "word_count": 1250,
        "word_count_range": (1000, 1500),
    },
}

# Resource pool locations (paths are relative to the working directory).
RESOURCE_POOL_CONFIG = {
    "pool_dir": "data/resource_pool",
    "index_file": "data/resource_index.json",
    "history_file": "data/article_history.json",
    "summaries_dir": "data/summaries",
    "images_dir": "assets/images",
}

# Search settings.
SEARCH_CONFIG = {
    "max_results": 10,
    "sources": ["web", "github", "huggingface", "papers"],
    "cache_hours": 24,
}

# Output settings.
OUTPUT_CONFIG = {
    "output_dir": "output/articles",
    "format": "markdown",
    "include_assets": True,
}
# Article type used whenever --type is omitted; must be a key of ARTICLE_TYPES.
_DEFAULT_ARTICLE_TYPE = "技术解析"

# Modes that cannot run without an explicit --topic.
_TOPIC_REQUIRED_MODES = ('collect', 'outline', 'write')


def main():
    """Parse the command line and run the requested workflow stage.

    Modes: ``full`` runs every stage; ``topic``, ``collect``, ``analyze``,
    ``outline`` and ``write`` run a single stage. ``collect``, ``outline`` and
    ``write`` require ``--topic``; when it is missing a hint is printed and
    the function returns without doing any work.
    """
    parser = argparse.ArgumentParser(description='文章撰写大模型工作流系统')
    parser.add_argument('--mode', choices=['full', 'topic', 'collect', 'analyze', 'outline', 'write'],
                        default='full', help='运行模式')
    parser.add_argument('--topic', type=str, help='指定文章主题')
    parser.add_argument('--type', type=str, choices=list(ARTICLE_TYPES.keys()),
                        help='指定文章类型')
    parser.add_argument('--interactive', action='store_true', help='交互式模式')
    args = parser.parse_args()

    print("=" * 60)
    print("文章撰写大模型工作流系统")
    print("=" * 60)

    # Build every component up front; they all share one resource pool.
    pool = ResourcePool(RESOURCE_POOL_CONFIG)
    topic_selector = TopicSelector(LLM_CONFIG, pool)
    collector = ResourceCollector(LLM_CONFIG, SEARCH_CONFIG, pool)
    analyzer = ResourceAnalyzer(LLM_CONFIG, pool)
    outline_gen = OutlineGenerator(LLM_CONFIG, pool)
    writer = ArticleWriter(LLM_CONFIG, OUTPUT_CONFIG, pool)

    if args.mode in _TOPIC_REQUIRED_MODES and not args.topic:
        print("请指定主题: --topic")
        return

    article_type = args.type or _DEFAULT_ARTICLE_TYPE

    if args.mode == 'full':
        run_full_workflow(args, topic_selector, collector, analyzer, outline_gen, writer, pool)
    elif args.mode == 'topic':
        topic, selected_type = topic_selector.select(args.interactive)
        print(f"\n确定主题: {topic}")
        print(f"文章类型: {selected_type}")
    elif args.mode == 'collect':
        collector.collect(args.topic)
    elif args.mode == 'analyze':
        # analyze_all() takes the topic as a required positional argument;
        # calling it with no argument raised TypeError. The topic is optional
        # in this mode, so fall back to an empty string.
        analyzer.analyze_all(args.topic or "")
    elif args.mode == 'outline':
        outline = outline_gen.generate(args.topic, article_type)
        print(f"\n文章大纲:\n{outline}")
    elif args.mode == 'write':
        article = writer.write(args.topic, article_type)
        print(f"\n文章已生成: {article}")


def run_full_workflow(args, topic_selector, collector, analyzer, outline_gen, writer, pool):
    """Run all five stages: topic, collection, analysis, outline, writing.

    Args:
        args: Parsed CLI namespace; reads ``topic``, ``type``, ``interactive``.
        topic_selector: Object with ``select(interactive) -> (topic, type)``.
        collector: Object with ``collect(topic) -> list``.
        analyzer: Object with ``analyze_all(topic) -> list``.
        outline_gen: Object with ``generate(topic, article_type) -> str``.
        writer: Object with ``write(topic, article_type, outline) -> str``.
        pool: Object with ``add_article_history(record)``.
    """
    # Stage 1: decide topic and article type.
    print("\n【阶段1】确定文章主题和类型")
    print("-" * 40)

    if args.topic:
        # An explicit --topic is always honoured. Previously it was silently
        # discarded unless --type was supplied as well.
        topic = args.topic
        article_type = args.type or _DEFAULT_ARTICLE_TYPE
        print(f"使用指定主题: {topic}")
        print(f"文章类型: {article_type}")
    else:
        topic, article_type = topic_selector.select(args.interactive)
        if args.type:
            # An explicit --type wins over the selector's suggestion.
            article_type = args.type
        print(f"\n确定主题: {topic}")
        print(f"文章类型: {article_type}")

    # Stage 2: collect resources.
    print("\n【阶段2】收集相关资料")
    print("-" * 40)

    resources = collector.collect(topic)
    print(f"收集到 {len(resources)} 份资料")

    # Stage 3: analyze resources.
    print("\n【阶段3】分析资料内容")
    print("-" * 40)

    analyzed_resources = analyzer.analyze_all(topic)
    print(f"完成 {len(analyzed_resources)} 份资料分析")

    # Stage 4: generate the outline.
    print("\n【阶段4】生成文章大纲")
    print("-" * 40)

    outline = outline_gen.generate(topic, article_type)
    print("大纲生成完成")

    # Stage 5: write the article.
    print("\n【阶段5】撰写文章")
    print("-" * 40)

    article_path = writer.write(topic, article_type, outline)

    print("\n" + "=" * 60)
    print("工作流完成!")
    print("=" * 60)
    print(f"文章输出: {article_path}")

    # Record the run so later topic selection can take history into account.
    pool.add_article_history({
        "topic": topic,
        "type": article_type,
        "date": datetime.now().isoformat(),
        "output_path": article_path,
        "resources_count": len(analyzed_resources),
    })


if __name__ == '__main__':
    main()
+ +from .llm_client import LLMClient, LLMHelper +from .resource_pool import ResourcePool +from .topic_selector import TopicSelector +from .resource_collector import ResourceCollector +from .resource_analyzer import ResourceAnalyzer +from .outline_generator import OutlineGenerator +from .article_writer import ArticleWriter + +__all__ = [ + 'LLMClient', + 'LLMHelper', + 'ResourcePool', + 'TopicSelector', + 'ResourceCollector', + 'ResourceAnalyzer', + 'OutlineGenerator', + 'ArticleWriter', +] \ No newline at end of file diff --git a/src/article_writer.py b/src/article_writer.py new file mode 100644 index 0000000..82a6cbd --- /dev/null +++ b/src/article_writer.py @@ -0,0 +1,237 @@ +""" +文章写作模块 - 基于资料和大纲撰写文章 +""" + +import os +import re +import json +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional +from .llm_client import LLMClient, LLMHelper +from .resource_pool import ResourcePool +from config.settings import ARTICLE_TYPES, OUTPUT_CONFIG + + +class ArticleWriter: + """文章写作者""" + + def __init__(self, llm_config: dict, output_config: dict, pool: ResourcePool): + self.llm = LLMClient(llm_config) + self.helper = LLMHelper(self.llm) + self.output_config = output_config + self.pool = pool + + def write(self, topic: str, article_type: str, outline: Optional[str] = None) -> str: + """撰写文章""" + print(f"\n正在撰写文章: {topic}") + + # 1. 获取资料 + resources = self._prepare_resources(topic) + + # 2. 获取/生成大纲 + if not outline: + from .outline_generator import OutlineGenerator + outline_gen = OutlineGenerator({'base_url': self.llm.base_url, + 'api_key': self.llm.api_key, + 'model': self.llm.model}, self.pool) + outline = outline_gen.generate(topic, article_type) + + # 3. 获取文章类型信息 + type_info = ARTICLE_TYPES.get(article_type, ARTICLE_TYPES['技术解析']) + target_words = type_info.get('word_count', 2000) + + # 4. 分段写作 + article_content = self._write_article(topic, article_type, outline, resources, target_words) + + # 5. 
保存文章 + article_path = self._save_article(topic, article_content) + + print(f"文章已生成: {article_path}") + return article_path + + def _prepare_resources(self, topic: str) -> str: + """准备参考资料内容""" + summaries = self.pool.get_all_summaries() + + if not summaries: + return "暂无参考资料" + + resources_text = "# 参考资料\n\n" + + for i, s in enumerate(summaries[:8], 1): # 限制资料数量 + content = s.get('content', '') + # 提取关键内容 + resources_text += f"## 资料{i}\n\n" + + # 提取摘要 + summary_match = re.search(r'核心内容摘要[::]\s*(.+?)(?=##|\Z)', content, re.DOTALL) + if summary_match: + resources_text += f"摘要: {summary_match.group(1).strip()[:500]}\n\n" + + # 提取要点 + points_match = re.search(r'关键要点[::]\s*(.+?)(?=##|\Z)', content, re.DOTALL) + if points_match: + resources_text += f"要点:\n{points_match.group(1).strip()[:500]}\n\n" + + resources_text += "---\n\n" + + return resources_text + + def _write_article(self, topic: str, article_type: str, outline: str, + resources: str, target_words: int) -> str: + """撰写文章主体""" + + # 一次性生成完整文章 + prompt = f"""你是一位技术博客作者,请根据大纲和参考资料撰写一篇高质量技术文章。 + +主题: {topic} +文章类型: {article_type} +目标字数: {target_words}字左右 + +大纲: +{outline} + +参考资料: +{resources[:6000]} + +写作要求: +1. 内容专业准确,语言流畅易懂 +2. 适当使用代码示例、公式说明(使用Markdown格式) +3. 关键术语第一次出现时给出解释 +4. 图表位置标注 [图: 描述],后续会补充 +5. 引用资料时标注来源 +6. 结构清晰,层次分明 + +请输出完整的文章内容(包含标题、正文、总结):""" + + article = self.llm.generate(prompt, temperature=0.7) + + if not article: + article = self._generate_fallback_article(topic, outline) + + # 后处理 + article = self._post_process(article, topic) + + return article + + def _generate_fallback_article(self, topic: str, outline: str) -> str: + """生成备用文章(LLM失败时)""" + return f"""# {topic} + +## 概述 + +本文将深入探讨{topic}的相关内容。 + +## 核心内容 + +{outline} + +## 总结 + +本文对{topic}进行了全面介绍,希望对读者有所帮助。 + +## 参考资料 + +- 参考资料1 +- 参考资料2 +""" + + def _post_process(self, article: str, topic: str) -> str: + """后处理文章""" + # 1. 确保有标题 + if not article.startswith('# '): + article = f"# {topic}\n\n{article}" + + # 2. 
添加元信息 + meta = f""" + +""" + + # 3. 添加图片占位说明 + article = re.sub(r'\[图[::]\s*([^\]]+)\]', r'![\1](placeholder://\1)', article) + + return meta + article + + def _save_article(self, topic: str, content: str) -> str: + """保存文章""" + output_dir = Path(self.output_config.get('output_dir', 'output/articles')) + output_dir.mkdir(parents=True, exist_ok=True) + + # 生成文件名 + safe_topic = re.sub(r'[^\w\s-]', '', topic)[:50] + safe_topic = re.sub(r'[-\s]+', '-', safe_topic) + date_prefix = datetime.now().strftime('%Y%m%d') + filename = f"{date_prefix}_{safe_topic}.md" + + filepath = output_dir / filename + filepath.write_text(content, encoding='utf-8') + + return str(filepath) + + def write_section_by_section(self, topic: str, article_type: str, + outline: str, resources: str) -> str: + """分段写作文章(适用于长文章)""" + from .outline_generator import OutlineGenerator + + outline_gen = OutlineGenerator({'base_url': self.llm.base_url, + 'api_key': self.llm.api_key, + 'model': self.llm.model}, self.pool) + sections = outline_gen.extract_section_points(outline) + + article_parts = [f"# {topic}\n"] + + for section_name, points in sections.items(): + print(f" 撰写: {section_name}") + + section_prompt = f"""请撰写文章章节。 + +章节标题: {section_name} +章节要点: {points} + +参考资料: +{resources[:3000]} + +要求: +1. 围绕要点展开,内容专业 +2. 适当加入代码或图表说明 +3. 字数300-500字 + +请输出章节内容:""" + + section_content = self.llm.generate(section_prompt, temperature=0.7) + + if section_content: + article_parts.append(f"\n## {section_name}\n\n{section_content}\n") + + return "\n".join(article_parts) + + def enhance_article(self, article: str) -> str: + """增强文章(添加图表、优化排版)""" + # 生成图表描述 + chart_prompt = f"""请分析以下文章,建议在哪些位置添加图表: + +{article[:5000]} + +请以JSON格式输出建议: +[ + {{"position": "章节位置", "chart_type": "图表类型", "description": "图表内容描述"}}, + ... 
class LLMClient:
    """Minimal client for an OpenAI-compatible chat-completions API.

    Request methods are best-effort: failures are printed and reported to the
    caller as ``None`` (or an early end of the stream) instead of raising, so
    callers can fall back to non-LLM behaviour.
    """

    def __init__(self, config):
        """Store connection and sampling settings.

        Args:
            config: Mapping with required keys ``base_url``, ``api_key`` and
                ``model``, and optional ``max_tokens`` (default 4096),
                ``temperature`` (default 0.7) and ``timeout`` in seconds
                (default 120).

        Raises:
            KeyError: If a required key is missing.
        """
        # Drop trailing slashes so "{base_url}/chat/completions" never has "//".
        self.base_url = config['base_url'].rstrip('/')
        self.api_key = config['api_key']
        self.model = config['model']
        self.max_tokens = config.get('max_tokens', 4096)
        self.temperature = config.get('temperature', 0.7)
        self.timeout = config.get('timeout', 120)

    def _headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every request."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _build_payload(self, messages: List[Dict], temperature: Optional[float],
                       stream: bool = False) -> Dict:
        """Build the JSON request body.

        ``temperature`` falls back to the configured default only when it is
        ``None``; an explicit ``0.0`` (deterministic sampling) is preserved.
        """
        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature if temperature is None else temperature,
        }
        if stream:
            payload["stream"] = True
        return payload

    def chat(self, messages: List[Dict], temperature: Optional[float] = None) -> Optional[str]:
        """Send a chat request and return the assistant message text.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            temperature: Sampling temperature; ``None`` uses the default.

        Returns:
            The reply text, or ``None`` on timeout, transport error, non-200
            status or a response body without the expected structure.
        """
        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=self._headers(),
                json=self._build_payload(messages, temperature),
                timeout=self.timeout
            )
        except requests.Timeout:
            print("API请求超时")
            return None
        except Exception as e:  # best-effort boundary: never crash the workflow
            print(f"API请求失败: {e}")
            return None

        if response.status_code != 200:
            print(f"API错误: {response.status_code} - {response.text}")
            return None

        try:
            return response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Body was not JSON or did not have the chat-completions shape.
            print(f"API请求失败: {e}")
            return None

    def chat_with_system(self, system_prompt: str, user_prompt: str,
                         temperature: Optional[float] = None) -> Optional[str]:
        """Send a system prompt plus one user prompt; see :meth:`chat`."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        return self.chat(messages, temperature)

    def chat_stream(self, messages: List[Dict], temperature: Optional[float] = None):
        """Yield reply text fragments from a streaming (SSE) chat request.

        The generator ends early, after printing a message, on a non-200
        status or a transport error. Malformed chunks are skipped.
        """
        try:
            # The context manager releases the connection even if the consumer
            # stops iterating before the stream is exhausted.
            with requests.post(
                f"{self.base_url}/chat/completions",
                headers=self._headers(),
                json=self._build_payload(messages, temperature, stream=True),
                timeout=self.timeout,
                stream=True
            ) as response:
                if response.status_code != 200:
                    print(f"API错误: {response.status_code} - {response.text}")
                    return

                for raw_line in response.iter_lines():
                    if not raw_line:
                        continue
                    line = raw_line.decode('utf-8')
                    if not line.startswith('data: '):
                        continue
                    data = line[6:]
                    if data == '[DONE]':
                        break
                    try:
                        chunk = json.loads(data)
                        content = chunk['choices'][0]['delta'].get('content', '')
                    except (json.JSONDecodeError, KeyError, IndexError):
                        continue
                    if content:
                        yield content

        except Exception as e:  # best-effort boundary
            print(f"流式请求失败: {e}")

    def generate(self, prompt: str, temperature: Optional[float] = None) -> Optional[str]:
        """Run a single-turn completion for ``prompt``; see :meth:`chat`."""
        messages = [{"role": "user", "content": prompt}]
        return self.chat(messages, temperature)

    def batch_generate(self, prompts: List[str],
                       temperature: Optional[float] = None) -> List[Optional[str]]:
        """Generate a reply for each prompt sequentially.

        Returns one entry per prompt, in order; failed requests yield ``None``.
        """
        results = []
        for index, prompt in enumerate(prompts):
            if index:
                time.sleep(0.5)  # throttle between requests, not after the last
            results.append(self.generate(prompt, temperature))
        return results


class LLMHelper:
    """Prompt templates for common tasks on top of an :class:`LLMClient`.

    Only ``client.generate(prompt, temperature=...)`` is used.
    """

    def __init__(self, client: LLMClient):
        self.client = client

    def summarize(self, text: str, max_length: int = 500) -> Optional[str]:
        """Summarize ``text`` in at most ``max_length`` characters.

        Returns ``None`` when the LLM call fails.
        """
        prompt = f"""请对以下内容进行总结,要求:
1. 提炼核心观点和关键信息
2. 保留重要的技术细节和术语
3. 总结长度控制在{max_length}字以内

内容:
{text}

请输出总结:"""

        return self.client.generate(prompt, temperature=0.5)

    def extract_key_points(self, text: str, num_points: int = 5) -> List[str]:
        """Extract up to ``num_points`` key points from ``text``.

        Bullet lines (``- `` / ``• ``) have their marker stripped; other
        non-empty, non-heading lines are kept verbatim. Returns ``[]`` when
        the LLM call fails.
        """
        prompt = f"""请从以下内容中提取{num_points}个关键要点:

{text}

请以列表形式输出关键要点:"""

        result = self.client.generate(prompt, temperature=0.5)
        if not result:
            return []

        points = []
        for line in result.split('\n'):
            line = line.strip()
            if line.startswith(('- ', '• ')):
                points.append(line[2:])
            elif line and not line.startswith('#'):
                points.append(line)
        return points[:num_points]

    def extract_terms(self, text: str) -> List[Dict]:
        """Extract technical terms with short definitions from ``text``.

        Returns the parsed JSON array of ``{"term", "definition"}`` dicts, or
        ``[]`` when the call fails or the reply has no parseable JSON array.
        """
        # The literal braces of the JSON example must be doubled inside an
        # f-string; unescaped, "{"term": ...}" is parsed as a replacement
        # field with an invalid format spec and raises ValueError.
        prompt = f"""请从以下内容中提取重要的技术术语、概念、名词,并给出简要解释:

{text}

请以JSON格式输出,格式如下:
[
  {{"term": "术语名称", "definition": "简要解释"}},
  ...
]"""

        result = self.client.generate(prompt, temperature=0.3)
        if not result:
            return []

        import re  # local import: this module does not import re at file level
        json_match = re.search(r'\[.*\]', result, re.DOTALL)
        if not json_match:
            return []
        try:
            terms = json.loads(json_match.group())
        except json.JSONDecodeError:
            return []
        return terms if isinstance(terms, list) else []

    def analyze_relation(self, text: str, topic: str) -> Optional[str]:
        """Ask the LLM how ``text`` relates to ``topic``; ``None`` on failure."""
        prompt = f"""请分析以下内容与主题"{topic}"的关联性:

内容:
{text}

请从以下角度分析:
1. 内容与主题的直接关联度
2. 提供了哪些可用于主题的信息
3. 有哪些可以借鉴的观点或案例"""

        return self.client.generate(prompt, temperature=0.5)

    def generate_outline(self, topic: str, article_type: str,
                         structure: List[str], resources_summary: str) -> Optional[str]:
        """Generate an article outline; ``None`` on failure."""
        prompt = f"""请根据以下信息生成文章大纲:

主题:{topic}
文章类型:{article_type}
参考结构:{structure}
资料摘要:{resources_summary}

请生成详细的文章大纲,包含:
1. 各章节标题
2. 每个章节的主要内容要点
3. 建议使用的资料引用"""

        return self.client.generate(prompt, temperature=0.7)

    def write_section(self, section_title: str, section_outline: str,
                      resources: str, style: str = "技术博客") -> Optional[str]:
        """Write one article section of roughly 300-500 characters.

        Returns ``None`` when the LLM call fails.
        """
        prompt = f"""请撰写文章章节:

章节标题:{section_title}
章节大纲:{section_outline}
参考资料:{resources}
写作风格:{style}

要求:
1. 内容专业准确,基于参考资料
2. 语言流畅易懂
3. 适当使用代码示例、公式、图表说明
4. 控制字数在300-500字

请输出章节内容:"""

        return self.client.generate(prompt, temperature=0.7)
标注可以使用参考资料的部分 +4. 考虑加入图表、代码示例、公式等元素 + +请以Markdown格式输出大纲: + +# {topic} + +## 1. 第一节标题 +- 要点1 +- 要点2 +- [参考: 资料X] + +## 2. 第二节标题 +... + +## 参考资料清单 +- 资料1: 标题 - URL +- 资料2: ... +""" + + result = self.llm.generate(prompt, temperature=0.7) + + if result: + return result + + # 返回默认大纲 + return self._get_default_outline(topic, structure) + + def _get_default_outline(self, topic: str, structure: List[str]) -> str: + """获取默认大纲""" + outline = f"# {topic}\n\n" + + for i, section in enumerate(structure, 1): + outline += f"## {i}. {section}\n" + outline += f"- {section}的核心要点1\n" + outline += f"- {section}的核心要点2\n" + outline += f"- [待补充具体内容]\n\n" + + outline += "## 参考资料\n" + outline += "- [待补充]\n" + + return outline + + def _save_outline(self, topic: str, outline: str): + """保存大纲""" + import os + from pathlib import Path + + output_dir = Path('output') + output_dir.mkdir(parents=True, exist_ok=True) + + # 生成文件名 + safe_topic = re.sub(r'[^\w\s-]', '', topic)[:50] + safe_topic = re.sub(r'[-\s]+', '-', safe_topic) + filename = f"{safe_topic}_outline.md" + + filepath = output_dir / filename + filepath.write_text(outline, encoding='utf-8') + + print(f"大纲已保存: {filepath}") + + def refine_outline(self, outline: str, feedback: str) -> str: + """根据反馈优化大纲""" + prompt = f"""请根据反馈意见优化文章大纲。 + +原大纲: +{outline} + +反馈意见: +{feedback} + +请输出优化后的大纲:""" + + return self.llm.generate(prompt, temperature=0.6) or outline + + def extract_section_points(self, outline: str) -> Dict[str, List[str]]: + """从大纲提取各章节要点""" + sections = {} + + # 解析大纲 + current_section = None + for line in outline.split('\n'): + if line.startswith('## '): + current_section = line[3:].strip() + sections[current_section] = [] + elif line.startswith('- ') and current_section: + point = line[2:].strip() + if point and not point.startswith('['): + sections[current_section].append(point) + + return sections + + def generate_writing_plan(self, outline: str) -> Dict: + """生成写作计划""" + sections = self.extract_section_points(outline) + + plan 
class ResourceAnalyzer:
    """Run LLM analysis over collected resources and store the results.

    Collaborators are duck-typed: the pool must provide ``list_resources``,
    ``get_resource``, ``add_summary``, ``update_keyword_index`` and
    ``get_all_summaries``; the LLM must provide ``generate``.
    """

    def __init__(self, llm_config: dict, pool: "ResourcePool"):
        self.llm = LLMClient(llm_config)
        self.helper = LLMHelper(self.llm)
        self.pool = pool

    def analyze_all(self, topic: str = "") -> List[Dict]:
        """Analyze every resource in the pool not yet marked as analyzed.

        Args:
            topic: Article topic the resources were collected for. It is
                optional so that analysis can run without one, as the
                ``--mode analyze`` CLI path does.

        Returns:
            One ``{"id", "title", "analysis"}`` dict per analyzed resource.
        """
        print(f"\n正在分析资料...")

        resources = self.pool.list_resources(analyzed_only=False)
        # Entries without an 'analyzed' flag are treated as not analyzed.
        unanalyzed = [r for r in resources if not r.get('analyzed')]

        print(f"待分析资料: {len(unanalyzed)} 份")

        analyzed = []
        for i, resource in enumerate(unanalyzed, 1):
            title = resource.get('title', '')
            print(f"\n[{i}/{len(unanalyzed)}] 分析: {title[:40]}...")

            full_resource = self.pool.get_resource(resource['id'])
            if not full_resource:
                continue

            analysis = self._analyze_resource(full_resource, topic)
            if not analysis:
                print(f"  ✗ 分析失败")
                continue

            self.pool.add_summary(resource['id'], analysis)

            # The LLM may return key_points as null or as one string; coerce
            # to a list before slicing so the keyword index gets whole
            # phrases, not single characters.
            key_points = analysis.get('key_points') or []
            if isinstance(key_points, str):
                key_points = [key_points]
            self.pool.update_keyword_index(list(key_points)[:5], resource['id'])

            analyzed.append({
                'id': resource['id'],
                'title': title,
                'analysis': analysis
            })
            print(f"  ✓ 完成分析")

        print(f"\n共完成 {len(analyzed)} 份资料分析")
        return analyzed

    def _analyze_resource(self, resource: Dict, topic: str) -> Optional[Dict]:
        """Analyze one resource.

        Returns the LLM's JSON analysis as a dict, a heuristic fallback from
        :meth:`_simple_analysis` when the LLM fails or returns unparseable
        output, or ``None`` when the content is shorter than 100 characters.
        """
        content = resource.get('content', '')

        if not content or len(content) < 100:
            print(f"  内容太短,跳过")
            return None

        # Cap the prompt size; the marker tells the model the text is cut.
        if len(content) > 8000:
            content = content[:8000] + '\n...(内容已截断)'

        prompt = f"""请对以下资料进行全面分析,这是为撰写"{topic}"主题文章收集的资料。

资料标题: {resource.get('title', '')}
资料来源: {resource.get('url', '')}

资料内容:
{content}

请按以下结构进行分析,以JSON格式输出:
{{
    "original_title": "原标题",
    "source": "来源",
    "summary": "核心内容摘要(200字以内)",
    "key_points": ["关键要点1", "关键要点2", "关键要点3", "关键要点4", "关键要点5"],
    "terms": [
        {{"term": "术语1", "definition": "解释"}},
        {{"term": "术语2", "definition": "解释"}}
    ],
    "relation_analysis": "与主题'{topic}'的关联分析",
    "usable_parts": "可用于文章的部分(具体说明哪些内容可以引用)",
    "images": [
        {{"url": "图片URL", "alt": "图片描述", "context": "图片出现的上下文"}}
    ],
    "quality_score": 1-10分(资料质量评分),
    "relevance_score": 1-10分(与主题相关度)
}}"""

        result = self.llm.generate(prompt, temperature=0.3)

        if result:
            # Take the outermost {...} span: models often wrap JSON in prose.
            json_match = re.search(r'\{[\s\S]*\}', result)
            if json_match:
                try:
                    analysis = json.loads(json_match.group())
                except json.JSONDecodeError as e:
                    print(f"  JSON解析失败: {e}")
                else:
                    if isinstance(analysis, dict):
                        return analysis

        return self._simple_analysis(resource, topic)

    def _simple_analysis(self, resource: Dict, topic: str) -> Dict:
        """Build a heuristic analysis without the LLM (fallback path).

        Uses the first 200 characters as the summary and the first three
        sentences longer than 50 characters as key points.
        """
        content = resource.get('content', '')

        # Split on the Chinese full stop as well as on newlines.
        sentences = content.replace('。', '。\n').split('\n')
        key_sentences = [s.strip() for s in sentences if len(s.strip()) > 50][:3]

        return {
            "original_title": resource.get('title', ''),
            "source": resource.get('url', ''),
            # Only mark truncation when something was actually cut off.
            "summary": content[:200] + ('...' if len(content) > 200 else ''),
            "key_points": key_sentences,
            "terms": [],
            "relation_analysis": f"资料与主题'{topic}'相关",
            "usable_parts": "部分内容可参考",
            "images": [],
            "quality_score": 5,
            "relevance_score": 5
        }

    def analyze_resource_by_id(self, resource_id: str, topic: str) -> Optional[Dict]:
        """Analyze the resource with ``resource_id``; ``None`` if not found."""
        resource = self.pool.get_resource(resource_id)
        if resource:
            return self._analyze_resource(resource, topic)
        return None

    def get_analysis_summary(self) -> str:
        """Return a Markdown digest of all stored analyses.

        Each summary contributes a heading (its first key point, or a generic
        label) and its first three key points.
        """
        summaries = self.pool.get_all_summaries()

        if not summaries:
            return "暂无分析资料"

        parts = ["# 资料分析汇总\n\n"]
        for s in summaries:
            points = s.get('key_points') or []
            heading = points[0] if points else '资料'
            parts.append(f"## {heading}\n\n")
            parts.append(f"要点: {', '.join(str(p) for p in points[:3])}\n\n")

        return "".join(parts)

    def extract_usable_content(self, topic: str) -> Dict:
        """Collect material from all stored analyses for use when writing.

        Only ``key_points`` is populated at present, de-duplicated in
        first-seen order so the result is deterministic. ``terms``,
        ``images`` and ``quotes`` are returned empty.
        """
        summaries = self.pool.get_all_summaries()

        usable = {
            'key_points': [],
            'terms': {},
            'images': [],
            'quotes': [],
        }

        for s in summaries:
            usable['key_points'].extend(s.get('key_points') or [])

        # dict.fromkeys keeps insertion order, unlike list(set(...)).
        usable['key_points'] = list(dict.fromkeys(usable['key_points']))

        return usable
执行搜索 + all_resources = [] + + for keyword in keywords[:3]: # 限制搜索次数 + print(f"\n搜索: {keyword}") + + # 网页搜索 + web_results = self._search_web(keyword) + all_resources.extend(web_results) + + time.sleep(1) # 避免请求过快 + + # 3. 去重并保存到资料池 + unique_resources = [] + for resource in all_resources: + existing_id = self.pool.resource_exists(resource['url']) + if existing_id: + print(f" 跳过已存在: {resource['title'][:30]}...") + continue + + # 获取完整内容 + full_content = self._fetch_content(resource['url']) + if full_content: + resource['content'] = full_content + resource_id = self.pool.add_resource(resource) + resource['id'] = resource_id + unique_resources.append(resource) + print(f" 已收集: {resource['title'][:50]}...") + + print(f"\n共收集 {len(unique_resources)} 篇新资料") + return unique_resources + + def _generate_search_keywords(self, topic: str) -> List[str]: + """生成搜索关键词""" + prompt = f"""请为以下文章主题生成5个搜索关键词或短语,用于搜索相关资料: + +主题: {topic} + +要求: +1. 关键词要具体,能找到高质量资料 +2. 包含技术术语和通俗表达 +3. 包含中英文关键词 + +请以JSON数组格式输出关键词列表:""" + + result = self.llm.generate(prompt, temperature=0.5) + + if result: + try: + json_match = re.search(r'\[.*\]', result, re.DOTALL) + if json_match: + return json.loads(json_match.group()) + except: + pass + + # 默认关键词 + return [topic, f"{topic} 原理", f"{topic} 实现", f"{topic} 教程", topic] + + def _search_web(self, keyword: str) -> List[Dict]: + """网页搜索(使用DuckDuckGo或类似服务)""" + results = [] + + try: + # 使用DuckDuckGo Instant Answer API + url = f"https://api.duckduckgo.com/?q={quote_plus(keyword)}&format=json&no_html=1" + + response = requests.get(url, timeout=10) + if response.status_code == 200: + data = response.json() + + # 解析相关主题 + for topic in data.get('RelatedTopics', [])[:5]: + if isinstance(topic, dict) and 'FirstURL' in topic: + results.append({ + 'title': topic.get('Text', '').split(' - ')[0], + 'url': topic.get('FirstURL', ''), + 'source': 'duckduckgo', + 'type': 'web', + 'snippet': topic.get('Text', ''), + }) + + # 如果结果太少,使用模拟数据 + if len(results) < 3: + 
results.extend(self._get_mock_results(keyword)) + + except Exception as e: + print(f"搜索失败: {e}") + results = self._get_mock_results(keyword) + + return results + + def _get_mock_results(self, keyword: str) -> List[Dict]: + """获取模拟搜索结果(用于测试或API不可用时)""" + # 生成一些相关的模拟资料 + mock_resources = [ + { + 'title': f'{keyword} 技术原理详解', + 'url': f'https://example.com/tech/{keyword.replace(" ", "-")}', + 'source': 'mock', + 'type': 'article', + 'snippet': f'本文详细介绍{keyword}的核心原理和实现方式。', + }, + { + 'title': f'{keyword} 实践指南', + 'url': f'https://example.com/guide/{keyword.replace(" ", "-")}', + 'source': 'mock', + 'type': 'tutorial', + 'snippet': f'从零开始学习{keyword}的完整教程。', + }, + ] + return mock_resources + + def _fetch_content(self, url: str) -> Optional[str]: + """获取网页内容""" + if not url or url.startswith('https://example.com'): + # 模拟内容 + return self._generate_mock_content(url) + + try: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + } + response = requests.get(url, headers=headers, timeout=15) + + if response.status_code == 200: + # 简单的HTML内容提取 + content = self._extract_text_from_html(response.text) + return content + + except Exception as e: + print(f" 获取内容失败: {e}") + + return None + + def _extract_text_from_html(self, html: str) -> str: + """从HTML提取文本内容""" + # 移除script和style标签 + html = re.sub(r']*>.*?', '', html, flags=re.DOTALL) + html = re.sub(r']*>.*?', '', html, flags=re.DOTALL) + + # 提取正文内容 + # 优先找article, main, .content等 + content_match = re.search(r']*>(.*?)', html, re.DOTALL) + if not content_match: + content_match = re.search(r']*>(.*?)', html, re.DOTALL) + if not content_match: + content_match = re.search(r']*>(.*?)', html, re.DOTALL) + + if content_match: + text = content_match.group(1) + else: + text = html + + # 移除HTML标签 + text = re.sub(r'<[^>]+>', ' ', text) + + # 清理空白 + text = re.sub(r'\s+', ' ', text) + text = text.strip() + + # 限制长度 + if len(text) > 10000: + text = text[:10000] + '...' 
+ + return text + + def _generate_mock_content(self, url: str) -> str: + """生成模拟内容""" + return f"""# 技术文章内容 + +这是一篇关于该主题的技术文章。 + +## 背景 +随着技术的快速发展,这个领域变得越来越重要。 + +## 核心概念 +本文介绍了相关的核心概念和实现原理。 + +## 实现细节 +详细讲解了技术实现的具体步骤和注意事项。 + +## 应用场景 +讨论了该技术在实际项目中的应用案例。 + +## 总结 +对该技术进行了全面总结,并展望了未来发展趋势。 +""" + + def collect_from_urls(self, urls: List[str]) -> List[Dict]: + """从指定URL列表收集资料""" + resources = [] + + for url in urls: + print(f"获取: {url}") + + content = self._fetch_content(url) + if content: + resource = { + 'url': url, + 'title': self._extract_title(content, url), + 'content': content, + 'source': 'user_provided', + 'type': 'web', + } + resource_id = self.pool.add_resource(resource) + resource['id'] = resource_id + resources.append(resource) + + return resources + + def _extract_title(self, content: str, url: str) -> str: + """从内容提取标题""" + # 尝试提取h1或title + match = re.search(r'^#\s*(.+)$', content, re.MULTILINE) + if match: + return match.group(1) + + # 从URL提取 + return url.split('/')[-1] or 'Untitled' \ No newline at end of file diff --git a/src/resource_pool.py b/src/resource_pool.py new file mode 100644 index 0000000..98f3238 --- /dev/null +++ b/src/resource_pool.py @@ -0,0 +1,289 @@ +""" +资料池管理模块 - 管理所有中间产物和资料索引 +""" + +import os +import json +import hashlib +from datetime import datetime +from pathlib import Path + + +class ResourcePool: + """资料池管理器""" + + def __init__(self, config): + self.config = config + self.pool_dir = Path(config['pool_dir']) + self.index_file = Path(config['index_file']) + self.history_file = Path(config['history_file']) + self.summaries_dir = Path(config['summaries_dir']) + self.images_dir = Path(config['images_dir']) + + # 确保目录存在 + self.pool_dir.mkdir(parents=True, exist_ok=True) + self.summaries_dir.mkdir(parents=True, exist_ok=True) + self.images_dir.mkdir(parents=True, exist_ok=True) + + # 加载索引 + self.index = self._load_index() + self.history = self._load_history() + + def _load_index(self): + """加载资料索引""" + if self.index_file.exists(): 
+ return json.loads(self.index_file.read_text(encoding='utf-8')) + return { + "resources": {}, + "summaries": {}, + "images": {}, + "keywords_index": {}, + "updated_at": None + } + + def _save_index(self): + """保存资料索引""" + self.index['updated_at'] = datetime.now().isoformat() + self.index_file.write_text(json.dumps(self.index, ensure_ascii=False, indent=2), encoding='utf-8') + + def _load_history(self): + """加载文章历史""" + if self.history_file.exists(): + return json.loads(self.history_file.read_text(encoding='utf-8')) + return {"articles": [], "topics": []} + + def _save_history(self): + """保存文章历史""" + self.history_file.write_text(json.dumps(self.history, ensure_ascii=False, indent=2), encoding='utf-8') + + def _get_hash(self, content): + """计算内容哈希""" + if isinstance(content, str): + content = content.encode('utf-8') + return hashlib.md5(content).hexdigest()[:12] + + # === 资料管理 === + + def add_resource(self, resource_data): + """添加资料到池中""" + resource_id = self._get_hash(resource_data.get('url', '') or resource_data.get('content', '')) + + resource_entry = { + "id": resource_id, + "title": resource_data.get('title', ''), + "url": resource_data.get('url', ''), + "source": resource_data.get('source', ''), + "type": resource_data.get('type', 'web'), + "added_at": datetime.now().isoformat(), + "analyzed": False, + "summary_file": None, + "key_points": [], + } + + # 保存原始内容 + content_file = self.pool_dir / f"{resource_id}_content.md" + content_file.write_text(resource_data.get('content', ''), encoding='utf-8') + + self.index['resources'][resource_id] = resource_entry + self._save_index() + + return resource_id + + def get_resource(self, resource_id): + """获取资料""" + if resource_id not in self.index['resources']: + return None + + entry = self.index['resources'][resource_id] + content_file = self.pool_dir / f"{resource_id}_content.md" + + if content_file.exists(): + entry['content'] = content_file.read_text(encoding='utf-8') + + return entry + + def list_resources(self, 
analyzed_only=False): + """列出所有资料""" + resources = [] + for rid, entry in self.index['resources'].items(): + if analyzed_only and not entry['analyzed']: + continue + resources.append(entry) + return resources + + def resource_exists(self, url): + """检查资料是否已存在""" + for rid, entry in self.index['resources'].items(): + if entry['url'] == url: + return rid + return None + + # === 分析产物管理 === + + def add_summary(self, resource_id, summary_data): + """添加资料分析摘要""" + summary_file = self.summaries_dir / f"{resource_id}_summary.md" + + summary_content = f"""# 资料分析摘要 + +## 基本信息 +- 原标题: {summary_data.get('original_title', '')} +- 来源: {summary_data.get('source', '')} +- 分析时间: {datetime.now().isoformat()} + +## 核心内容摘要 +{summary_data.get('summary', '')} + +## 关键要点 +{self._format_points(summary_data.get('key_points', []))} + +## 相关概念/术语 +{self._format_terms(summary_data.get('terms', []))} + +## 与主题的关联分析 +{summary_data.get('relation_analysis', '')} + +## 可用于文章的部分 +{summary_data.get('usable_parts', '')} + +## 图片资源 +{self._format_images(summary_data.get('images', []))} +""" + + summary_file.write_text(summary_content, encoding='utf-8') + + # 更新索引 + if resource_id in self.index['resources']: + self.index['resources'][resource_id]['analyzed'] = True + self.index['resources'][resource_id]['summary_file'] = str(summary_file) + self.index['resources'][resource_id]['key_points'] = summary_data.get('key_points', []) + + # 更新摘要索引 + self.index['summaries'][resource_id] = { + "file": str(summary_file), + "key_points": summary_data.get('key_points', []), + "terms": summary_data.get('terms', []), + "usable": summary_data.get('usable_parts', ''), + } + + self._save_index() + return str(summary_file) + + def _format_points(self, points): + """格式化要点列表""" + if not points: + return "暂无" + return "\n".join([f"- {p}" for p in points]) + + def _format_terms(self, terms): + """格式化术语列表""" + if not terms: + return "暂无" + return "\n".join([f"- **{t['term']}**: {t['definition']}" for t in terms]) + + def 
_format_images(self, images): + """格式化图片列表""" + if not images: + return "暂无图片资源" + result = [] + for img in images: + result.append(f"- [{img.get('alt', '图片')}]({img.get('url', '')})") + return "\n".join(result) + + def get_summary(self, resource_id): + """获取分析摘要""" + if resource_id not in self.index['summaries']: + return None + + summary_file = Path(self.index['summaries'][resource_id]['file']) + if summary_file.exists(): + return summary_file.read_text(encoding='utf-8') + return None + + def get_all_summaries(self): + """获取所有摘要""" + summaries = [] + for rid, entry in self.index['summaries'].items(): + summary_file = Path(entry['file']) + if summary_file.exists(): + summaries.append({ + "id": rid, + "file": str(summary_file), + "content": summary_file.read_text(encoding='utf-8'), + "key_points": entry.get('key_points', []), + }) + return summaries + + # === 图片管理 === + + def add_image(self, image_data): + """添加图片""" + image_id = self._get_hash(image_data.get('url', '') + image_data.get('alt', '')) + + image_entry = { + "id": image_id, + "url": image_data.get('url', ''), + "alt": image_data.get('alt', ''), + "source": image_data.get('source', ''), + "context": image_data.get('context', ''), + "added_at": datetime.now().isoformat(), + } + + self.index['images'][image_id] = image_entry + self._save_index() + + return image_id + + # === 关键词索引 === + + def update_keyword_index(self, keywords, resource_id): + """更新关键词索引""" + for kw in keywords: + if kw not in self.index['keywords_index']: + self.index['keywords_index'][kw] = [] + if resource_id not in self.index['keywords_index'][kw]: + self.index['keywords_index'][kw].append(resource_id) + self._save_index() + + def search_by_keyword(self, keyword): + """按关键词搜索资料""" + return self.index['keywords_index'].get(keyword, []) + + # === 文章历史 === + + def add_article_history(self, article_data): + """添加文章历史""" + self.history['articles'].append(article_data) + + # 更新主题列表 + topic = article_data.get('topic') + if topic and topic 
not in self.history['topics']: + self.history['topics'].append(topic) + + self._save_history() + + def get_article_history(self, limit=10): + """获取文章历史""" + return self.history['articles'][-limit:] + + def get_past_topics(self): + """获取历史主题""" + return self.history['topics'] + + # === 清理 === + + def clear_old_resources(self, days=30): + """清理过期资料""" + # TODO: 实现清理逻辑 + pass + + def get_stats(self): + """获取资料池统计""" + return { + "resources_count": len(self.index['resources']), + "analyzed_count": len([r for r in self.index['resources'].values() if r['analyzed']]), + "summaries_count": len(self.index['summaries']), + "images_count": len(self.index['images']), + "keywords_count": len(self.index['keywords_index']), + "articles_count": len(self.history['articles']), + } \ No newline at end of file diff --git a/src/topic_selector.py b/src/topic_selector.py new file mode 100644 index 0000000..b05dfcf --- /dev/null +++ b/src/topic_selector.py @@ -0,0 +1,197 @@ +""" +主题选择模块 - 基于历史文章和搜索确定主题 +""" + +import json +from typing import Tuple, Optional +from .llm_client import LLMClient, LLMHelper +from .resource_pool import ResourcePool +from config.settings import ARTICLE_TYPES + + +class TopicSelector: + """文章主题选择器""" + + def __init__(self, llm_config: dict, pool: ResourcePool): + self.llm = LLMClient(llm_config) + self.helper = LLMHelper(self.llm) + self.pool = pool + + def select(self, interactive: bool = False) -> Tuple[str, str]: + """选择文章主题和类型 + + Returns: + (topic, article_type) + """ + print("正在分析历史文章和热门话题...") + + # 1. 获取历史主题 + past_topics = self.pool.get_past_topics() + + # 2. 获取文章历史作为参考 + article_history = self.pool.get_article_history(limit=20) + + # 3. 
生成候选主题 + candidates = self._generate_candidates(past_topics, article_history) + + if interactive: + # 交互式选择 + return self._interactive_select(candidates) + else: + # 自动选择最佳候选 + return self._auto_select(candidates) + + def _generate_candidates(self, past_topics: list, article_history: list) -> list: + """生成候选主题列表""" + + # 构建提示 + history_summary = "" + if article_history: + history_summary = "历史文章主题:\n" + "\n".join([ + f"- {a.get('topic', '')} ({a.get('type', '')})" + for a in article_history[-10:] + ]) + + prompt = f"""你是一个技术博客编辑,需要为下一篇文章选择主题。 + +{history_summary} + +请根据以下原则生成5个候选主题: +1. 避免与历史主题重复,但可以深入扩展相关领域 +2. 关注当前热门技术趋势(AI、大模型、分布式系统、云原生等) +3. 选择有实际应用价值的话题 +4. 主题应该有足够的资料支撑 + +请以JSON格式输出候选主题: +[ + { + "topic": "主题标题", + "type": "文章类型(技术解析/技术文档翻译/项目介绍分析/综述文章/实践教程/问题分析)", + "reason": "选择理由", + "keywords": ["关键词1", "关键词2"] + }, + ... +]""" + + result = self.llm.generate(prompt, temperature=0.8) + + if result: + try: + import re + json_match = re.search(r'\[.*\]', result, re.DOTALL) + if json_match: + candidates = json.loads(json_match.group()) + return candidates + except json.JSONDecodeError: + pass + + # 如果解析失败,返回默认候选 + return self._get_default_candidates() + + def _get_default_candidates(self) -> list: + """获取默认候选主题""" + return [ + { + "topic": "大模型推理优化技术综述", + "type": "综述文章", + "reason": "当前热门,有实用价值", + "keywords": ["推理优化", "量化", "KV Cache"] + }, + { + "topic": "Flash Attention原理与实现解析", + "type": "技术解析", + "reason": "核心技术,深度解析", + "keywords": ["Flash Attention", "注意力机制", "GPU优化"] + }, + { + "topic": "Ollama本地大模型部署指南", + "type": "实践教程", + "reason": "实用教程,入门友好", + "keywords": ["Ollama", "本地部署", "LLM"] + }, + { + "topic": "RAG技术原理与应用实践", + "type": "技术解析", + "reason": "热门应用方向", + "keywords": ["RAG", "检索增强", "知识库"] + }, + { + "topic": "混合专家模型(MoE)架构分析", + "type": "技术解析", + "reason": "前沿架构,深度分析", + "keywords": ["MoE", "混合专家", "模型架构"] + }, + ] + + def _interactive_select(self, candidates: list) -> Tuple[str, str]: + """交互式选择主题""" + print("\n候选主题列表:") + print("-" * 50) + 
+ for i, c in enumerate(candidates, 1): + print(f"\n{i}. 【{c['type']}】 {c['topic']}") + print(f" 理由: {c['reason']}") + print(f" 关键词: {', '.join(c['keywords'])}") + + print("\n" + "-" * 50) + + while True: + try: + choice = input("\n请选择主题编号 (1-5),或输入自定义主题: ").strip() + + if choice.isdigit() and 1 <= int(choice) <= len(candidates): + selected = candidates[int(choice) - 1] + return selected['topic'], selected['type'] + + # 自定义主题 + if len(choice) > 5: + # 让用户指定类型 + print("\n文章类型选项:") + for i, t in enumerate(ARTICLE_TYPES.keys(), 1): + print(f"{i}. {t}") + + type_choice = input("请选择类型编号: ").strip() + if type_choice.isdigit() and 1 <= int(type_choice) <= len(ARTICLE_TYPES): + article_type = list(ARTICLE_TYPES.keys())[int(type_choice) - 1] + return choice, article_type + + except KeyboardInterrupt: + print("\n已取消") + return candidates[0]['topic'], candidates[0]['type'] + + def _auto_select(self, candidates: list) -> Tuple[str, str]: + """自动选择最佳主题""" + # 选择第一个候选 + selected = candidates[0] + print(f"\n自动选择主题: {selected['topic']}") + print(f"文章类型: {selected['type']}") + print(f"选择理由: {selected['reason']}") + return selected['topic'], selected['type'] + + def suggest_from_keywords(self, keywords: list) -> Tuple[str, str]: + """从关键词推荐主题""" + prompt = f"""请根据以下关键词推荐一个文章主题和类型: + +关键词: {keywords} + +请输出JSON格式: +{ + "topic": "推荐主题", + "type": "文章类型", + "reason": "推荐理由" +}""" + + result = self.llm.generate(prompt, temperature=0.7) + + if result: + try: + import re + json_match = re.search(r'\{.*\}', result, re.DOTALL) + if json_match: + data = json.loads(json_match.group()) + return data['topic'], data['type'] + except: + pass + + # 默认 + return f"{keywords[0]}技术解析", "技术解析" \ No newline at end of file