123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- #!/usr/bin/env python3
- """
- 知识库集成示例脚本
- 演示如何将转换后的JSON文档集成到RAG服务中
- """
- import json
- import asyncio
- import os
- import sys
- from typing import List, Dict, Any
# Append the parent of this script's directory to sys.path (the project root,
# according to the original comment) so project packages can be imported when
# the script is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def load_converted_knowledge(json_file: str) -> List[Dict[str, Any]]:
    """
    Load the converted knowledge-base documents from a JSON file.

    Failures are reported on stdout instead of being raised, so callers can
    treat an empty result as "nothing to process".

    Args:
        json_file: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The list of documents parsed from the file, or an empty list when the
        file cannot be read, is not valid JSON, or does not contain a JSON
        array at the top level.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            documents = json.load(f)
    # OSError covers a missing or unreadable file; ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError. Programming errors are
    # deliberately no longer swallowed.
    except (OSError, ValueError) as e:
        print(f"加载JSON文件失败: {e}")
        return []

    # Callers slice and iterate the result, so anything other than a list
    # (e.g. a top-level JSON object) would only fail later and less clearly.
    if not isinstance(documents, list):
        print(f"加载JSON文件失败: 顶层JSON应为数组, 实际为 {type(documents).__name__}")
        return []

    print(f"从 {json_file} 加载了 {len(documents)} 个文档")
    return documents
def analyze_documents(documents: List[Dict[str, Any]]):
    """
    Print summary statistics for a list of knowledge-base documents.

    The report contains the total number of documents, the number of
    documents per type (sorted by type name), the distribution of heading
    levels (sorted by level) and the five most frequent sections.

    Args:
        documents: Documents to analyze. Each one must carry a 'metadata'
            mapping with 'type', 'section' and 'level' keys; a missing key
            raises KeyError.
    """
    # Imported locally so the block does not depend on a file-level import.
    from collections import Counter

    print("\n=== 文档分析结果 ===")
    print(f"总文档数: {len(documents)}")

    # One pass over the documents, tallying the three metadata fields.
    type_count = Counter()
    section_count = Counter()
    level_count = Counter()

    for doc in documents:
        metadata = doc['metadata']
        doc_type = metadata['type']
        section = metadata['section']
        level = metadata['level']

        type_count[doc_type] += 1
        section_count[section] += 1
        level_count[level] += 1

    print("\n各类型文档数量:")
    for doc_type, count in sorted(type_count.items()):
        print(f" {doc_type}: {count} 个")

    # Display names for the first three heading levels; any other level
    # falls back to a generated "<level>级标题" label.
    level_names = {1: "一级标题", 2: "二级标题", 3: "三级标题"}

    print("\n标题层级分布:")
    for level, count in sorted(level_count.items()):
        level_name = level_names.get(level, f"{level}级标题")
        print(f" {level_name}: {count} 个")

    print("\n前5个最常见的章节:")
    # most_common keeps first-seen order among equal counts, which matches a
    # stable descending sort of the insertion-ordered tallies.
    for section, count in section_count.most_common(5):
        print(f" {section}: {count} 次")
def group_documents_by_collection(documents: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Split documents into the collections they should be stored in.

    The collection is chosen from each document's metadata 'type'. Guide-like
    types go to 'system_guide', business/production types go to
    'production_data', and any type not listed here falls back to
    'system_guide'.

    Args:
        documents: Documents to group; each must have doc['metadata']['type'].

    Returns:
        A dict mapping collection name to its documents. Collections appear in
        the order they are first encountered and documents keep their input
        order within each collection.
    """
    guide_types = (
        'login_guide',
        'operation_guide',
        'basic_settings',
        'system_management',
        'personal_center',
    )
    production_types = (
        'purchase_management',
        'inbound_management',
        'outbound_management',
        'inventory_management',
        'process_management',
        'production_management',
        'assembly_management',
    )

    # Document type -> target collection lookup.
    collection_for_type = {kind: 'system_guide' for kind in guide_types}
    collection_for_type.update((kind, 'production_data') for kind in production_types)

    grouped = {}
    for item in documents:
        target = collection_for_type.get(item['metadata']['type'], 'system_guide')
        grouped.setdefault(target, []).append(item)

    return grouped
def show_sample_documents(documents: List[Dict[str, Any]], count: int = 3):
    """
    Print a short preview of the leading documents.

    For each of the first `count` documents this prints the section, type,
    source and level taken from its metadata, followed by the content length
    and the first 80 characters of the content.

    Args:
        documents: Documents to preview; each needs a 'content' string and a
            'metadata' mapping with 'section', 'type', 'source' and 'level'.
        count: How many documents to show (the slice documents[:count]).
    """
    print(f"\n=== 前{count}个文档示例 ===")

    # (label, metadata key) pairs, in the order they are printed.
    metadata_rows = (
        ("章节", 'section'),
        ("类型", 'type'),
        ("来源", 'source'),
        ("层级", 'level'),
    )

    for position, sample in enumerate(documents[:count], start=1):
        print(f"\n文档 {position}:")
        meta = sample['metadata']
        for label, key in metadata_rows:
            print(f" {label}: {meta[key]}")
        text = sample['content']
        print(f" 内容长度: {len(text)} 字符")
        print(f" 内容预览: {text[:80]}...")
async def integrate_to_rag_service_example():
    """
    Print sample code showing how the documents could be added to a RAG service.

    Nothing is awaited and no RAG service is contacted here: the coroutine
    only prints a banner followed by the snippet stored in `example_code`.
    Per the original note, actually running that snippet requires a real RAG
    service.
    """
    print(f"\n=== RAG服务集成示例 ===")
    print("以下是集成到RAG服务的示例代码:")

    # Illustrative snippet only: it is printed verbatim and never executed.
    example_code = '''
# 导入RAG服务
from src.services.rag_service import RAGService
# 初始化RAG服务
rag_service = RAGService(openai_api_key="your-api-key")
# 加载转换后的文档
with open('knowledge_base/mom_knowledge.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)
# 按集合分组
collections = group_documents_by_collection(documents)
# 添加到各个集合
for collection_name, docs in collections.items():
    print(f"正在添加 {len(docs)} 个文档到 {collection_name} 集合...")
    await rag_service.add_documents(docs, collection_name)
print("知识库集成完成!")
# 测试查询
query_results = await rag_service.query("如何登录系统", "system_guide", top_k=3)
for result in query_results:
    print(f"相关性: {result.relevance_score:.3f}")
    print(f"内容: {result.content[:100]}...")
'''

    print(example_code)
def main():
    """
    Run the integration demo end to end.

    Loads the converted JSON documents, prints their statistics and a few
    samples, shows how they are grouped into collections, prints the RAG
    integration snippet and finishes with usage suggestions. Returns early
    with a message when the JSON file is missing or yields no documents.
    """
    print("=== Markdown转知识库格式转换器 - 集成示例 ===")

    # Converter output, resolved relative to the current working directory.
    json_file = "knowledge_base/mom_knowledge.json"

    # Bail out early when the converter has not produced its output yet.
    if not os.path.exists(json_file):
        print(f"错误: JSON文件 {json_file} 不存在")
        print("请先运行 md_to_knowledge_converter.py 生成JSON文件")
        return

    loaded = load_converted_knowledge(json_file)
    if not loaded:
        print("没有找到要分析的文档")
        return

    # Statistics first, then a preview of the leading documents.
    analyze_documents(loaded)
    show_sample_documents(loaded)

    # Report how many documents land in each collection.
    grouped = group_documents_by_collection(loaded)
    print("\n=== 集合分组结果 ===")
    for name, members in grouped.items():
        print(f" {name}: {len(members)} 个文档")

    # The example is a coroutine, so it is driven through asyncio.run.
    asyncio.run(integrate_to_rag_service_example())

    print("\n=== 使用建议 ===")
    suggestions = (
        "1. 文档已按类型自动分类,可直接用于RAG知识库",
        "2. 建议将不同类型的文档添加到对应的集合中",
        "3. 可以根据查询类型选择合适的集合进行检索",
        "4. 文档内容已清理,适合embedding和语义检索",
    )
    for suggestion in suggestions:
        print(suggestion)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|