#!/usr/bin/env python3
"""
Knowledge-base integration example script.

Demonstrates how the converted JSON documents can be integrated into a RAG
service: load the JSON file, print some statistics, group the documents by
target collection and print an example integration snippet.
"""

import json
import asyncio
import os
import sys
from collections import Counter
from typing import List, Dict, Any

# Add the parent of this script's directory (the project root) to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Collection that receives every document whose type is not listed below.
_DEFAULT_COLLECTION = 'system_guide'

# Mapping from document type to the collection it should be stored in.
_TYPE_TO_COLLECTION = {
    'login_guide': 'system_guide',
    'operation_guide': 'system_guide',
    'basic_settings': 'system_guide',
    'system_management': 'system_guide',
    'personal_center': 'system_guide',
    'purchase_management': 'production_data',
    'inbound_management': 'production_data',
    'outbound_management': 'production_data',
    'inventory_management': 'production_data',
    'process_management': 'production_data',
    'production_management': 'production_data',
    'assembly_management': 'production_data',
}

# Display names for the heading levels that have a dedicated label; any other
# level is rendered generically as "<level>级标题".
_LEVEL_NAMES = {1: "一级标题", 2: "二级标题", 3: "三级标题"}


def load_converted_knowledge(json_file: str) -> List[Dict[str, Any]]:
    """
    Load the converted knowledge-base documents from a JSON file.

    Args:
        json_file: Path of the JSON file.

    Returns:
        The list of documents. An empty list is returned (after printing the
        reason) when the file cannot be read, is not valid JSON, or its
        top-level value is not a list.
    """
    # Keep the try body limited to the operations that can actually fail:
    # OSError covers missing/unreadable files, ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError.
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            documents = json.load(f)
    except (OSError, ValueError) as e:
        print(f"加载JSON文件失败: {e}")
        return []

    # A dict or string at the top level would otherwise be passed on as
    # "documents" and fail later when callers index doc['metadata'].
    if not isinstance(documents, list):
        print(f"加载JSON文件失败: 顶层结构应为列表, 实际为 {type(documents).__name__}")
        return []

    print(f"从 {json_file} 加载了 {len(documents)} 个文档")
    return documents


def analyze_documents(documents: List[Dict[str, Any]]):
    """
    Print statistics about the documents.

    Counts documents per type, per heading level and per section, then prints
    the per-type counts, the level distribution and the five most frequent
    sections.

    Args:
        documents: List of documents; each one must provide
            ``metadata['type']``, ``metadata['section']`` and
            ``metadata['level']``.

    Raises:
        KeyError: If a document lacks ``metadata`` or one of the fields above.
    """
    print("\n=== 文档分析结果 ===")
    print(f"总文档数: {len(documents)}")

    # Tally by type, section and heading level
    type_count = Counter()
    section_count = Counter()
    level_count = Counter()

    for doc in documents:
        metadata = doc['metadata']
        # Read all three fields before counting so a malformed document
        # does not leave the tallies partially updated.
        doc_type = metadata['type']
        section = metadata['section']
        level = metadata['level']

        type_count[doc_type] += 1
        section_count[section] += 1
        level_count[level] += 1

    print("\n各类型文档数量:")
    for doc_type, count in sorted(type_count.items()):
        print(f" {doc_type}: {count} 个")

    print("\n标题层级分布:")
    for level, count in sorted(level_count.items()):
        level_name = _LEVEL_NAMES.get(level, f"{level}级标题")
        print(f" {level_name}: {count} 个")

    print("\n前5个最常见的章节:")
    # most_common keeps first-seen order among equal counts, matching a
    # stable descending sort on the count.
    for section, count in section_count.most_common(5):
        print(f" {section}: {count} 次")


def group_documents_by_collection(documents: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Group the documents by the collection they belong to.

    The collection is looked up from ``metadata['type']``; types that are not
    in the mapping fall back to the ``system_guide`` collection.

    Args:
        documents: List of documents; each must provide ``metadata['type']``.

    Returns:
        Dictionary mapping a collection name to its documents. Collections
        appear in the order their first document was encountered, and the
        documents inside each collection keep their input order.

    Raises:
        KeyError: If a document lacks ``metadata`` or ``metadata['type']``.
    """
    collections = {}

    for doc in documents:
        doc_type = doc['metadata']['type']
        collection_name = _TYPE_TO_COLLECTION.get(doc_type, _DEFAULT_COLLECTION)
        collections.setdefault(collection_name, []).append(doc)

    return collections


def show_sample_documents(documents: List[Dict[str, Any]], count: int = 3):
    """
    Print a short summary of the first documents.

    Args:
        documents: List of documents; each must provide ``content`` and the
            metadata fields ``section``, ``type``, ``source`` and ``level``.
        count: Number of documents to show.
    """
    print(f"\n=== 前{count}个文档示例 ===")

    for i, doc in enumerate(documents[:count]):
        metadata = doc['metadata']
        content = doc['content']
        print(f"\n文档 {i+1}:")
        print(f" 章节: {metadata['section']}")
        print(f" 类型: {metadata['type']}")
        print(f" 来源: {metadata['source']}")
        print(f" 层级: {metadata['level']}")
        print(f" 内容长度: {len(content)} 字符")
        # Only the first 80 characters are shown as a preview.
        print(f" 内容预览: {content[:80]}...")


async def integrate_to_rag_service_example():
    """
    Print example code for integrating the documents into the RAG service.

    Nothing is executed against a RAG service here; the snippet is only
    printed. The function stays a coroutine so existing callers that run it
    through ``asyncio.run`` keep working.
    """
    print("\n=== RAG服务集成示例 ===")
    print("以下是集成到RAG服务的示例代码:")

    # The snippet is plain text that is printed verbatim, so its comments are
    # part of the program output and are left untouched.
    example_code = '''
# 导入RAG服务
from src.services.rag_service import RAGService

# 初始化RAG服务
rag_service = RAGService(openai_api_key="your-api-key")

# 加载转换后的文档
with open('knowledge_base/mom_knowledge.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# 按集合分组
collections = group_documents_by_collection(documents)

# 添加到各个集合
for collection_name, docs in collections.items():
    print(f"正在添加 {len(docs)} 个文档到 {collection_name} 集合...")
    await rag_service.add_documents(docs, collection_name)

print("知识库集成完成!")

# 测试查询
query_results = await rag_service.query("如何登录系统", "system_guide", top_k=3)
for result in query_results:
    print(f"相关性: {result.relevance_score:.3f}")
    print(f"内容: {result.content[:100]}...")
'''

    print(example_code)


def main():
    """Entry point: load, analyze, sample and group the documents, then print usage hints."""
    print("=== Markdown转知识库格式转换器 - 集成示例 ===")

    # Location of the converted JSON file (relative to the working directory)
    json_file = "knowledge_base/mom_knowledge.json"

    # Bail out early with a hint when the file has not been generated yet
    if not os.path.exists(json_file):
        print(f"错误: JSON文件 {json_file} 不存在")
        print("请先运行 md_to_knowledge_converter.py 生成JSON文件")
        return

    # Load the converted documents
    documents = load_converted_knowledge(json_file)

    if not documents:
        print("没有找到要分析的文档")
        return

    # Print statistics
    analyze_documents(documents)

    # Show a few sample documents
    show_sample_documents(documents)

    # Group by collection
    collections = group_documents_by_collection(documents)

    print("\n=== 集合分组结果 ===")
    for collection_name, docs in collections.items():
        print(f" {collection_name}: {len(docs)} 个文档")

    # Print the integration example
    asyncio.run(integrate_to_rag_service_example())

    print("\n=== 使用建议 ===")
    print("1. 文档已按类型自动分类,可直接用于RAG知识库")
    print("2. 建议将不同类型的文档添加到对应的集合中")
    print("3. 可以根据查询类型选择合适的集合进行检索")
    print("4. 文档内容已清理,适合embedding和语义检索")


if __name__ == "__main__":
    main()