integrate_knowledge_example.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. #!/usr/bin/env python3
  2. """
  3. 知识库集成示例脚本
  4. 演示如何将转换后的JSON文档集成到RAG服务中
  5. """
  6. import json
  7. import asyncio
  8. import os
  9. import sys
  10. from typing import List, Dict, Any
  11. # 添加项目根目录到Python路径
  12. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  13. def load_converted_knowledge(json_file: str) -> List[Dict[str, Any]]:
  14. """
  15. 从JSON文件加载转换后的知识库文档
  16. Args:
  17. json_file: JSON文件路径
  18. Returns:
  19. 文档列表
  20. """
  21. try:
  22. with open(json_file, 'r', encoding='utf-8') as f:
  23. documents = json.load(f)
  24. print(f"从 {json_file} 加载了 {len(documents)} 个文档")
  25. return documents
  26. except Exception as e:
  27. print(f"加载JSON文件失败: {e}")
  28. return []
  29. def analyze_documents(documents: List[Dict[str, Any]]):
  30. """
  31. 分析文档统计信息
  32. Args:
  33. documents: 文档列表
  34. """
  35. print(f"\n=== 文档分析结果 ===")
  36. print(f"总文档数: {len(documents)}")
  37. # 按类型统计
  38. type_count = {}
  39. section_count = {}
  40. level_count = {}
  41. for doc in documents:
  42. metadata = doc['metadata']
  43. doc_type = metadata['type']
  44. section = metadata['section']
  45. level = metadata['level']
  46. type_count[doc_type] = type_count.get(doc_type, 0) + 1
  47. section_count[section] = section_count.get(section, 0) + 1
  48. level_count[level] = level_count.get(level, 0) + 1
  49. print(f"\n各类型文档数量:")
  50. for doc_type, count in sorted(type_count.items()):
  51. print(f" {doc_type}: {count} 个")
  52. print(f"\n标题层级分布:")
  53. for level, count in sorted(level_count.items()):
  54. level_name = {1: "一级标题", 2: "二级标题", 3: "三级标题"}.get(level, f"{level}级标题")
  55. print(f" {level_name}: {count} 个")
  56. print(f"\n前5个最常见的章节:")
  57. for section, count in sorted(section_count.items(), key=lambda x: x[1], reverse=True)[:5]:
  58. print(f" {section}: {count} 次")
  59. def group_documents_by_collection(documents: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
  60. """
  61. 按适合的集合类型对文档进行分组
  62. Args:
  63. documents: 文档列表
  64. Returns:
  65. 按集合分组的文档字典
  66. """
  67. # 类型到集合的映射
  68. type_to_collection = {
  69. 'login_guide': 'system_guide',
  70. 'operation_guide': 'system_guide',
  71. 'basic_settings': 'system_guide',
  72. 'system_management': 'system_guide',
  73. 'personal_center': 'system_guide',
  74. 'purchase_management': 'production_data',
  75. 'inbound_management': 'production_data',
  76. 'outbound_management': 'production_data',
  77. 'inventory_management': 'production_data',
  78. 'process_management': 'production_data',
  79. 'production_management': 'production_data',
  80. 'assembly_management': 'production_data',
  81. }
  82. collections = {}
  83. for doc in documents:
  84. doc_type = doc['metadata']['type']
  85. collection_name = type_to_collection.get(doc_type, 'system_guide')
  86. if collection_name not in collections:
  87. collections[collection_name] = []
  88. collections[collection_name].append(doc)
  89. return collections
  90. def show_sample_documents(documents: List[Dict[str, Any]], count: int = 3):
  91. """
  92. 显示示例文档
  93. Args:
  94. documents: 文档列表
  95. count: 显示数量
  96. """
  97. print(f"\n=== 前{count}个文档示例 ===")
  98. for i, doc in enumerate(documents[:count]):
  99. print(f"\n文档 {i+1}:")
  100. print(f" 章节: {doc['metadata']['section']}")
  101. print(f" 类型: {doc['metadata']['type']}")
  102. print(f" 来源: {doc['metadata']['source']}")
  103. print(f" 层级: {doc['metadata']['level']}")
  104. print(f" 内容长度: {len(doc['content'])} 字符")
  105. print(f" 内容预览: {doc['content'][:80]}...")
  106. async def integrate_to_rag_service_example():
  107. """
  108. 集成到RAG服务的示例代码(需要实际的RAG服务)
  109. """
  110. print(f"\n=== RAG服务集成示例 ===")
  111. print("以下是集成到RAG服务的示例代码:")
  112. example_code = '''
  113. # 导入RAG服务
  114. from src.services.rag_service import RAGService
  115. # 初始化RAG服务
  116. rag_service = RAGService(openai_api_key="your-api-key")
  117. # 加载转换后的文档
  118. with open('knowledge_base/mom_knowledge.json', 'r', encoding='utf-8') as f:
  119. documents = json.load(f)
  120. # 按集合分组
  121. collections = group_documents_by_collection(documents)
  122. # 添加到各个集合
  123. for collection_name, docs in collections.items():
  124. print(f"正在添加 {len(docs)} 个文档到 {collection_name} 集合...")
  125. await rag_service.add_documents(docs, collection_name)
  126. print("知识库集成完成!")
  127. # 测试查询
  128. query_results = await rag_service.query("如何登录系统", "system_guide", top_k=3)
  129. for result in query_results:
  130. print(f"相关性: {result.relevance_score:.3f}")
  131. print(f"内容: {result.content[:100]}...")
  132. '''
  133. print(example_code)
  134. def main():
  135. """主函数"""
  136. print("=== Markdown转知识库格式转换器 - 集成示例 ===")
  137. # 配置文件路径
  138. json_file = "knowledge_base/mom_knowledge.json"
  139. # 检查文件是否存在
  140. if not os.path.exists(json_file):
  141. print(f"错误: JSON文件 {json_file} 不存在")
  142. print("请先运行 md_to_knowledge_converter.py 生成JSON文件")
  143. return
  144. # 加载转换后的文档
  145. documents = load_converted_knowledge(json_file)
  146. if not documents:
  147. print("没有找到要分析的文档")
  148. return
  149. # 分析文档
  150. analyze_documents(documents)
  151. # 显示示例文档
  152. show_sample_documents(documents)
  153. # 按集合分组
  154. collections = group_documents_by_collection(documents)
  155. print(f"\n=== 集合分组结果 ===")
  156. for collection_name, docs in collections.items():
  157. print(f" {collection_name}: {len(docs)} 个文档")
  158. # 显示集成示例
  159. asyncio.run(integrate_to_rag_service_example())
  160. print(f"\n=== 使用建议 ===")
  161. print("1. 文档已按类型自动分类,可直接用于RAG知识库")
  162. print("2. 建议将不同类型的文档添加到对应的集合中")
  163. print("3. 可以根据查询类型选择合适的集合进行检索")
  164. print("4. 文档内容已清理,适合embedding和语义检索")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()