#!/usr/bin/env python3
"""
知识库集成示例脚本

演示如何将转换后的JSON文档集成到RAG服务中
"""

import json
import asyncio
import os
import sys
from typing import List, Dict, Any

# Add the project root (the parent of this script's directory) to the Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def load_converted_knowledge(json_file: str) -> List[Dict[str, Any]]:
    """
    Load the converted knowledge-base documents from a JSON file.

    Loading is best-effort: on failure a message is printed and an empty
    list is returned, so callers can simply test the result for truthiness.

    Args:
        json_file: Path to the JSON file (read as UTF-8).

    Returns:
        The list of documents stored in the file, or an empty list when the
        file cannot be read, is not valid JSON, or its top-level value is
        not a list.
    """
    # Keep the try body limited to the operations that are expected to fail.
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            documents = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"加载JSON文件失败: {e}")
        return []

    # Callers iterate the result as a list of document dicts; reject any
    # other top-level JSON value instead of handing it on.
    if not isinstance(documents, list):
        print(
            "加载JSON文件失败: 顶层结构应为列表，"
            f"实际为 {type(documents).__name__}"
        )
        return []

    print(f"从 {json_file} 加载了 {len(documents)} 个文档")
    return documents


def analyze_documents(documents: List[Dict[str, Any]]):
    """
    Print summary statistics for a list of documents.

    Reports the total number of documents, the number of documents per
    ``type`` (sorted by type), the distribution of heading ``level`` values
    (sorted by level) and the five most frequent ``section`` values.

    Args:
        documents: Documents to analyze. Each one must provide
            ``metadata['type']``, ``metadata['section']`` and
            ``metadata['level']``.
    """
    print("\n=== 文档分析结果 ===")
    print(f"总文档数: {len(documents)}")

    # One tally per metadata field; the fields are read in this order for
    # every document.
    tallies = {'type': {}, 'section': {}, 'level': {}}
    for entry in documents:
        meta = entry['metadata']
        observed = [(field, meta[field]) for field in tallies]
        for field, value in observed:
            bucket = tallies[field]
            bucket[value] = bucket.get(value, 0) + 1

    print("\n各类型文档数量:")
    for kind, total in sorted(tallies['type'].items()):
        print(f"  {kind}: {total} 个")

    # Named labels exist for the first three heading levels only; any other
    # level falls back to a generic label built from its value.
    named_levels = {1: "一级标题", 2: "二级标题", 3: "三级标题"}
    print("\n标题层级分布:")
    for depth, total in sorted(tallies['level'].items()):
        if depth in named_levels:
            label = named_levels[depth]
        else:
            label = f"{depth}级标题"
        print(f"  {label}: {total} 个")

    print("\n前5个最常见的章节:")
    by_frequency = sorted(
        tallies['section'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for heading, total in by_frequency[:5]:
        print(f"  {heading}: {total} 次")


def group_documents_by_collection(documents: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Group documents by the collection that fits their type.

    Args:
        documents: Documents to group. Each one must provide
            ``metadata['type']``.

    Returns:
        A dict mapping collection name to the documents routed to it, in
        input order. Documents whose type is not in the routing table go to
        ``'system_guide'``.
    """
    # Document types routed to each collection.
    guide_types = (
        'login_guide',
        'operation_guide',
        'basic_settings',
        'system_management',
        'personal_center',
    )
    production_types = (
        'purchase_management',
        'inbound_management',
        'outbound_management',
        'inventory_management',
        'process_management',
        'production_management',
        'assembly_management',
    )
    routing = {
        **dict.fromkeys(guide_types, 'system_guide'),
        **dict.fromkeys(production_types, 'production_data'),
    }

    grouped = {}
    for entry in documents:
        target = routing.get(entry['metadata']['type'], 'system_guide')
        grouped.setdefault(target, []).append(entry)
    return grouped


def show_sample_documents(documents: List[Dict[str, Any]], count: int = 3):
    """
    Print a short preview of the first few documents.

    For each previewed document this prints its section, type, source and
    level metadata, the content length, and the first 80 characters of the
    content followed by ``...``.

    Args:
        documents: Documents to preview. Each one must provide ``content``
            and ``metadata`` with ``section``, ``type``, ``source`` and
            ``level``.
        count: Number of leading documents to show.
    """
    # (label, metadata key) pairs, in the order they are printed.
    metadata_rows = (
        ("章节", 'section'),
        ("类型", 'type'),
        ("来源", 'source'),
        ("层级", 'level'),
    )

    print(f"\n=== 前{count}个文档示例 ===")

    for position, sample in enumerate(documents[:count], start=1):
        print(f"\n文档 {position}:")
        meta = sample['metadata']
        for label, key in metadata_rows:
            print(f"  {label}: {meta[key]}")
        text = sample['content']
        print(f"  内容长度: {len(text)} 字符")
        print(f"  内容预览: {text[:80]}...")


async def integrate_to_rag_service_example():
    """
    Print sample code showing how to integrate the documents into a RAG service.

    Nothing is executed against a real service: the sample is only written
    to stdout as text.
    """
    print("\n=== RAG服务集成示例 ===")
    print("以下是集成到RAG服务的示例代码：")

    # The sample is kept as one entry per printed line; the empty first and
    # last entries produce the leading and trailing newline of the text.
    sample_lines = (
        "",
        "# 导入RAG服务",
        "from src.services.rag_service import RAGService",
        "",
        "# 初始化RAG服务",
        'rag_service = RAGService(openai_api_key="your-api-key")',
        "",
        "# 加载转换后的文档",
        "with open('knowledge_base/mom_knowledge.json', 'r', encoding='utf-8') as f:",
        "    documents = json.load(f)",
        "",
        "# 按集合分组",
        "collections = group_documents_by_collection(documents)",
        "",
        "# 添加到各个集合",
        "for collection_name, docs in collections.items():",
        '    print(f"正在添加 {len(docs)} 个文档到 {collection_name} 集合...")',
        "    await rag_service.add_documents(docs, collection_name)",
        "",
        'print("知识库集成完成！")',
        "",
        "# 测试查询",
        'query_results = await rag_service.query("如何登录系统", "system_guide", top_k=3)',
        "for result in query_results:",
        '    print(f"相关性: {result.relevance_score:.3f}")',
        '    print(f"内容: {result.content[:100]}...")',
        "",
    )

    print("\n".join(sample_lines))


def main():
    """
    Run the integration demo end to end.

    Loads the converted documents from ``knowledge_base/mom_knowledge.json``,
    prints statistics and sample documents, shows how the documents are
    grouped into collections, prints the RAG integration sample and closes
    with usage tips. Returns early with a message when the JSON file does
    not exist or yields no documents.
    """
    print("=== Markdown转知识库格式转换器 - 集成示例 ===")

    # Location of the converted knowledge base, relative to the working directory.
    source_path = "knowledge_base/mom_knowledge.json"

    # Bail out early when the converter has not produced its output yet.
    if not os.path.exists(source_path):
        print(f"错误: JSON文件 {source_path} 不存在")
        print("请先运行 md_to_knowledge_converter.py 生成JSON文件")
        return

    loaded = load_converted_knowledge(source_path)
    if not loaded:
        print("没有找到要分析的文档")
        return

    # Statistics first, then a few sample documents.
    analyze_documents(loaded)
    show_sample_documents(loaded)

    # Group before printing the heading so the heading only appears once
    # grouping has succeeded.
    grouped = group_documents_by_collection(loaded)

    print("\n=== 集合分组结果 ===")
    for name, members in grouped.items():
        print(f"  {name}: {len(members)} 个文档")

    # Show the integration sample.
    asyncio.run(integrate_to_rag_service_example())

    print("\n=== 使用建议 ===")
    tips = (
        "1. 文档已按类型自动分类，可直接用于RAG知识库",
        "2. 建议将不同类型的文档添加到对应的集合中",
        "3. 可以根据查询类型选择合适的集合进行检索",
        "4. 文档内容已清理，适合embedding和语义检索",
    )
    for tip in tips:
        print(tip)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()