#!/usr/bin/env python3
"""
Markdown文档转知识库格式转换器

将md格式的文档转换为RAG知识库所需的JSON格式
包含content、metadata、source、type等字段
"""

import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple


class MarkdownToKnowledgeConverter:
    """Convert Markdown documents into JSON entries for a RAG knowledge base.

    A document is split at level 1-3 headings. Every section that still has
    text after filtering becomes one entry shaped like
    ``{"content": ..., "metadata": {"source", "type", "section", "level",
    "file_path"}}``.
    """

    # Level 1-3 headings open a new section; deeper headings stay in the body.
    # Patterns are applied to single lines, so whitespace can never run on
    # into the following line.
    _HEADING_RE = re.compile(r'^(#{1,3})\s+(.+?)$')
    # The first level-1 heading supplies the document title.
    _TITLE_RE = re.compile(r'^#\s+(.+?)$')
    # Lines starting with one of these open/close a fenced code block.
    _FENCE_MARKERS = ('```', '~~~')
    # Section lines starting with these are dropped (image references and
    # lines beginning with the character "图").
    _SKIPPED_PREFIXES = ('![', '图')
    _IMAGE_RE = re.compile(r'!\[[^\]]*\]\([^)]+\)')
    _LINK_RE = re.compile(r'\[([^\]]*)\]\([^)]+\)')
    _BRACES_RE = re.compile(r'[{}]+')
    _WHITESPACE_RE = re.compile(r'\s+')
    # Ordered (keyword, type) pairs; the first keyword found in a title wins.
    _TYPE_KEYWORDS = (
        ("登录", "login_guide"),
        ("采购", "purchase_management"),
        ("入库", "inbound_management"),
        ("出库", "outbound_management"),
        ("库存", "inventory_management"),
        ("工艺", "process_management"),
        ("生产", "production_management"),
        ("基础设置", "basic_settings"),
        ("系统管理", "system_management"),
        ("个人中心", "personal_center"),
        ("装配", "assembly_management"),
    )

    def __init__(self):
        # Not read or written by any method of this class; kept so existing
        # callers that access the attribute keep working.
        self.documents = []

    @classmethod
    def _scan_lines(cls, content: str) -> Iterator[Tuple[str, bool]]:
        """Yield ``(line, fenced)`` for every line of ``content``.

        ``fenced`` is True for the delimiter lines of a fenced code block and
        for everything between them. Fence detection is deliberately simple:
        a line whose stripped text starts with the same three-character
        marker that opened the block closes it. An unclosed fence runs to the
        end of the text.
        """
        open_fence = None
        for line in content.split('\n'):
            marker = line.strip()[:3]
            if open_fence is not None:
                if marker == open_fence:
                    open_fence = None
                yield line, True
            elif marker in cls._FENCE_MARKERS:
                open_fence = marker
                yield line, True
            else:
                yield line, False

    def parse_markdown(self, file_path: str) -> List[Dict[str, Any]]:
        """Parse a Markdown file into knowledge-base documents.

        Args:
            file_path: Path of the Markdown file (read as UTF-8).

        Returns:
            One document dict per non-empty section, in file order.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # The document title is the first level-1 heading outside any code
        # fence; "# ..." comment lines inside code samples are not titles.
        document_title = "未知文档"
        for line, fenced in self._scan_lines(content):
            title_match = None if fenced else self._TITLE_RE.match(line)
            if title_match:
                document_title = title_match.group(1).strip()
                break

        return [
            self._create_document(section, document_title, file_path)
            for section in self._split_by_sections(content)
            if section['content'].strip()
        ]

    def _split_by_sections(self, content: str) -> List[Dict[str, Any]]:
        """Split Markdown text into sections at level 1-3 headings.

        Heading-like lines inside fenced code blocks are treated as ordinary
        content. Text before the first heading is discarded. Within a section
        every line is stripped; blank lines and lines starting with ``![`` or
        ``图`` are dropped. Sections left without any line are omitted.

        Returns:
            Dicts with ``title`` (str), ``content`` (str) and ``level`` (int).
        """
        # Pass 1: group stripped lines under the heading that precedes them.
        raw_sections = []
        for line, fenced in self._scan_lines(content):
            heading = None if fenced else self._HEADING_RE.match(line)
            if heading:
                raw_sections.append(
                    (heading.group(2).strip(), len(heading.group(1)), [])
                )
            elif raw_sections:
                raw_sections[-1][2].append(line.strip())

        # Pass 2: filter each section's lines and keep the non-empty sections.
        sections = []
        for title, level, lines in raw_sections:
            kept = [
                text for text in lines
                if text and not text.startswith(self._SKIPPED_PREFIXES)
            ]
            if kept:
                sections.append({
                    'title': title,
                    'content': '\n'.join(kept),
                    'level': level,
                })
        return sections

    def _create_document(self, section: Dict[str, Any], document_title: str, file_path: str) -> Dict[str, Any]:
        """Build one knowledge-base entry from a parsed section.

        Args:
            section: Dict with ``title``, ``content`` and ``level`` keys.
            document_title: Stored as ``metadata["source"]``.
            file_path: Stored as ``metadata["file_path"]``.

        Returns:
            ``{"content": ..., "metadata": {...}}`` where the content is the
            cleaned section text prefixed with a title line.
        """
        title = section['title']
        content = self._clean_content(f"标题: {title}\n\n{section['content']}")

        return {
            "content": content,
            "metadata": {
                "source": document_title,
                "type": self._determine_type(title),
                "section": title,
                "level": section['level'],
                "file_path": file_path
            }
        }

    def _determine_type(self, title: str) -> str:
        """Return the document type for a section title.

        The lower-cased title is checked against the keyword table in order;
        ``"operation_guide"`` is returned when no keyword occurs in it.
        """
        title_lower = title.lower()
        for keyword, doc_type in self._TYPE_KEYWORDS:
            if keyword in title_lower:
                return doc_type
        return "operation_guide"

    def _clean_content(self, content: str) -> str:
        """Strip Markdown markup and redundant whitespace from ``content``.

        Per line: image references are removed, links are replaced by their
        text, curly braces are removed and whitespace runs collapse to one
        space. Lines that end up empty are dropped.
        """
        cleaned_lines = []
        for line in content.split('\n'):
            # Images must go before links: the link pattern also matches the
            # "[alt](url)" tail of an image and would leave "!alt" behind.
            line = self._IMAGE_RE.sub('', line)
            line = self._LINK_RE.sub(r'\1', line)
            line = self._BRACES_RE.sub('', line)
            line = self._WHITESPACE_RE.sub(' ', line).strip()
            # A line made up only of removed markup must not leave a blank row.
            if line:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def convert_file(self, input_file: str, output_file: Optional[str] = None) -> List[Dict[str, Any]]:
        """Convert a single Markdown file.

        Args:
            input_file: Path of the Markdown file to convert.
            output_file: Optional path of the JSON file to write. Missing
                parent directories are created.

        Returns:
            The converted documents.

        Raises:
            OSError: If the input cannot be read or the output not written.
        """
        print(f"正在转换文件: {input_file}")

        documents = self.parse_markdown(input_file)

        if output_file:
            # Create the target directory so a direct call does not fail
            # just because the output folder is missing.
            Path(output_file).parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(documents, f, ensure_ascii=False, indent=2)
            print(f"转换完成，已保存到: {output_file}")
            print(f"共生成 {len(documents)} 个知识库条目")

        return documents

    def convert_directory(self, input_dir: str, output_dir: str):
        """Convert every ``*.md`` file below ``input_dir``.

        The directory layout is mirrored under ``output_dir`` with each file's
        suffix replaced by ``.json``.

        Args:
            input_dir: Directory searched recursively for Markdown files.
            output_dir: Directory receiving the JSON files (created if needed).
        """
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Sorted so the processing order (and the log output) is reproducible.
        for md_file in sorted(input_path.glob("**/*.md")):
            relative_path = md_file.relative_to(input_path)
            output_file = output_path / relative_path.with_suffix('.json')
            output_file.parent.mkdir(parents=True, exist_ok=True)

            self.convert_file(str(md_file), str(output_file))


def main():
    """Example entry point: convert one fixed Markdown file and print a summary.

    Reads ``assets/docs/mom.md`` and writes ``knowledge_base/mom_knowledge.json``.
    If the input file is missing, an error is printed and nothing else is
    done. Any exception raised during conversion or reporting is printed
    together with its traceback instead of propagating.
    """
    converter = MarkdownToKnowledgeConverter()

    # Fixed paths of the example conversion
    input_file = "assets/docs/mom.md"
    output_file = "knowledge_base/mom_knowledge.json"

    # Nothing to do without the input file
    if not os.path.exists(input_file):
        print(f"错误：输入文件 {input_file} 不存在")
        print("请确保文件路径正确")
        return

    try:
        # The output directory has to exist before the JSON file is written
        target_dir = os.path.dirname(output_file)
        os.makedirs(target_dir, exist_ok=True)

        documents = converter.convert_file(input_file, output_file)

        # Overall statistics
        print("\n转换统计:")
        print(f"总文档数: {len(documents)}")

        # Tally entries per document type, keeping first-seen order
        type_count = {}
        for entry in documents:
            kind = entry['metadata']['type']
            if kind in type_count:
                type_count[kind] += 1
            else:
                type_count[kind] = 1

        print("各类型文档数量:")
        for kind in type_count:
            print(f"  {kind}: {type_count[kind]}")

        # Preview of the first three entries
        print("\n前3个文档示例:")
        for number, entry in enumerate(documents[:3], start=1):
            print(f"\n文档 {number}:")
            meta = entry['metadata']
            print(f"  标题: {meta['section']}")
            print(f"  类型: {meta['type']}")
            text = entry['content']
            print(f"  内容长度: {len(text)} 字符")
            print(f"  内容预览: {text[:100]}...")

    except Exception as e:
        print(f"转换过程中发生错误: {e}")
        import traceback
        traceback.print_exc()


# Run the example conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()