# md_to_knowledge_converter.py
#!/usr/bin/env python3
"""
Markdown文档转知识库格式转换器
将md格式的文档转换为RAG知识库所需的JSON格式
包含content、metadata、source、type等字段
"""
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
  12. class MarkdownToKnowledgeConverter:
  13. """Markdown转知识库格式转换器"""
  14. def __init__(self):
  15. self.documents = []
  16. def parse_markdown(self, file_path: str) -> List[Dict[str, Any]]:
  17. """
  18. 解析markdown文件并转换为知识库格式
  19. Args:
  20. file_path: markdown文件路径
  21. Returns:
  22. 转换后的文档列表
  23. """
  24. with open(file_path, 'r', encoding='utf-8') as f:
  25. content = f.read()
  26. # 提取文档标题
  27. title_match = re.search(r'^#\s+(.+?)$', content, re.MULTILINE)
  28. document_title = title_match.group(1) if title_match else "未知文档"
  29. # 按章节分割文档
  30. sections = self._split_by_sections(content)
  31. documents = []
  32. for section in sections:
  33. if section['content'].strip():
  34. doc = self._create_document(section, document_title, file_path)
  35. documents.append(doc)
  36. return documents
  37. def _split_by_sections(self, content: str) -> List[Dict[str, str]]:
  38. """按章节分割文档内容"""
  39. sections = []
  40. # 使用正则表达式分割章节
  41. # 匹配 # 标题 (一级标题)
  42. pattern = r'^(#{1,3})\s+(.+?)$'
  43. matches = list(re.finditer(pattern, content, re.MULTILINE))
  44. for i, match in enumerate(matches):
  45. start_pos = match.start()
  46. end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)
  47. section_content = content[start_pos:end_pos].strip()
  48. level = len(match.group(1)) # 标题级别
  49. title = match.group(2).strip()
  50. # 提取章节内容(去除标题行)
  51. lines = section_content.split('\n')
  52. content_lines = lines[1:] # 跳过标题行
  53. # 过滤掉图片引用和空行
  54. filtered_lines = []
  55. for line in content_lines:
  56. line = line.strip()
  57. if line and not line.startswith('![') and not line.startswith('图'):
  58. filtered_lines.append(line)
  59. if filtered_lines:
  60. sections.append({
  61. 'title': title,
  62. 'content': '\n'.join(filtered_lines),
  63. 'level': level
  64. })
  65. return sections
  66. def _create_document(self, section: Dict[str, str], document_title: str, file_path: str) -> Dict[str, Any]:
  67. """创建知识库文档格式"""
  68. # 确定文档类型
  69. doc_type = self._determine_type(section['title'])
  70. # 生成内容
  71. content = f"标题: {section['title']}\n\n{section['content']}"
  72. # 清理内容(去除多余的空行和特殊字符)
  73. content = self._clean_content(content)
  74. return {
  75. "content": content,
  76. "metadata": {
  77. "source": document_title,
  78. "type": doc_type,
  79. "section": section['title'],
  80. "level": section['level'],
  81. "file_path": file_path
  82. }
  83. }
  84. def _determine_type(self, title: str) -> str:
  85. """根据标题确定文档类型"""
  86. title_lower = title.lower()
  87. # 定义类型映射
  88. type_mapping = {
  89. "登录": "login_guide",
  90. "采购": "purchase_management",
  91. "入库": "inbound_management",
  92. "出库": "outbound_management",
  93. "库存": "inventory_management",
  94. "工艺": "process_management",
  95. "生产": "production_management",
  96. "基础设置": "basic_settings",
  97. "系统管理": "system_management",
  98. "个人中心": "personal_center",
  99. "装配": "assembly_management"
  100. }
  101. for keyword, doc_type in type_mapping.items():
  102. if keyword in title_lower:
  103. return doc_type
  104. # 默认类型
  105. return "operation_guide"
  106. def _clean_content(self, content: str) -> str:
  107. """清理文档内容"""
  108. # 去除多余的空行
  109. lines = content.split('\n')
  110. cleaned_lines = []
  111. for line in lines:
  112. line = line.strip()
  113. if line:
  114. # 去除markdown链接格式但保留文本
  115. line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
  116. # 去除markdown图片引用
  117. line = re.sub(r'!\[.*?\]\([^\)]+\)', '', line)
  118. # 去除特殊格式字符
  119. line = re.sub(r'[{}]+', '', line)
  120. # 去除过多的空格
  121. line = re.sub(r'\s+', ' ', line)
  122. cleaned_lines.append(line)
  123. # 重新组合,去除连续的空行
  124. result = '\n'.join(cleaned_lines)
  125. result = re.sub(r'\n\s*\n', '\n\n', result)
  126. return result.strip()
  127. def convert_file(self, input_file: str, output_file: str = None) -> List[Dict[str, Any]]:
  128. """
  129. 转换单个markdown文件
  130. Args:
  131. input_file: 输入的markdown文件路径
  132. output_file: 输出的JSON文件路径(可选)
  133. Returns:
  134. 转换后的文档列表
  135. """
  136. print(f"正在转换文件: {input_file}")
  137. documents = self.parse_markdown(input_file)
  138. if output_file:
  139. with open(output_file, 'w', encoding='utf-8') as f:
  140. json.dump(documents, f, ensure_ascii=False, indent=2)
  141. print(f"转换完成,已保存到: {output_file}")
  142. print(f"共生成 {len(documents)} 个知识库条目")
  143. return documents
  144. def convert_directory(self, input_dir: str, output_dir: str):
  145. """
  146. 转换目录下的所有markdown文件
  147. Args:
  148. input_dir: 输入目录路径
  149. output_dir: 输出目录路径
  150. """
  151. input_path = Path(input_dir)
  152. output_path = Path(output_dir)
  153. output_path.mkdir(parents=True, exist_ok=True)
  154. for md_file in input_path.glob("**/*.md"):
  155. relative_path = md_file.relative_to(input_path)
  156. output_file = output_path / relative_path.with_suffix('.json')
  157. output_file.parent.mkdir(parents=True, exist_ok=True)
  158. self.convert_file(str(md_file), str(output_file))
  159. def main():
  160. """主函数 - 示例用法"""
  161. converter = MarkdownToKnowledgeConverter()
  162. # 转换mom.md文件
  163. input_file = "assets/docs/mom.md"
  164. output_file = "knowledge_base/mom_knowledge.json"
  165. # 检查输入文件是否存在
  166. if not os.path.exists(input_file):
  167. print(f"错误:输入文件 {input_file} 不存在")
  168. print("请确保文件路径正确")
  169. return
  170. try:
  171. # 确保输出目录存在
  172. os.makedirs(os.path.dirname(output_file), exist_ok=True)
  173. # 执行转换
  174. documents = converter.convert_file(input_file, output_file)
  175. # 打印一些统计信息
  176. print(f"\n转换统计:")
  177. print(f"总文档数: {len(documents)}")
  178. # 按类型统计
  179. type_count = {}
  180. for doc in documents:
  181. doc_type = doc['metadata']['type']
  182. type_count[doc_type] = type_count.get(doc_type, 0) + 1
  183. print("各类型文档数量:")
  184. for doc_type, count in type_count.items():
  185. print(f" {doc_type}: {count}")
  186. # 显示前几个文档的示例
  187. print(f"\n前3个文档示例:")
  188. for i, doc in enumerate(documents[:3]):
  189. print(f"\n文档 {i+1}:")
  190. print(f" 标题: {doc['metadata']['section']}")
  191. print(f" 类型: {doc['metadata']['type']}")
  192. print(f" 内容长度: {len(doc['content'])} 字符")
  193. print(f" 内容预览: {doc['content'][:100]}...")
  194. except Exception as e:
  195. print(f"转换过程中发生错误: {e}")
  196. import traceback
  197. traceback.print_exc()
  198. if __name__ == "__main__":
  199. main()