ccprocessor · drunkpig · Sep 10, 2025 · Sep 9, 2025 · Sep 10, 2025 · Sep 10, 2025
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
@@ -48,19 +48,38 @@ class StructureMapper(ABC):
     Args:
         object (_type_): _description_
     """
+
     def __init__(self):
         self.__txt_para_splitter = '\n'
         self.__md_para_splitter = '\n\n'
         self.__text_end = '\n'
         self.__list_item_start = '-'  # md里的列表项前缀
         self.__list_para_prefix = '  '  # 两个空格，md里的列表项非第一个段落的前缀：如果多个段落的情况，第二个以及之后的段落前缀
         self.__md_special_chars = ['#', '`', '$']  # TODO 拼装table的时候还应该转义掉|符号
-        self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, DocElementType.CODE, DocElementType.EQUATION_INTERLINE]
+        self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST,
+                                      DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE,
+                                      DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO,
+                                      DocElementType.CODE, DocElementType.EQUATION_INTERLINE]
         self.__inline_types_document_type = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE]
 
     def to_html(self):
         raise NotImplementedError('This method must be implemented by the subclass.')
 
+    def to_plain_md(self, exclude_nodes=DocElementType.EXCLUDE_PLAIN_MD_LIST,
+                    exclude_inline_types=DocElementType.EXCLUDE_PLAIN_MD_INLINE_LIST, use_raw_image_url=False):
+        """Convert content_list to Markdown, excluding the plain-md node and inline types by default.
+
+        Args:
+            exclude_nodes (list): Node types to exclude; validated, then passed to ``__to_md``.
+            exclude_inline_types: Inline types to exclude; validated, then passed to ``__to_md``.
+            use_raw_image_url: Whether to use the original image URL (forwarded to ``__to_md``).
+        Returns:
+            str: The text content in Markdown format.
+        """
+        self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types)
+        md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url)
+        return md
+
     def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=[]):
         """把content_list转化为txt格式.
 
@@ -96,7 +115,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F
         for page in content_lst:
             for content_lst_node in page:
                 if content_lst_node['type'] not in exclude_nodes:
-                    txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url)
+                    txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types,
+                                                               use_raw_image_url)
                     if txt_content and len(txt_content) > 0:
                         md_blocks.append(txt_content)
 
@@ -243,7 +263,8 @@ def __process_nested_list(self, items, list_attribute, indent_level=0, exclude_i
 
         return result
 
-    def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = [], use_raw_image_url=False) -> str:
+    def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = [],
+                                use_raw_image_url=False) -> str:
         """把content_list里定义的每种元素块转化为markdown格式.
 
         Args:
@@ -253,7 +274,8 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
         """
         node_type = content_lst_node['type']
         if node_type == DocElementType.CODE:
-            code = content_lst_node['content']['code_content']  # 这里禁止有None的content, 如果有应该消灭在模块内部。模块应该处理更精细，防止因为拼装导致掩盖了错误。
+            code = content_lst_node['content'][
+                'code_content']  # 这里禁止有None的content, 如果有应该消灭在模块内部。模块应该处理更精细，防止因为拼装导致掩盖了错误。
             # 代码不可以 strip，因为首行可能有缩进，只能 rstrip
             code = code.rstrip()
             if not code:
@@ -592,7 +614,7 @@ def get_content_list(self) -> ContentList:
         cl = self.__json_data[DataJsonKey.CONTENT_LIST]
         return cl
 
-    def get(self, key:str, default=None):
+    def get(self, key: str, default=None):
         return self.__json_data.get(key, default)
 
     def get_magic_html(self, page_layout_type=None):

diff --git a/llm_web_kit/libs/doc_element_type.py b/llm_web_kit/libs/doc_element_type.py
@@ -21,3 +21,6 @@ class DocElementType(object):
     VIDEO = 'video'
 
     MM_NODE_LIST = [IMAGE, AUDIO, VIDEO]
+
+    EXCLUDE_PLAIN_MD_LIST = [CODE, EQUATION_INTERLINE, IMAGE, COMPLEX_TABLE, AUDIO, VIDEO]
+    EXCLUDE_PLAIN_MD_INLINE_LIST = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE]
diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
@@ -96,7 +96,8 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
 # SDK方法（三种使用场景）
 # ========================================
 
-def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
+def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML,
+                           language: str = 'en') -> str:
     """场景1: 只执行第一阶段，抽取main_html.
 
     Args:
@@ -118,7 +119,7 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
     Args:
         url: 网页URL
         main_html: 已经抽取的主要HTML内容
-        output_format: 输出格式，'md' 或 'mm_md'
+        output_format: Output format: 'md', 'mm_md', 'plain_md', or 'json'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
@@ -131,19 +132,22 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
         return content_list.to_nlp_md()
     elif output_format == 'mm_md':
         return content_list.to_mm_md()
+    elif output_format == 'plain_md':
+        return content_list.to_plain_md()
     elif output_format == 'json':
         return result.to_json()
     else:
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md',
+                                              language: str = 'en') -> str:
     """场景3: 执行两个阶段，从magic_html抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
-        output_format: 输出格式，'md' 或 'mm_md'
+        output_format: Output format: 'md', 'mm_md', 'plain_md', or 'json'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
@@ -156,19 +160,22 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
         return content_list.to_nlp_md()
     elif output_format == 'mm_md':
         return content_list.to_mm_md()
+    elif output_format == 'plain_md':
+        return content_list.to_plain_md()
     elif output_format == 'json':
         return result.to_json()
     else:
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md',
+                                       language: str = 'en') -> str:
     """场景3: 执行两个阶段，从llm抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
-        output_format: 输出格式，'md' 或 'mm_md'
+        output_format: Output format: 'md', 'mm_md', 'plain_md', or 'json'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
@@ -181,19 +188,22 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
         return content_list.to_nlp_md()
     elif output_format == 'mm_md':
         return content_list.to_mm_md()
+    elif output_format == 'plain_md':
+        return content_list.to_plain_md()
     elif output_format == 'json':
         return result.to_json()
     else:
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md',
+                                                language: str = 'en') -> str:
     """场景3: 执行两个阶段，从layout_batch抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
-        output_format: 输出格式，'md' 或 'mm_md'
+        output_format: Output format: 'md', 'mm_md', 'plain_md', or 'json'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
@@ -206,6 +216,8 @@ def extract_content_from_html_with_layout_batch(url: str, html_content: str, out
         return content_list.to_nlp_md()
     elif output_format == 'mm_md':
         return content_list.to_mm_md()
+    elif output_format == 'plain_md':
+        return content_list.to_plain_md()
     elif output_format == 'json':
         return result.to_json()
     else: