Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,38 @@ class StructureMapper(ABC):
Args:
object (_type_): _description_
"""

def __init__(self):
    """Initialise the separators, markdown prefixes and type lists used for rendering.

    Only assigns instance attributes; nothing is read or computed here.
    """
    self.__txt_para_splitter = '\n'
    self.__md_para_splitter = '\n\n'
    self.__text_end = '\n'
    self.__list_item_start = '-'  # prefix of a list item in markdown
    # Two spaces: prefix for every paragraph after the first one inside a
    # markdown list item (used when an item holds several paragraphs).
    self.__list_para_prefix = '  '
    # TODO: when assembling tables the | character should be escaped as well
    self.__md_special_chars = ['#', '`', '$']
    # Assigned once; the same list was previously written twice in a row.
    self.__nodes_document_type = [
        DocElementType.MM_NODE_LIST,
        DocElementType.PARAGRAPH,
        DocElementType.LIST,
        DocElementType.SIMPLE_TABLE,
        DocElementType.COMPLEX_TABLE,
        DocElementType.TITLE,
        DocElementType.IMAGE,
        DocElementType.AUDIO,
        DocElementType.VIDEO,
        DocElementType.CODE,
        DocElementType.EQUATION_INTERLINE,
    ]
    self.__inline_types_document_type = [
        ParagraphTextType.EQUATION_INLINE,
        ParagraphTextType.CODE_INLINE,
    ]

def to_html(self):
    """Placeholder for HTML rendering; this implementation always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    message = 'This method must be implemented by the subclass.'
    raise NotImplementedError(message)

def to_plain_md(self, exclude_nodes=DocElementType.EXCLUDE_PLAIN_MD_LIST,
                exclude_inline_types=DocElementType.EXCLUDE_PLAIN_MD_INLINE_LIST, use_raw_image_url=False):
    """Convert the content_list to markdown text.

    Args:
        exclude_nodes (list): node types to leave out; defaults to
            DocElementType.EXCLUDE_PLAIN_MD_LIST.
        exclude_inline_types: inline types to leave out; defaults to
            DocElementType.EXCLUDE_PLAIN_MD_INLINE_LIST.
        use_raw_image_url: whether to use the original img url; passed
            through unchanged to the markdown conversion.

    Returns:
        str: the text content in md format.
    """
    # Run the validation helper on both exclusion arguments before converting.
    self.__validate_exclude_nodes(exclude_nodes, exclude_inline_types)
    return self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url)

def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=[]):
"""把content_list转化为txt格式.

Expand Down Expand Up @@ -96,7 +115,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F
for page in content_lst:
for content_lst_node in page:
if content_lst_node['type'] not in exclude_nodes:
txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url)
txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types,
use_raw_image_url)
if txt_content and len(txt_content) > 0:
md_blocks.append(txt_content)

Expand Down Expand Up @@ -243,7 +263,8 @@ def __process_nested_list(self, items, list_attribute, indent_level=0, exclude_i

return result

def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = [], use_raw_image_url=False) -> str:
def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: list = [],
use_raw_image_url=False) -> str:
"""把content_list里定义的每种元素块转化为markdown格式.

Args:
Expand All @@ -253,7 +274,8 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
"""
node_type = content_lst_node['type']
if node_type == DocElementType.CODE:
code = content_lst_node['content']['code_content'] # 这里禁止有None的content, 如果有应该消灭在模块内部。模块应该处理更精细,防止因为拼装导致掩盖了错误。
code = content_lst_node['content'][
'code_content'] # 这里禁止有None的content, 如果有应该消灭在模块内部。模块应该处理更精细,防止因为拼装导致掩盖了错误。
# 代码不可以 strip,因为首行可能有缩进,只能 rstrip
code = code.rstrip()
if not code:
Expand Down Expand Up @@ -592,7 +614,7 @@ def get_content_list(self) -> ContentList:
cl = self.__json_data[DataJsonKey.CONTENT_LIST]
return cl

def get(self, key: str, default=None):
    """Look up ``key`` in the wrapped JSON data.

    Args:
        key: name of the entry to read.
        default: value returned when ``key`` is absent.

    Returns:
        The stored value for ``key``, or ``default`` if there is none.
    """
    return self.__json_data.get(key, default)

def get_magic_html(self, page_layout_type=None):
Expand Down
3 changes: 3 additions & 0 deletions llm_web_kit/libs/doc_element_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ class DocElementType(object):
VIDEO = 'video'

MM_NODE_LIST = [IMAGE, AUDIO, VIDEO]

EXCLUDE_PLAIN_MD_LIST = [CODE, EQUATION_INTERLINE, IMAGE, COMPLEX_TABLE, AUDIO, VIDEO]
EXCLUDE_PLAIN_MD_INLINE_LIST = [ParagraphTextType.EQUATION_INLINE, ParagraphTextType.CODE_INLINE]
28 changes: 20 additions & 8 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
# SDK方法(三种使用场景)
# ========================================

def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML,
language: str = 'en') -> str:
"""场景1: 只执行第一阶段,抽取main_html.

Args:
Expand All @@ -118,7 +119,7 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
Args:
url: 网页URL
main_html: 已经抽取的主要HTML内容
output_format: 输出格式,'md' 或 'mm_md'
output_format: 输出格式,'md' 或 'mm_md' 或 'plain_md'
language: 语言,可选:'en' 或 'zh'

Returns:
Expand All @@ -131,19 +132,22 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
return content_list.to_nlp_md()
elif output_format == 'mm_md':
return content_list.to_mm_md()
elif output_format == 'plain_md':
return content_list.to_plain_md()
elif output_format == 'json':
return result.to_json()
else:
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md',
language: str = 'en') -> str:
"""场景3: 执行两个阶段,从magic_html抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
output_format: 输出格式,'md' 或 'mm_md' 或 'plain_md'
language: 语言,可选:'en' 或 'zh'

Returns:
Expand All @@ -156,19 +160,22 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
return content_list.to_nlp_md()
elif output_format == 'mm_md':
return content_list.to_mm_md()
elif output_format == 'plain_md':
return content_list.to_plain_md()
elif output_format == 'json':
return result.to_json()
else:
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md',
language: str = 'en') -> str:
"""场景3: 执行两个阶段,从llm抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
output_format: 输出格式,'md' 或 'mm_md' 或 'plain_md'
language: 语言,可选:'en' 或 'zh'

Returns:
Expand All @@ -181,19 +188,22 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
return content_list.to_nlp_md()
elif output_format == 'mm_md':
return content_list.to_mm_md()
elif output_format == 'plain_md':
return content_list.to_plain_md()
elif output_format == 'json':
return result.to_json()
else:
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md',
language: str = 'en') -> str:
"""场景3: 执行两个阶段,从layout_batch抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
output_format: 输出格式,'md' 或 'mm_md' 或 'plain_md'
language: 语言,可选:'en' 或 'zh'

Returns:
Expand All @@ -206,6 +216,8 @@ def extract_content_from_html_with_layout_batch(url: str, html_content: str, out
return content_list.to_nlp_md()
elif output_format == 'mm_md':
return content_list.to_mm_md()
elif output_format == 'plain_md':
return content_list.to_plain_md()
elif output_format == 'json':
return result.to_json()
else:
Expand Down
Loading