Skip to content

Commit

Permalink
Merge pull request #238 from enoch3712/232-add-multiple-image-process…
Browse files Browse the repository at this point in the history
…ing-on-extraction

Multiple image support added for extraction
  • Loading branch information
enoch3712 authored Feb 3, 2025
2 parents b9e4090 + a469c24 commit 961f8d3
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 45 deletions.
51 changes: 31 additions & 20 deletions extract_thinker/concatenation_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,24 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
"text": f"##Content\n\n{item['content']}"
})

# Add image if available
if isinstance(item, dict) and "image" in item:
if item["image"]:
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image(item['image'])}"
}
})
# Add images if available
if isinstance(item, dict):
images = []
if "images" in item and isinstance(item["images"], list):
images.extend(item["images"])
if "image" in item and item["image"] is not None:
images.append(item["image"])

for img in images:
if img:
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image(img)}"
}
})
else:
# Fallback to original single-item handling
# Handle single item
if isinstance(content, dict):
# Add text content if available
if "content" in content:
Expand All @@ -181,16 +188,20 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
})

# Add images
if "image" in content or "images" in content:
images = content.get("images", [content.get("image")])
for img in images:
if img:
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image(img)}"
}
})
images = []
if "images" in content and isinstance(content["images"], list):
images.extend(content["images"])
if "image" in content and content["image"] is not None:
images.append(content["image"])

for img in images:
if img:
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image(img)}"
}
})

return message_content

Expand Down
89 changes: 70 additions & 19 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(
self.llm_interceptors: List[LlmInterceptor] = []
self.is_classify_image: bool = False
self._skip_loading: bool = False
self.chunk_height: int = 1500

def add_interceptor(
self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
Expand Down Expand Up @@ -184,16 +185,27 @@ def _map_to_universal_format(
Maps loaded content to a universal format that _extract can process.
The universal format is:
{
"content": str, # The text content
"images": List[bytes], # Optional list of image bytes if vision=True
"metadata": Dict[str, Any] # Optional metadata
"content": str, # The text content (joined from pages)
"images": List[bytes],
# Optional list of image bytes if vision=True (can hold multiple)
"metadata": {}
}
"""
if content is None:
return {"content": "", "images": [], "metadata": {}}

# If content is already in universal format, return as is
if isinstance(content, dict) and "content" in content:
# Ensure 'images' is a list
if "image" in content and "images" not in content:
# Merge single 'image' into 'images'
content["images"] = [content["image"]] if content["image"] else []
del content["image"]
elif "images" in content and not isinstance(content["images"], list):
# If 'images' is mistakenly a single byte blob, fix it
content["images"] = [content["images"]] if content["images"] else []
elif "images" not in content:
content["images"] = []
return content

# Handle list of pages from document loader
Expand All @@ -207,8 +219,13 @@ def _map_to_universal_format(
if 'content' in page:
text_content.append(page['content'])
# Extract images if vision mode is enabled
if vision and 'image' in page:
images.append(page['image'])
if vision:
# If there's a list of images
if 'images' in page and isinstance(page['images'], list):
images.extend(page['images'])
# Or just a single 'image'
elif 'image' in page and page['image']:
images.append(page['image'])

return {
"content": "\n\n".join(text_content) if text_content else "",
Expand All @@ -230,11 +247,18 @@ def _map_to_universal_format(
if isinstance(text_content, list):
text_content = "\n".join(text_content)

images = []
if vision:
if "images" in content and isinstance(content["images"], list):
images.extend(content["images"])
elif "image" in content and content["image"]:
images.append(content["image"])

return {
"content": text_content,
"images": content.get("images", []) if vision else [],
"images": images,
"metadata": {k: v for k, v in content.items()
if k not in ["text", "images", "content"]}
if k not in ["text", "images", "image", "content"]}
}

raise ValueError(f"Unsupported content format: {type(content)}")
Expand Down Expand Up @@ -1067,7 +1091,7 @@ def _add_images_to_message_content(
elif isinstance(content, dict):
# Handle legacy format
image_data = content.get('image') or content.get('images')
self._append_images(image_data, message_content)
self._append_images(image_data[0], message_content)

def _append_images(
self,
Expand All @@ -1078,27 +1102,54 @@ def _append_images(
Append images to the message content.
Args:
image_data: The image data to process.
image_data: The image data to process. Can be:
- A dictionary with 'image' or 'images' keys
- A list of images
- A single image
message_content: The message content to append images to.
"""
if not image_data:
return

images_list = []
if isinstance(image_data, dict):
images_list = image_data.values()
# Handle dictionary format
if "images" in image_data:
# If "images" key exists, it should be a list of images
if isinstance(image_data["images"], list):
images_list.extend(image_data["images"])
else:
# Single image in "images" key
images_list.append(image_data["images"])
elif "image" in image_data and image_data["image"] is not None:
# Single image in "image" key
images_list.append(image_data["image"])
elif isinstance(image_data, list):
images_list = image_data
# Process list of images or image dictionaries
for item in image_data:
if isinstance(item, dict):
# Handle nested image dictionaries
if "images" in item and isinstance(item["images"], list):
images_list.extend(item["images"])
elif "image" in item and item["image"] is not None:
images_list.append(item["image"])
else:
# Raw image data
images_list.append(item)
else:
images_list = [image_data]
# Single raw image
images_list.append(image_data)

# Process all collected images
for img in images_list:
base64_image = encode_image(img)
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
})
if img is not None: # Skip None values
base64_image = encode_image(img)
message_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
})

def _build_messages(
self,
Expand Down
16 changes: 10 additions & 6 deletions extract_thinker/pagination_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,16 +491,21 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
"""Build content for vision request."""
message_content = []

# Add text content if available
# If there's textual 'content', push it first
if isinstance(content, dict) and "content" in content:
message_content.append({
"type": "text",
"text": f"##Content\n\n{content['content']}"
})

# Add images
if isinstance(content, dict) and ("image" in content or "images" in content):
images = content.get("images", [content.get("image")])

# Now handle multiple images
if isinstance(content, dict):
images = []
if "images" in content and isinstance(content["images"], list):
images.extend(content["images"])
if "image" in content and content["image"] is not None:
images.append(content["image"])

for img in images:
if img:
message_content.append({
Expand All @@ -509,7 +514,6 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
"url": f"data:image/jpeg;base64,{encode_image(img)}"
}
})

return message_content

def _build_text_content(self, content: Any) -> str:
Expand Down

0 comments on commit 961f8d3

Please sign in to comment.