diff --git a/src/mobile_world/agents/implementations/autoglm_agent.py b/src/mobile_world/agents/implementations/autoglm_agent.py new file mode 100644 index 0000000..5ec5817 --- /dev/null +++ b/src/mobile_world/agents/implementations/autoglm_agent.py @@ -0,0 +1,602 @@ +"""AutoGLM-Phone Agent implementation for MobileWorld. + +This module provides an adapter for the AutoGLM-Phone VLM to work with +the MobileWorld evaluation framework by parsing AutoGLM's native output +format and converting it to MobileWorld's JSONAction format. +""" + +import ast +import time +from datetime import datetime +from typing import Any + +from loguru import logger + +from mobile_world.agents.base import MCPAgent +from mobile_world.agents.utils.helpers import pil_to_base64 +from mobile_world.runtime.utils.models import JSONAction + + +# AutoGLM System Prompt (Chinese version from Open-AutoGLM) +def get_autoglm_system_prompt() -> str: + """Get the AutoGLM system prompt with current date.""" + today = datetime.today() + formatted_date = today.strftime("%Y年%m月%d日") + + return ( + "今天的日期是: " + + formatted_date + + """ +你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。 +你必须严格按照要求输出以下格式: +{think} +{action} + +其中: +- {think} 是对你为什么选择这个操作的简短推理说明。 +- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。 + +操作指令及其作用如下: +- do(action="Launch", app="xxx") + Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Tap", element=[x,y]) + Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Tap", element=[x,y], message="重要操作") + 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。 +- do(action="Type", text="xxx") + Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。 +- do(action="Type_Name", text="xxx") + 
Type_Name是输入人名的操作,基本功能同Type。 +- do(action="Interact") + Interact是当有多个满足条件的选项时而触发的交互操作,询问用户如何选择。 +- do(action="Swipe", start=[x1,y1], end=[x2,y2]) + Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Note", message="True") + 记录当前页面内容以便后续总结。 +- do(action="Call_API", instruction="xxx") + 总结或评论当前页面或已记录的内容。 +- do(action="Long Press", element=[x,y]) + Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。 +- do(action="Double Tap", element=[x,y]) + Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Take_over", message="xxx") + Take_over是接管操作,表示在登录和验证阶段需要用户协助。 +- do(action="Back") + 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Home") + Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。 +- do(action="Wait", duration="x seconds") + 等待页面加载,x为需要等待多少秒。 +- finish(message="xxx") + finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 + +必须遵循的规则: +1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。 +2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。 +3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。 +4. 如果页面显示网络问题,需要重新加载,请点击重新加载。 +5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。 +6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。 +7. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。 +""" + ) + + +def parse_autoglm_response(content: str) -> tuple[str, str]: + """ + Parse the model response into thinking and action parts. + + Parsing rules (following Open-AutoGLM's client.py): + 1. If content contains 'finish(message=', everything before is thinking, + everything from 'finish(message=' onwards is action. + 2. If rule 1 doesn't apply but content contains 'do(action=', + everything before is thinking, everything from 'do(action=' onwards is action. + 3. 
Fallback: If content contains '<answer>', use legacy parsing with XML tags.
+    4. Otherwise, return empty thinking and full content as action.
+
+    Args:
+        content: Raw response content from LLM.
+
+    Returns:
+        Tuple of (thinking, action_str).
+    """
+    # Rule 1: Check for finish(message=
+    if "finish(message=" in content:
+        parts = content.split("finish(message=", 1)
+        thinking = parts[0].strip()
+        action_str = "finish(message=" + parts[1]
+        return thinking, action_str
+
+    # Rule 2: Check for do(action=
+    if "do(action=" in content:
+        parts = content.split("do(action=", 1)
+        thinking = parts[0].strip()
+        action_str = "do(action=" + parts[1]
+        return thinking, action_str
+
+    # Rule 3: Fallback to legacy XML tag parsing
+    if "<answer>" in content:
+        parts = content.split("<answer>", 1)
+        thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
+        action_str = parts[1].replace("</answer>", "").strip()
+        return thinking, action_str
+
+    # Rule 4: No markers found, return content as action
+    return "", content
+
+
+def parse_autoglm_action(action_str: str) -> dict[str, Any]:
+    """
+    Parse action string from AutoGLM response into a dictionary.
+
+    Supports formats:
+    - do(action="Tap", element=[x,y])
+    - do(action="Type", text="xxx")
+    - finish(message="xxx")
+    etc.
+
+    Args:
+        action_str: Raw action string from model.
+
+    Returns:
+        Parsed action dictionary with '_metadata' field.
+
+    Raises:
+        ValueError: If the action cannot be parsed.
+ """ + try: + action_str = action_str.strip() + + # Clean up: extract only the first line if multiple lines present + # This handles cases where LLM outputs extra text after the action + lines = action_str.split('\n') + if len(lines) > 1: + # Find the line that starts with 'do(' or 'finish(' + for line in lines: + line = line.strip() + if line.startswith('do(') or line.startswith('finish('): + action_str = line + break + else: + # If no line starts with do/finish, use the first non-empty line + action_str = lines[0].strip() + + logger.debug(f"Parsing action string: {repr(action_str)}") + + # Handle Type action specially (may contain special characters in text) + if action_str.startswith('do(action="Type"') or action_str.startswith( + 'do(action="Type_Name"' + ): + # Extract text value + text_start = action_str.find("text=") + if text_start != -1: + # Find the opening quote + quote_start = action_str.find('"', text_start + 5) + if quote_start != -1: + # Find matching closing quote (handle escaped quotes) + quote_end = len(action_str) - 1 + while quote_end > quote_start and action_str[quote_end] != '"': + quote_end -= 1 + # Remove trailing ')' if present + while quote_end > quote_start and action_str[quote_end] in ')': + quote_end -= 1 + if action_str[quote_end] == '"': + text = action_str[quote_start + 1 : quote_end] + action_type = "Type_Name" if "Type_Name" in action_str else "Type" + return {"_metadata": "do", "action": action_type, "text": text} + + raise ValueError(f"Failed to parse Type action: {action_str}") + + elif action_str.startswith("do"): + # Use AST parsing for safety + try: + # First, ensure the string is a single line for AST parsing + # Remove any actual newlines that might be in the string + cleaned = action_str.strip() + + # Only escape if there are literal backslashes that need escaping + # Don't escape newlines as we've already removed them + tree = ast.parse(cleaned, mode="eval") + if not isinstance(tree.body, ast.Call): + raise 
ValueError("Expected a function call") + + call = tree.body + # Extract keyword arguments safely + action = {"_metadata": "do"} + for keyword in call.keywords: + key = keyword.arg + value = ast.literal_eval(keyword.value) + action[key] = value + + return action + except (SyntaxError, ValueError) as e: + logger.error(f"AST parsing failed for: {repr(cleaned)}") + raise ValueError(f"Failed to parse do() action: {e}") + + elif action_str.startswith("finish"): + # Extract message from finish(message="xxx") + message_start = action_str.find('message="') + if message_start != -1: + message_start += 9 # len('message="') + message_end = action_str.rfind('"') + if message_end > message_start: + message = action_str[message_start:message_end] + return {"_metadata": "finish", "message": message} + + # Fallback: try to extract any text after 'finish(message=' + if "finish(message=" in action_str: + message = action_str.replace("finish(message=", "")[1:-2] + return {"_metadata": "finish", "message": message} + + return {"_metadata": "finish", "message": "Task completed"} + + else: + raise ValueError(f"Unknown action format: {action_str}") + + except Exception as e: + raise ValueError(f"Failed to parse action: {e}") + + +def convert_autoglm_to_jsonaction( + action_dict: dict[str, Any], + screen_width: int, + screen_height: int, + scale_factor: int = 1000, +) -> JSONAction: + """ + Convert AutoGLM action dictionary to MobileWorld JSONAction. + + Args: + action_dict: Parsed AutoGLM action dictionary. + screen_width: Width of the screenshot image. + screen_height: Height of the screenshot image. + scale_factor: Scale factor for coordinate normalization (default 1000). + + Returns: + JSONAction object for MobileWorld runtime. 
+ """ + metadata = action_dict.get("_metadata") + + # Handle finish action + if metadata == "finish": + message = action_dict.get("message", "Task completed") + return JSONAction(action_type="answer", text=message) + + # Handle do actions + if metadata != "do": + logger.warning(f"Unknown metadata type: {metadata}") + return JSONAction(action_type="unknown", text=f"Unknown action: {action_dict}") + + action_name = action_dict.get("action", "") + + # Tap -> click + if action_name == "Tap": + element = action_dict.get("element", [0, 0]) + x = int(element[0] * screen_width / scale_factor) + y = int(element[1] * screen_height / scale_factor) + return JSONAction(action_type="click", x=x, y=y) + + # Type / Type_Name -> input_text + elif action_name in ("Type", "Type_Name"): + text = action_dict.get("text", "") + return JSONAction(action_type="input_text", text=text) + + # Swipe -> drag + elif action_name == "Swipe": + start = action_dict.get("start", [0, 0]) + end = action_dict.get("end", [0, 0]) + start_x = int(start[0] * screen_width / scale_factor) + start_y = int(start[1] * screen_height / scale_factor) + end_x = int(end[0] * screen_width / scale_factor) + end_y = int(end[1] * screen_height / scale_factor) + return JSONAction( + action_type="drag", + start_x=start_x, + start_y=start_y, + end_x=end_x, + end_y=end_y, + ) + + # Long Press -> long_press + elif action_name == "Long Press": + element = action_dict.get("element", [0, 0]) + x = int(element[0] * screen_width / scale_factor) + y = int(element[1] * screen_height / scale_factor) + return JSONAction(action_type="long_press", x=x, y=y) + + # Double Tap -> double_tap + elif action_name == "Double Tap": + element = action_dict.get("element", [0, 0]) + x = int(element[0] * screen_width / scale_factor) + y = int(element[1] * screen_height / scale_factor) + return JSONAction(action_type="double_tap", x=x, y=y) + + # Back -> navigate_back + elif action_name == "Back": + return JSONAction(action_type="navigate_back") + + 
# Home -> navigate_home + elif action_name == "Home": + return JSONAction(action_type="navigate_home") + + # Launch -> open_app + elif action_name == "Launch": + app_name = action_dict.get("app", "") + return JSONAction(action_type="open_app", app_name=app_name) + + # Wait -> wait + elif action_name == "Wait": + return JSONAction(action_type="wait") + + # Interact -> ask_user (partial compatibility) + elif action_name == "Interact": + return JSONAction( + action_type="ask_user", text="有多个选项满足条件,请指定您的选择" + ) + + # Take_over -> ask_user (partial compatibility) + elif action_name == "Take_over": + message = action_dict.get("message", "需要用户协助") + return JSONAction(action_type="ask_user", text=message) + + # Note -> wait (unsupported, log warning) + elif action_name == "Note": + logger.warning("AutoGLM 'Note' action is not supported, treating as wait") + return JSONAction(action_type="wait") + + # Call_API -> wait (unsupported, log warning) + elif action_name == "Call_API": + logger.warning("AutoGLM 'Call_API' action is not supported, treating as wait") + instruction = action_dict.get("instruction", "") + # Could potentially return as answer if needed + return JSONAction(action_type="wait") + + else: + logger.warning(f"Unknown AutoGLM action: {action_name}") + return JSONAction(action_type="unknown", text=f"Unknown action: {action_name}") + + +class AutoGLMAgentMCP(MCPAgent): + """ + AutoGLM-Phone Agent adapter for MobileWorld evaluation framework. + + This agent uses the AutoGLM-Phone VLM and adapts its output format + to work with MobileWorld's JSONAction-based runtime. 
+ """ + + def __init__( + self, + model_name: str, + llm_base_url: str, + api_key: str = "empty", + observation_type: str = "screenshot", + runtime_conf: dict = { + "history_n_images": 3, + "temperature": 0.0, + "max_tokens": 3000, + }, + tools: list[dict] = [], + scale_factor: int = 1000, + **kwargs, + ): + super().__init__(tools=tools, **kwargs) + + # Agent parameters + self.model_name = model_name + self.llm_base_url = llm_base_url + self.api_key = api_key + self.observation_type = observation_type + self.runtime_conf = runtime_conf + self.scale_factor = scale_factor + + logger.debug(f"AutoGLM Agent runtime_conf = {self.runtime_conf}") + logger.debug(f"AutoGLM Agent scale_factor = {self.scale_factor}") + + self.build_openai_client(self.llm_base_url, self.api_key) + logger.debug(f"AutoGLM Agent base_url={self.llm_base_url} model={self.model_name}") + + self.history_n_images = self.runtime_conf.pop("history_n_images", 3) + self.history_images = [] + self.history_responses = [] + self.actions = [] + + def initialize_hook(self, instruction: str) -> None: + """Hook for initializing the agent with instruction.""" + logger.info(f"Initializing AutoGLM agent with instruction: {instruction}") + self.reset() + + def _get_user_message( + self, img_data, tool_call_res, ask_user_response_res, is_first: bool = False + ) -> dict: + """Build user message for conversation.""" + if tool_call_res is not None: + return { + "role": "user", + "content": [{"type": "text", "text": f"Tool call result: {tool_call_res}"}], + } + elif ask_user_response_res is not None: + return { + "role": "user", + "content": [{"type": "text", "text": ask_user_response_res}], + } + else: + # Build screen info (similar to AutoGLM's MessageBuilder) + text_content = "" + if is_first: + text_content = f"用户任务: {self.instruction}\n\n当前屏幕截图如下:" + else: + text_content = "** Screen Info **\n\n当前屏幕截图如下:" + + return { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": img_data, + }, + {"type": 
"text", "text": text_content}, + ], + } + + def _hide_history_images(self, messages: list[dict]) -> list[dict]: + """Keep only recent N images in history to save context.""" + num_images_used = 0 + for i in range(len(messages)): + reverse_i = len(messages) - i - 1 + msg = messages[reverse_i] + if msg["role"] == "user" and isinstance(msg["content"], list): + has_image = any( + item.get("type") == "image_url" for item in msg["content"] + ) + if has_image: + if num_images_used < self.history_n_images: + # Encode image to base64 + for item in msg["content"]: + if item.get("type") == "image_url": + img = item["image_url"] + if not isinstance(img, dict): + encoded = pil_to_base64(img) + item["image_url"] = { + "url": f"data:image/png;base64,{encoded}" + } + num_images_used += 1 + else: + # Remove image, keep text only + messages[reverse_i]["content"] = [ + item + for item in msg["content"] + if item.get("type") != "image_url" + ] or [{"type": "text", "text": "(Previous turn, screen not shown)"}] + return messages + + def predict( + self, + observation: dict[str, Any], + ) -> tuple[str, JSONAction]: + """ + Generate action based on the current observation. + + Args: + observation: Observation containing screenshot. + + Returns: + Tuple of (raw_response, JSONAction). 
+ """ + obs_image = observation["screenshot"] + orig_width, orig_height = obs_image.size + tool_call = observation.get("tool_call", None) + ask_user_response = observation.get("ask_user_response", None) + + is_first = len(self.history_images) == 0 + self.history_images.append((obs_image, tool_call, ask_user_response)) + + logger.debug(f"Current history images count: {len(self.history_images)}") + logger.debug(f"Current history responses count: {len(self.history_responses)}") + + assert len(self.history_images) == len(self.history_responses) + 1 + + # Build messages with AutoGLM system prompt + messages = [ + { + "role": "system", + "content": get_autoglm_system_prompt(), + }, + self._get_user_message( + self.history_images[0][0], + self.history_images[0][1], + self.history_images[0][2], + is_first=True, + ), + ] + + # Add history + for i, history_resp in enumerate(self.history_responses): + history_img_data, tool_call_res, ask_user_response_res = self.history_images[ + i + 1 + ] + + # Assistant response (in AutoGLM format) + response_message = { + "role": "assistant", + "content": history_resp.get("content", ""), + } + messages.append(response_message) + + # User message with next screenshot + user_message = self._get_user_message( + history_img_data, tool_call_res, ask_user_response_res, is_first=False + ) + messages.append(user_message) + + logger.debug(f"Constructed {len(messages) // 2} history turns.") + messages = self._hide_history_images(messages) + + # Try to get response + try_times = 3 + response = None + thinking = None + action_str = None + + while try_times > 0: + try: + response = self.openai_chat_completions_create( + model=self.model_name, + messages=messages, + retry_times=1, + **self.runtime_conf, + ) + + # Parse AutoGLM format response + thinking, action_str = parse_autoglm_response(response) + + logger.info(f"\nRaw LLM response received:\n{response}") + logger.info(f"Parsed thinking: {thinking[:200]}..." 
if len(thinking) > 200 else f"Parsed thinking: {thinking}") + logger.info(f"Parsed action_str: {repr(action_str)}") + break + + except Exception as e: + logger.warning( + f"Error fetching response from agent: {self.model_name}, {self.llm_base_url}" + ) + error_msg = str(e) + try_times -= 1 + logger.warning( + f"Error: {error_msg}. Retrying... ({try_times} attempts left)" + ) + if "timeout" in error_msg.lower() or "connection" in error_msg.lower(): + time.sleep(2) + + if response is None: + raise ValueError("AutoGLM Agent LLM failed") + + if action_str is None: + return "Agent LLM failed", JSONAction( + action_type="unknown", text="Agent LLM failed" + ) + + # Parse and convert action + try: + action_dict = parse_autoglm_action(action_str) + json_action = convert_autoglm_to_jsonaction( + action_dict, orig_width, orig_height, self.scale_factor + ) + except Exception as e: + logger.error(f"Error parsing AutoGLM action: {e}") + logger.debug(f"Action string was: {action_str}") + return response, JSONAction(action_type="unknown", text=f"Parse error: {e}") + + logger.info(f"Converted to JSONAction: {json_action}") + + # Update history + self.history_responses.append({"role": "assistant", "content": response}) + self.actions.append(json_action.model_dump()) + logger.debug("Agent state updated for next turn.") + + return response, json_action + + def reset(self): + """Reset the agent for the next task.""" + self.history_images = [] + self.history_responses = [] + self.actions = [] + logger.debug("AutoGLM Agent reset completed") diff --git a/src/mobile_world/agents/registry.py b/src/mobile_world/agents/registry.py index b3280ce..9e2d265 100644 --- a/src/mobile_world/agents/registry.py +++ b/src/mobile_world/agents/registry.py @@ -11,6 +11,7 @@ from loguru import logger from mobile_world.agents.base import BaseAgent +from mobile_world.agents.implementations.autoglm_agent import AutoGLMAgentMCP from mobile_world.agents.implementations.gelab_agent import GelabAgent from 
mobile_world.agents.implementations.general_e2e_agent import GeneralE2EAgentMCP from mobile_world.agents.implementations.mai_ui_agent import MAIUINaivigationAgent @@ -37,6 +38,9 @@ "gelab_agent": { "class": GelabAgent, }, + "autoglm": { + "class": AutoGLMAgentMCP, + }, }