
Commit d70db73

fix multiple tab
1 parent 2df50b3 commit d70db73

5 files changed: +45 -25 lines


src/agent/custom_message_manager.py

+4 -2

@@ -74,7 +74,8 @@ def cut_messages(self):
         min_message_len = 2 if self.context_content is not None else 1
 
         while diff > 0 and len(self.state.history.messages) > min_message_len:
-            self.state.history.remove_message(min_message_len)  # always remove the oldest message
+            msg = self.state.history.messages.pop(min_message_len)
+            self.state.history.current_tokens -= msg.metadata.tokens
             diff = self.state.history.current_tokens - self.settings.max_input_tokens
 
     def add_state_message(
@@ -104,6 +105,7 @@ def _remove_state_message_by_index(self, remove_ind=-1) -> None:
             if isinstance(self.state.history.messages[i].message, HumanMessage):
                 remove_cnt += 1
                 if remove_cnt == abs(remove_ind):
-                    self.state.history.messages.pop(i)
+                    msg = self.state.history.messages.pop(i)
+                    self.state.history.current_tokens -= msg.metadata.tokens
                     break
             i -= 1
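
The message-manager change keeps the running token count in sync with the history: every removed message now also subtracts its metadata.tokens from current_tokens, so the trimming loop in cut_messages compares against an accurate budget. A minimal sketch of that invariant, using hypothetical stand-in classes (_Metadata, _ManagedMessage, _History are illustrative, not the project's actual types):

from dataclasses import dataclass, field
from typing import List

@dataclass
class _Metadata:
    tokens: int

@dataclass
class _ManagedMessage:
    content: str
    metadata: _Metadata

@dataclass
class _History:
    messages: List[_ManagedMessage] = field(default_factory=list)
    current_tokens: int = 0

    def add(self, msg: _ManagedMessage) -> None:
        self.messages.append(msg)
        self.current_tokens += msg.metadata.tokens

    def remove_at(self, index: int) -> None:
        # Popping without adjusting the counter leaves current_tokens stale,
        # so the trimming loop would keep comparing against a wrong total.
        msg = self.messages.pop(index)
        self.current_tokens -= msg.metadata.tokens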

src/agent/custom_prompts.py

+12 -0

@@ -21,6 +21,18 @@ def _load_prompt_template(self) -> None:
         except Exception as e:
             raise RuntimeError(f'Failed to load system prompt template: {e}')
 
+    def get_system_message(self) -> SystemMessage:
+        """
+        Get the system prompt for the agent.
+
+        Returns:
+            SystemMessage: Formatted system prompt
+        """
+        prompt = self.prompt_template.format(max_actions=self.max_actions_per_step,
+                                             available_actions=self.default_action_description)
+
+        return SystemMessage(content=prompt)
+
 
 class CustomAgentMessagePrompt(AgentMessagePrompt):
     def __init__(
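
A quick sketch of how the new get_system_message is expected to be used, assuming SystemMessage comes from langchain_core.messages and the template carries the {max_actions}/{available_actions} placeholders added to custom_system_prompt.md below; _PromptStub and its attribute values are illustrative, not the project's actual class:

from langchain_core.messages import SystemMessage

class _PromptStub:
    # Stand-in exposing only the attributes get_system_message reads.
    prompt_template = "Use maximum {max_actions} actions per sequence.\n\nAvailable Actions:\n{available_actions}"
    max_actions_per_step = 10
    default_action_description = "go_to_url, click_element, input_text, extract_content"

    def get_system_message(self) -> SystemMessage:
        prompt = self.prompt_template.format(max_actions=self.max_actions_per_step,
                                             available_actions=self.default_action_description)
        return SystemMessage(content=prompt)

print(_PromptStub().get_system_message().content)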

src/agent/custom_system_prompt.md

+6 -2

@@ -30,7 +30,7 @@ Example:
   ]
 }}
 
-2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
+2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence.
 Common action sequences:
 - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
 - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
@@ -39,6 +39,7 @@ Common action sequences:
 - Only provide the action sequence until an action which changes the page state significantly.
 - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
 - only use multiple actions if it makes sense.
+- Only chose from below available actions.
 
 3. ELEMENT INTERACTION:
 - Only use indexes of the interactive elements
@@ -73,4 +74,7 @@ Common action sequences:
 
 9. Extraction:
 - If your task is to find information - call extract_content on the specific pages to get and store the information.
-Your responses must be always JSON with the specified format.
+Your responses must be always JSON with the specified format.
+
+Available Actions:
+{available_actions}
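
Why the brace change in the ACTIONS rule matters, in a minimal sketch (assuming the template is rendered with str.format, as get_system_message above does): doubled braces survive formatting as literal braces, so the old {{max_actions}} would have reached the model verbatim instead of being replaced by the configured limit.

template = 'Use maximum {max_actions} actions. Example item: {{"click_element": {{"index": 3}}}}'
print(template.format(max_actions=10))
# Use maximum 10 actions. Example item: {"click_element": {"index": 3}}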

tests/test_browser_use.py

+19 -19

@@ -118,26 +118,26 @@ async def test_browser_use_custom():
     #     api_key=os.getenv("OPENAI_API_KEY", ""),
     # )
 
+    llm = utils.get_llm_model(
+        provider="azure_openai",
+        model_name="gpt-4o",
+        temperature=0.5,
+        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+    )
+
     # llm = utils.get_llm_model(
-    #     provider="azure_openai",
-    #     model_name="gpt-4o",
+    #     provider="google",
+    #     model_name="gemini-2.0-flash",
     #     temperature=0.6,
-    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
-    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
+    #     api_key=os.getenv("GOOGLE_API_KEY", "")
     # )
 
-    llm = utils.get_llm_model(
-        provider="google",
-        model_name="gemini-2.0-flash",
-        temperature=0.6,
-        api_key=os.getenv("GOOGLE_API_KEY", "")
-    )
-
-    llm = utils.get_llm_model(
-        provider="deepseek",
-        model_name="deepseek-reasoner",
-        temperature=0.8
-    )
+    # llm = utils.get_llm_model(
+    #     provider="deepseek",
+    #     model_name="deepseek-reasoner",
+    #     temperature=0.8
+    # )
 
     # llm = utils.get_llm_model(
     #     provider="deepseek",
@@ -156,9 +156,9 @@ async def test_browser_use_custom():
     controller = CustomController()
     use_own_browser = True
     disable_security = True
-    use_vision = False  # Set to False when using DeepSeek
+    use_vision = True  # Set to False when using DeepSeek
 
-    max_actions_per_step = 1
+    max_actions_per_step = 10
     playwright = None
     browser = None
     browser_context = None
@@ -193,7 +193,7 @@ async def test_browser_use_custom():
         )
     )
     agent = CustomAgent(
-        task="Give me stock price of Nvidia",
+        task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
        add_infos="",  # some hints for llm to complete the task
        llm=llm,
        browser=browser,
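
The test changes work together: the task now asks for three separate tabs, and max_actions_per_step is raised from 1 to 10 so the agent can emit the whole tab-opening sequence in one step. A hypothetical action sequence in the JSON shape the system prompt describes (the open_tab action name is assumed from browser-use's default controller and is not shown in this diff):

expected_actions = [
    {"go_to_url": {"url": "https://www.youtube.com"}},  # tab 1: YouTube
    {"open_tab": {"url": "https://mail.google.com"}},   # tab 2: Gmail
    {"open_tab": {"url": "https://www.facebook.com"}},  # tab 3: Facebook
]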

webui.py

+4 -2

@@ -332,7 +332,7 @@ async def run_org_agent(
     try:
         global _global_browser, _global_browser_context, _global_agent
 
-        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
         cdp_url = chrome_cdp
 
         if use_own_browser:
@@ -362,6 +362,7 @@ async def run_org_agent(
                 config=BrowserContextConfig(
                     trace_path=save_trace_path if save_trace_path else None,
                     save_recording_path=save_recording_path if save_recording_path else None,
+                    save_downloads_path="./tmp/downloads",
                     no_viewport=False,
                     browser_window_size=BrowserContextWindowSize(
                         width=window_w, height=window_h
@@ -435,7 +436,7 @@ async def run_custom_agent(
     try:
         global _global_browser, _global_browser_context, _global_agent
 
-        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
+        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
         cdp_url = chrome_cdp
         if use_own_browser:
             cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
@@ -470,6 +471,7 @@ async def run_custom_agent(
                     trace_path=save_trace_path if save_trace_path else None,
                     save_recording_path=save_recording_path if save_recording_path else None,
                     no_viewport=False,
+                    save_downloads_path="./tmp/downloads",
                     browser_window_size=BrowserContextWindowSize(
                         width=window_w, height=window_h
                     ),
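
Both code paths now pass save_downloads_path="./tmp/downloads" to BrowserContextConfig. A minimal sketch (not part of the commit) of preparing that directory before the context is created, using the same relative path:

import os

# Create the download target ahead of time so saved files have somewhere to land.
os.makedirs("./tmp/downloads", exist_ok=True)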
