browser-use · pratham4434 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -71,11 +71,11 @@ ENV BROWSER_USE_LOGGING_LEVEL=info
 ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
 ENV ANONYMIZED_TELEMETRY=false
 ENV DISPLAY=:99
-ENV RESOLUTION=1920x1080x24
+ENV RESOLUTION=960x540x24
 ENV VNC_PASSWORD=vncpassword
 ENV CHROME_PERSISTENT_SESSION=true
-ENV RESOLUTION_WIDTH=1920
-ENV RESOLUTION_HEIGHT=1080
+ENV RESOLUTION_WIDTH=960
+ENV RESOLUTION_HEIGHT=540
 
 # Set up supervisor configuration
 RUN mkdir -p /var/log/supervisor

diff --git a/custom_theme.py b/custom_theme.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from gradio.themes.base import Base
+from gradio.themes.utils import colors, fonts, sizes
+
+
+class custom_theme(Base):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.blue,
+        secondary_hue: colors.Color | str = colors.sky,
+        neutral_hue: colors.Color | str = colors.gray,
+        spacing_size: sizes.Size | str = sizes.spacing_md,
+        radius_size: sizes.Size | str = sizes.radius_lg,
+        text_size: sizes.Size | str = sizes.text_md,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Montserrat"),
+            "ui-sans-serif",
+            "system-ui",
+            "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Inter"),
+            "ui-monospace",
+            "Consolas",
+            "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            spacing_size=spacing_size,
+            radius_size=radius_size,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        self.name = "custom_theme"
+        super().set(
+            button_border_width="0px",
+            checkbox_label_border_width="1px",
+            button_transform_hover="scale(1.02)",
+            button_transition="all 0.1s ease-in-out",
+            slider_color="*primary_400",
+            button_primary_background_fill="linear-gradient(120deg, *secondary_500 0%, *primary_300 60%, *primary_400 100%)",
+            button_primary_background_fill_hover="linear-gradient(120deg, *secondary_400 0%, *primary_300 60%, *primary_300 100%)",
+            button_primary_text_color="*button_secondary_text_color",
+            button_secondary_background_fill="linear-gradient(120deg, *neutral_300 0%, *neutral_100 60%, *neutral_200 100%)",
+            button_secondary_background_fill_hover="linear-gradient(120deg, *neutral_200 0%, *neutral_100 60%, *neutral_100 100%)",
+            checkbox_label_background_fill_selected="linear-gradient(120deg, *primary_400 0%, *primary_300 60%, *primary_400 100%)",
+            checkbox_label_border_color_selected="*primary_400",
+            checkbox_background_color_selected="*primary_400",
+            checkbox_label_text_color_selected="*button_secondary_text_color",
+            slider_color_dark="*primary_500",
+            button_primary_background_fill_dark="linear-gradient(120deg, *secondary_600 0%, *primary_500 60%, *primary_600 100%)",
+            button_primary_background_fill_hover_dark="linear-gradient(120deg, *secondary_500 0%, *primary_500 60%, *primary_500 100%)",
+            button_primary_text_color_dark="*button_secondary_text_color",
+            button_secondary_background_fill_dark="linear-gradient(120deg, *neutral_700 0%, *neutral_600 60%, *neutral_700 100%)",
+            button_secondary_background_fill_hover_dark="linear-gradient(120deg, *neutral_600 0%, *neutral_600 60%, *neutral_700 100%)",
+            checkbox_label_background_fill_selected_dark="linear-gradient(120deg, *primary_600 0%, *primary_500 60%, *primary_600 100%)",
+            checkbox_label_border_color_selected_dark="*primary_600",
+            checkbox_background_color_selected_dark="*primary_600",
+            checkbox_label_text_color_selected_dark="*button_secondary_text_color",
+            block_shadow="*shadow_drop_lg",
+            button_secondary_shadow_hover="*shadow_drop_lg",
+            button_primary_shadow_hover="0 1px 3px 0 *primary_200, 0 1px 2px -1px *primary_200",
+            button_secondary_shadow_dark="none",
+            button_primary_shadow_dark="none",
+        )
diff --git a/docker-setup.ps1 b/docker-setup.ps1
@@ -0,0 +1,2 @@
+# Build the "agent" image from the current directory, then run it with the
+# web UI (7788), noVNC (6080) and VNC (5901) ports published to the host.
+docker build -t agent .
+# PowerShell keeps going after a native command fails, so stop here rather
+# than starting a stale (or missing) image.
+if ($LASTEXITCODE -ne 0) { throw "docker build failed with exit code $LASTEXITCODE" }
+docker run -p 7788:7788 -p 6080:6080 -p 5901:5901 agent
diff --git a/logo.png b/logo.png
diff --git a/prompts/prompt.text b/prompts/prompt.text
@@ -0,0 +1,47 @@
+Step-by-Step Instructions
+
+1. Navigate to Athenahealth Preview Environment
+   - Open a web browser and go to: https://preview.athenahealth.com/
+
+2. Log In
+   - Enter the credentials:
+     - Username: p-bkumar1
+     - Password: Xcaliber@12345
+   - Click the Login button.
+
+3. Select the Default Department
+   - If prompted, choose the default department from the list (e.g., "7 Hills Department").
+
+4. Access the "Patients" Menu
+   - Locate the header at the top of the dashboard.
+   - Click on the "Patients" menu to open the dropdown.
+
+5. Access Document Search
+   - Option 1 (Primary Attempt)
+     - In the dropdown, look for "Documents > Document Search" and click it.
+     - If the primary attempt fails (error 404 or element not found):
+       - Refresh the page.
+       - Retry clicking "Document Search" (up to 3 times with 2-second intervals).
+
+6. Handle Iframes (Fallback Approach)
+   - Use the following sequence if Document Search is nested in iframes:
+     - Switch to the main iframe context:
+       - Locate and switch to iframe[name="frMain"].
+     - Switch to the sub-iframe:
+       - Locate and switch to iframe[id="searchFrame"] or iframe[name="frMain"] > iframe (if nested).
+     - Fill the DOCUMENTID and click Search:
+       - Enter the value "116873" in the DOCUMENTID field.
+       - Click the "Search" button.
+       - Retry up to 3 times:
+         - Wait 2 seconds between each retry if elements are missing.
+
+7. Observe and Report
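+   - After clicking "Document Search" or executing the iframe fallback, observe the resulting page and report whether the document search results (the patient lab reports) are displayed.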
+
+Common Issues and Solutions
+- Element Not Found: Ensure the iframe is fully loaded (wait for 5–10 seconds).
+- Button Not Clickable: Try force-clicking the button again.
+- Network Errors: Verify your internet connection and retry the login process.
+
+Result
+- After following the steps, patient lab reports will be displayed. The task is completed successfully.
diff --git a/setup.ps1 b/setup.ps1
@@ -0,0 +1,18 @@
+# Step 1: Remove any previous environment so the install starts clean
+# `deactivate` is only defined while a virtual environment is active in this
+# session; calling it unconditionally raises a command-not-found error.
+if (Get-Command deactivate -ErrorAction SilentlyContinue) {
+    deactivate
+}
+
+# On a fresh checkout there is no .venv yet, so only delete it when present.
+if (Test-Path .venv) {
+    Remove-Item -Recurse -Force .venv
+}
+
+# Step 2: Set Up Python Environment
+uv venv --python 3.11
+# Native command failures do not stop a PowerShell script; check explicitly.
+if ($LASTEXITCODE -ne 0) { throw "uv venv failed with exit code $LASTEXITCODE" }
+
+# Activate the virtual environment
+.\.venv\Scripts\Activate.ps1
+
+# Step 3: Install Dependencies
+uv pip install -r requirements.txt
+if ($LASTEXITCODE -ne 0) { throw "uv pip install failed with exit code $LASTEXITCODE" }
+playwright install
+if ($LASTEXITCODE -ne 0) { throw "playwright install failed with exit code $LASTEXITCODE" }
+
+# Report success before launching: the web UI command below blocks until the
+# server is stopped, so a message placed after it would only appear on exit.
+Write-Output "Setup complete. Virtual environment activated."
+
+# Step 4: Run web ui in local
+python webui.py --ip 127.0.0.1 --port 7788
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py
@@ -53,15 +53,15 @@ def __init__(
             browser: Browser | None = None,
             browser_context: BrowserContext | None = None,
             controller: Controller = Controller(),
-            use_vision: bool = True,
+            use_vision: bool = False,
             use_vision_for_planner: bool = False,
             save_conversation_path: Optional[str] = None,
             save_conversation_path_encoding: Optional[str] = 'utf-8',
-            max_failures: int = 3,
+            max_failures: int = 5,
             retry_delay: int = 10,
             system_prompt_class: Type[SystemPrompt] = SystemPrompt,
             agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
-            max_input_tokens: int = 128000,
+            max_input_tokens: int = 1280000,
             validate_output: bool = False,
             message_context: Optional[str] = None,
             generate_gif: bool | str = True,
@@ -281,8 +281,8 @@ async def _run_planner(self) -> Optional[str]:
             planner_messages[-1] = HumanMessage(content=new_msg)
 
         # Get planner output
-        response = await self.planner_llm.ainvoke(planner_messages)
-        plan = response.content
+        response = await self.planner_llm.ainvoke(planner_messages)
+        plan = response.content
         last_state_message = planner_messages[-1]
         # remove image from last state message
         if isinstance(last_state_message.content, list):

diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py
@@ -43,6 +43,12 @@ def important_rules(self) -> str:
        {"go_to_url": {"url": "https://example.com"}},
        {"extract_page_content": {}}
      ]
+     - Iframe interaction: [
+           {"switch_frame": {"frame_name": "GlobalNav"}},
+           {"click_element": {"index": 1}},
+           {"switch_frame": {"frame_name": "frameContent"}},
+           {"click_element": {"index": 2}}
+         ]
 
 
 3. ELEMENT INTERACTION:
@@ -82,8 +88,39 @@ def important_rules(self) -> str:
    - Only provide the action sequence until you think the page will change.
    - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
    - only use multiple actions if it makes sense. 
-
-9. Extraction:
+9. IFrames:
+         - Identify iframes using their names or unique identifiers
+         - Switch to iframes before interacting with nested elements
+         - Use frame locators for element interaction within iframes
+         - Example action sequence for iframe interaction:
+             [
+               {"switch_frame": {"frame_name": "GlobalNav"}},
+               {"click_element": {"index": 1}},
+               {"switch_frame": {"frame_name": "frameContent"}},
+               {"click_element": {"index": 2}}
+             ]
+           - Always return to the main frame after iframe operations
+           - Handle nested iframes by chaining switch_frame actions
+        10. Action Sequencing for Iframes:
+       - Always start iframe interactions with switch_frame
+       - Perform all element interactions within the iframe context
+       - Use back_to_main_frame after completing iframe operations
+       - For nested iframes, chain switch_frame actions
+       - Example nested iframe sequence:
+         [
+           {"switch_frame": {"frame_name": "outerFrame"}},
+           {"switch_frame": {"frame_name": "innerFrame"}},
+           {"click_element": {"index": 1}},
+           {"back_to_main_frame": {}}
+         ]
+
+    11. Visual Context for Iframes:
+       - Bounding boxes for iframe elements will have frame name labels
+       - Example: [GlobalNav] <button>Patients</button>
+       - Use frame labels to identify element context
+       - Elements without frame labels are in the main page
+
+12. Extraction:
     - If your task is to find information or do research - call extract_content on the specific pages to get and store the information.
 
 """

diff --git a/src/utils/default_config_settings.py b/src/utils/default_config_settings.py
@@ -10,7 +10,7 @@ def default_config():
         "agent_type": "custom",
         "max_steps": 100,
         "max_actions_per_step": 10,
-        "use_vision": True,
+        "use_vision": False,
         "tool_calling_method": "auto",
         "llm_provider": "openai",
         "llm_model_name": "gpt-4o",

diff --git a/supervisord.conf b/supervisord.conf
@@ -1,3 +1,100 @@
+# [supervisord]
+# user=root
+# nodaemon=true
+# logfile=/dev/stdout
+# logfile_maxbytes=0
+# loglevel=debug
+
+# [program:xvfb]
+# command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=100
+# startsecs=3
+# stopsignal=TERM
+# stopwaitsecs=10
+
+# [program:vnc_setup]
+# command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
+# autorestart=false
+# startsecs=0
+# priority=150
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+
+# [program:x11vnc]
+# command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -bg -rfbport 5901 -o /var/log/x11vnc.log"
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=200
+# startretries=10
+# startsecs=10
+# stopsignal=TERM
+# stopwaitsecs=10
+# depends_on=vnc_setup,xvfb
+
+# [program:x11vnc_log]
+# command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log"
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=250
+# stopsignal=TERM
+# stopwaitsecs=5
+# depends_on=x11vnc
+
+# [program:novnc]
+# command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5901 --listen 0.0.0.0:6080 --web /opt/novnc --http-header='Content-Security-Policy: frame-ancestors http://localhost:7788/'"
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=300
+# startretries=5
+# startsecs=3
+# depends_on=x11vnc
+
+# [program:persistent_browser]
+# environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
+# command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=350
+# startretries=5
+# startsecs=10
+# stopsignal=TERM
+# stopwaitsecs=15
+# depends_on=novnc
+
+# [program:webui]
+# command=python webui.py --ip 0.0.0.0 --port 7788
+# directory=/app
+# autorestart=true
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
+# priority=400
+# startretries=3
+# startsecs=3
+# stopsignal=TERM
+# stopwaitsecs=10
+# depends_on=persistent_browser
+
 [supervisord]
 user=root
 nodaemon=true
@@ -17,18 +114,18 @@ startsecs=3
 stopsignal=TERM
 stopwaitsecs=10
 
-[program:vnc_setup]
-command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
-autorestart=false
-startsecs=0
-priority=150
-stdout_logfile=/dev/stdout
-stdout_logfile_maxbytes=0
-stderr_logfile=/dev/stderr
-stderr_logfile_maxbytes=0
+# [program:vnc_setup]
+# command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
+# autorestart=false
+# startsecs=0
+# priority=150
+# stdout_logfile=/dev/stdout
+# stdout_logfile_maxbytes=0
+# stderr_logfile=/dev/stderr
+# stderr_logfile_maxbytes=0
 
 [program:x11vnc]
-command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5901 -o /var/log/x11vnc.log"
+command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -nopw -forever -shared -bg -rfbport 5901 -o /var/log/x11vnc.log"
 autorestart=true
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
@@ -39,7 +136,7 @@ startretries=10
 startsecs=10
 stopsignal=TERM
 stopwaitsecs=10
-depends_on=vnc_setup,xvfb
+depends_on=xvfb
 
 [program:x11vnc_log]
 command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log"
@@ -93,4 +190,4 @@ startretries=3
 startsecs=3
 stopsignal=TERM
 stopwaitsecs=10
-depends_on=persistent_browser
+depends_on=persistent_browser
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		docker build -t agent .
		docker run -p 7788:7788 -p 6080:6080 -p 5901:5901 agent