Hello
+Alpha paragraph.
+Beta paragraph.
+diff --git a/README.md b/README.md index b7204cc..81940c0 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,25 @@ Selectors remain supported, but ref-based interaction is the preferred 2.0 flow: 2. act on `@eN` refs 3. re-snapshot +### Stateless Web Fetch + +```elixir +{:ok, result} = + Jido.Browser.web_fetch( + "https://example.com/docs", + format: :markdown, + allowed_domains: ["example.com"], + focus_terms: ["API", "authentication"], + citations: true + ) + +result.content +result.passages +result.metadata # present when extraction returns document metadata +``` + +`web_fetch/2` keeps HTML handling native for selector extraction and markdown conversion, and uses `extractous_ex` for fetched binary documents such as PDFs, Word, Excel, PowerPoint, OpenDocument, EPUB, and common email formats. Binary document responses may also include `result.metadata` when extraction returns document metadata. + ### State Persistence ```elixir @@ -143,6 +162,19 @@ config :jido_browser, :web, profile: "default" ``` +Optional web fetch settings: + +```elixir +config :jido_browser, :web_fetch, + cache_ttl_ms: 300_000, + extractous: [ + pdf: [extract_annotation_text: true], + office: [include_headers_and_footers: true] + ] +``` + +Configured `extractous` options are merged with any per-call `extractous:` keyword options passed to `Jido.Browser.web_fetch/2`. + ## Backends ### AgentBrowser (Default) @@ -173,6 +205,7 @@ Core operations: - `type/4` - `screenshot/2` - `extract_content/2` +- `web_fetch/2` - `evaluate/3` Agent-browser-native operations: @@ -252,6 +285,7 @@ Agent-browser-native operations: - `ReadPage` - `SnapshotUrl` - `SearchWeb` +- `WebFetch` ## Using With Jido Agents diff --git a/lib/jido_browser.ex b/lib/jido_browser.ex index 58f3e3a..39a4424 100644 --- a/lib/jido_browser.ex +++ b/lib/jido_browser.ex @@ -8,11 +8,13 @@ defmodule Jido.Browser do alias Jido.Browser.Error alias Jido.Browser.Session + alias Jido.Browser.WebFetch @default_adapter Jido.Browser.Adapters.AgentBrowser @default_timeout 30_000 @supported_screenshot_formats [:png] @supported_extract_formats [:markdown, :html, :text] + @supported_web_fetch_formats [:markdown, :html, :text] @doc "Starts a browser session using the configured adapter or an explicit adapter override." @spec start_session(keyword()) :: {:ok, Session.t()} | {:error, term()} @@ -107,6 +109,34 @@ defmodule Jido.Browser do end end + @doc """ + Fetches a URL over HTTP(S) without starting a browser session. + + HTML responses keep native selector extraction and format conversion, while + fetched binary documents such as PDFs and office files are extracted through + `ExtractousEx`. + """ + @spec web_fetch(String.t(), keyword()) :: {:ok, map()} | {:error, term()} + def web_fetch(url, opts \\ []) + + def web_fetch(url, _opts) when url in [nil, ""] do + {:error, Error.invalid_error("URL cannot be nil or empty", %{url: url})} + end + + def web_fetch(url, opts) when is_binary(url) do + format = opts[:format] || :markdown + + if format in @supported_web_fetch_formats do + WebFetch.fetch(url, normalize_timeout(opts)) + else + {:error, + Error.invalid_error("Unsupported web fetch format: #{inspect(format)}", %{ + format: format, + supported: @supported_web_fetch_formats + })} + end + end + @doc "Evaluates JavaScript in the browser when the adapter supports it." @spec evaluate(Session.t(), String.t(), keyword()) :: {:ok, Session.t(), map()} | {:error, term()} diff --git a/lib/jido_browser/actions/web_fetch.ex b/lib/jido_browser/actions/web_fetch.ex new file mode 100644 index 0000000..fa79361 --- /dev/null +++ b/lib/jido_browser/actions/web_fetch.ex @@ -0,0 +1,89 @@ +defmodule Jido.Browser.Actions.WebFetch do + @moduledoc """ + Stateless HTTP-first document retrieval for agent workflows. + + `WebFetch` is a lighter-weight alternative to browser navigation when the + target content can be retrieved over plain HTTP(S) without JavaScript + execution, including fetched PDFs and office-style documents. + """ + + use Jido.Action, + name: "web_fetch", + description: + "Fetch a URL over HTTP(S) with domain policy controls, Extractous-backed document extraction, " <> + "optional focused filtering, approximate token caps, and citation-ready passages.", + category: "Browser", + tags: ["browser", "web", "fetch", "http", "retrieval"], + vsn: "2.0.0", + schema: [ + url: [type: :string, required: true, doc: "The URL to fetch"], + format: [type: {:in, [:markdown, :text, :html]}, default: :markdown, doc: "Output format"], + selector: [type: :string, doc: "Optional CSS selector for HTML pages"], + allowed_domains: [type: {:list, :string}, default: [], doc: "Allow-list of host or host/path rules"], + blocked_domains: [type: {:list, :string}, default: [], doc: "Block-list of host or host/path rules"], + focus_terms: [type: {:list, :string}, default: [], doc: "Terms used to filter the fetched document"], + focus_window: [type: :integer, default: 0, doc: "Paragraph window around each focus match"], + max_content_tokens: [type: :integer, doc: "Approximate token cap for returned content"], + citations: [type: :boolean, default: false, doc: "Include citation-ready passage offsets"], + cache: [type: :boolean, default: true, doc: "Reuse cached fetch results when available"], + timeout: [type: :integer, doc: "Receive timeout in milliseconds"], + require_known_url: [type: :boolean, default: false, doc: "Require the URL to already be present in tool context"], + known_urls: [type: {:list, :string}, default: [], doc: "Additional known URLs accepted for provenance checks"], + max_uses: [type: :integer, doc: "Maximum successful web fetch calls allowed in current skill state"] + ] + + alias Jido.Browser.Error + + @impl true + def run(params, context) do + with :ok <- validate_max_uses(params, context), + {:ok, result} <- Jido.Browser.web_fetch(params.url, build_opts(params, context)) do + {:ok, Map.put(result, :status, "success")} + else + {:error, error} -> + {:error, error} + end + end + + defp build_opts(params, context) do + known_urls = + (Map.get(params, :known_urls, []) || []) + |> Kernel.++(get_in(context, [:skill_state, :seen_urls]) || []) + |> Enum.uniq() + + [] + |> maybe_put(:format, Map.get(params, :format, :markdown)) + |> maybe_put(:selector, params[:selector]) + |> maybe_put(:allowed_domains, Map.get(params, :allowed_domains, [])) + |> maybe_put(:blocked_domains, Map.get(params, :blocked_domains, [])) + |> maybe_put(:focus_terms, Map.get(params, :focus_terms, [])) + |> maybe_put(:focus_window, Map.get(params, :focus_window, 0)) + |> maybe_put(:max_content_tokens, params[:max_content_tokens]) + |> maybe_put(:citations, Map.get(params, :citations, false)) + |> maybe_put(:cache, Map.get(params, :cache, true)) + |> maybe_put(:timeout, params[:timeout]) + |> maybe_put(:require_known_url, Map.get(params, :require_known_url, false)) + |> maybe_put(:known_urls, known_urls) + end + + defp validate_max_uses(%{max_uses: max_uses}, context) when is_integer(max_uses) and max_uses >= 0 do + current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0 + + if current_uses >= max_uses do + {:error, + Error.invalid_error("Web fetch max uses exceeded", %{ + error_code: :max_uses_exceeded, + max_uses: max_uses, + current_uses: current_uses + })} + else + :ok + end + end + + defp validate_max_uses(_params, _context), do: :ok + + defp maybe_put(opts, _key, nil), do: opts + defp maybe_put(opts, _key, []), do: opts + defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value) +end diff --git a/lib/jido_browser/plugin.ex b/lib/jido_browser/plugin.ex index 4e99408..a99f841 100644 --- a/lib/jido_browser/plugin.ex +++ b/lib/jido_browser/plugin.ex @@ -36,6 +36,7 @@ require Jido.Browser.Actions.WaitForSelector require Jido.Browser.Actions.ReadPage require Jido.Browser.Actions.SnapshotUrl require Jido.Browser.Actions.SearchWeb +require Jido.Browser.Actions.WebFetch defmodule Jido.Browser.Plugin do @moduledoc """ @@ -119,7 +120,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions (manage own session) Jido.Browser.Actions.ReadPage, Jido.Browser.Actions.SnapshotUrl, - Jido.Browser.Actions.SearchWeb + Jido.Browser.Actions.SearchWeb, + Jido.Browser.Actions.WebFetch ], description: "Browser automation for web navigation, interaction, and content extraction", category: "browser", @@ -136,7 +138,9 @@ defmodule Jido.Browser.Plugin do viewport: Map.get(config, :viewport, %{width: 1280, height: 720}), base_url: Map.get(config, :base_url), last_url: nil, - last_title: nil + last_title: nil, + seen_urls: [], + web_fetch_uses: 0 } {:ok, initial_state} @@ -151,7 +155,9 @@ defmodule Jido.Browser.Plugin do viewport: Zoi.any(description: "Browser viewport dimensions") |> Zoi.optional(), base_url: Zoi.string(description: "Base URL for relative navigation") |> Zoi.optional(), last_url: Zoi.string(description: "Last navigated URL") |> Zoi.optional(), - last_title: Zoi.string(description: "Last page title") |> Zoi.optional() + last_title: Zoi.string(description: "Last page title") |> Zoi.optional(), + seen_urls: Zoi.array(Zoi.string(description: "Known URLs discovered during tool use")) |> Zoi.default([]), + web_fetch_uses: Zoi.integer(description: "Successful web fetch calls in current skill state") |> Zoi.default(0) }) end @@ -204,7 +210,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions {"browser.read_page", Jido.Browser.Actions.ReadPage}, {"browser.snapshot_url", Jido.Browser.Actions.SnapshotUrl}, - {"browser.search_web", Jido.Browser.Actions.SearchWeb} + {"browser.search_web", Jido.Browser.Actions.SearchWeb}, + {"browser.web_fetch", Jido.Browser.Actions.WebFetch} ] end @@ -214,22 +221,17 @@ defmodule Jido.Browser.Plugin do end @impl Jido.Plugin - def transform_result(_action, {:ok, result}, _context) when is_map(result) do - case Map.get(result, :session) do - %Jido.Browser.Session{} = session -> - current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url]) - current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title]) - - state_updates = %{ - session: session, - last_url: current_url, - last_title: current_title - } - - {:ok, result, state_updates} + def transform_result(action, {:ok, result}, context) when is_map(result) do + state_updates = + %{} + |> maybe_put_session_state(result) + |> maybe_put_seen_urls(result, context) + |> maybe_increment_web_fetch_uses(action, context) - _ -> - {:ok, result} + if map_size(state_updates) == 0 do + {:ok, result} + else + {:ok, result, state_updates} end end @@ -260,6 +262,67 @@ defmodule Jido.Browser.Plugin do end end + defp maybe_put_session_state(acc, result) do + case Map.get(result, :session) do + %Jido.Browser.Session{} = session -> + current_url = Map.get(result, :url) || Map.get(result, "url") || get_in(session, [:connection, :current_url]) + current_title = Map.get(result, :title) || Map.get(result, "title") || get_in(session, [:connection, :title]) + + Map.merge(acc, %{ + session: session, + last_url: current_url, + last_title: current_title + }) + + _ -> + acc + end + end + + defp maybe_put_seen_urls(acc, result, context) do + current_seen_urls = get_in(context, [:skill_state, :seen_urls]) || [] + + seen_urls = + current_seen_urls + |> Kernel.++(extract_urls(result)) + |> Enum.reject(&nil_or_empty?/1) + |> Enum.uniq() + + if seen_urls == [] or seen_urls == current_seen_urls do + acc + else + Map.put(acc, :seen_urls, seen_urls) + end + end + + defp maybe_increment_web_fetch_uses(acc, Jido.Browser.Actions.WebFetch, context) do + current_uses = get_in(context, [:skill_state, :web_fetch_uses]) || 0 + Map.put(acc, :web_fetch_uses, current_uses + 1) + end + + defp maybe_increment_web_fetch_uses(acc, _action, _context), do: acc + + defp extract_urls(result) do + direct_urls = + [Map.get(result, :url), Map.get(result, "url"), Map.get(result, :final_url), Map.get(result, "final_url")] + |> Enum.reject(&nil_or_empty?/1) + + search_urls = + result + |> Map.get(:results, Map.get(result, "results", [])) + |> List.wrap() + |> Enum.map(fn item -> + if is_map(item), do: Map.get(item, :url) || Map.get(item, "url") + end) + |> Enum.reject(&nil_or_empty?/1) + + direct_urls ++ search_urls + end + + defp nil_or_empty?(nil), do: true + defp nil_or_empty?(""), do: true + defp nil_or_empty?(_value), do: false + def signal_patterns do [ # Session lifecycle @@ -308,7 +371,8 @@ defmodule Jido.Browser.Plugin do # Self-contained composite actions "browser.read_page", "browser.snapshot_url", - "browser.search_web" + "browser.search_web", + "browser.web_fetch" ] end end diff --git a/lib/jido_browser/web_fetch.ex b/lib/jido_browser/web_fetch.ex new file mode 100644 index 0000000..1eae9aa --- /dev/null +++ b/lib/jido_browser/web_fetch.ex @@ -0,0 +1,1138 @@ +defmodule Jido.Browser.WebFetch do + @moduledoc """ + Stateless HTTP-first web retrieval with optional domain policy, caching, + focused filtering, citation-ready passage metadata, and Extractous-backed + document extraction. + + This module is intended for document retrieval workloads where starting a full + browser session would be unnecessary or too expensive. + """ + + alias Jido.Browser.Error + + @cache_table :jido_browser_web_fetch_cache + @default_timeout 15_000 + @default_max_redirects 5 + @default_cache_ttl_ms 300_000 + @default_max_url_length 2_048 + @supported_formats [:markdown, :text, :html] + @html_content_types ["text/html", "application/xhtml+xml"] + @text_content_types [ + "text/plain", + "text/markdown", + "text/csv", + "text/xml", + "application/xml", + "application/json", + "application/ld+json" + ] + @document_content_types %{ + "application/pdf" => :pdf, + "application/msword" => :word_processing, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => :word_processing, + "application/vnd.ms-word.document.macroenabled.12" => :word_processing, + "application/vnd.openxmlformats-officedocument.wordprocessingml.template" => :word_processing, + "application/vnd.ms-word.template.macroenabled.12" => :word_processing, + "application/vnd.ms-excel" => :spreadsheet, + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => :spreadsheet, + "application/vnd.ms-excel.sheet.macroenabled.12" => :spreadsheet, + "application/vnd.openxmlformats-officedocument.spreadsheetml.template" => :spreadsheet, + "application/vnd.ms-excel.template.macroenabled.12" => :spreadsheet, + "application/vnd.ms-powerpoint" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => :presentation, + "application/vnd.ms-powerpoint.presentation.macroenabled.12" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.slideshow" => :presentation, + "application/vnd.openxmlformats-officedocument.presentationml.template" => :presentation, + "application/vnd.oasis.opendocument.text" => :word_processing, + "application/vnd.oasis.opendocument.spreadsheet" => :spreadsheet, + "application/vnd.oasis.opendocument.presentation" => :presentation, + "application/rtf" => :word_processing, + "text/rtf" => :word_processing, + "application/epub+zip" => :ebook, + "message/rfc822" => :email, + "application/vnd.ms-outlook" => :email + } + @document_extensions %{ + "pdf" => :pdf, + "doc" => :word_processing, + "docx" => :word_processing, + "docm" => :word_processing, + "dotx" => :word_processing, + "dotm" => :word_processing, + "odt" => :word_processing, + "rtf" => :word_processing, + "xls" => :spreadsheet, + "xlsx" => :spreadsheet, + "xlsm" => :spreadsheet, + "xlsb" => :spreadsheet, + "ods" => :spreadsheet, + "ppt" => :presentation, + "pptx" => :presentation, + "pptm" => :presentation, + "ppsx" => :presentation, + "odp" => :presentation, + "epub" => :ebook, + "eml" => :email, + "msg" => :email + } + + @type result :: %{ + required(:url) => String.t(), + required(:final_url) => String.t(), + required(:content) => String.t(), + required(:format) => atom(), + required(:content_type) => String.t(), + required(:document_type) => atom(), + required(:retrieved_at) => String.t(), + required(:estimated_tokens) => non_neg_integer(), + required(:original_estimated_tokens) => non_neg_integer(), + required(:truncated) => boolean(), + required(:filtered) => boolean(), + required(:focus_matches) => non_neg_integer(), + required(:cached) => boolean(), + required(:citations) => %{enabled: boolean()}, + required(:passages) => list(map()), + optional(:title) => String.t() | nil, + optional(:metadata) => map() + } + + @doc """ + Fetches a URL over HTTP(S) and returns normalized document content. + + Supported options: + - `:format` - `:markdown`, `:text`, or `:html` + - `:selector` - CSS selector for HTML pages + - `:allowed_domains` / `:blocked_domains` - mutually exclusive host/path rules + - `:max_content_tokens` - approximate token cap + - `:citations` - boolean, when true include passage spans + - `:focus_terms` - list of terms used for focused filtering + - `:focus_window` - paragraph window around focus matches + - `:timeout` - receive timeout in milliseconds + - `:cache` - enable ETS cache, defaults to `true` + - `:cache_ttl_ms` - cache TTL in milliseconds + - `:require_known_url` / `:known_urls` - optional URL provenance guard + - `:extractous` - optional `ExtractousEx` keyword options merged with config + """ + @spec fetch(String.t(), keyword()) :: {:ok, result()} | {:error, Exception.t()} + def fetch(url, opts \\ []) + + def fetch(url, opts) when is_binary(url) and is_list(opts) do + with {:ok, opts} <- normalize_opts(opts), + {:ok, normalized_url, uri} <- validate_url(url, opts), + :ok <- validate_known_url(normalized_url, opts), + :ok <- validate_domain_filters(uri, opts) do + case fetch_cached(normalized_url, opts) do + {:ok, result} -> + {:ok, result} + + :miss -> + do_fetch(normalized_url, opts) + end + end + end + + def fetch(url, _opts) do + {:error, Error.invalid_error("URL must be a non-empty string", %{error_code: :invalid_input, url: url})} + end + + @doc false + @spec clear_cache() :: :ok + def clear_cache do + case :ets.whereis(@cache_table) do + :undefined -> + :ok + + table -> + :ets.delete_all_objects(table) + :ok + end + end + + defp do_fetch(url, opts) do + request_opts = [ + url: url, + headers: request_headers(), + receive_timeout: opts[:timeout], + decode_body: false, + redirect: true, + max_redirects: opts[:max_redirects] + ] + + case Req.run(request_opts) do + {%Req.Request{} = request, %Req.Response{} = response} -> + with :ok <- validate_http_status(response, url), + {:ok, final_url, final_uri} <- normalize_final_url(request), + :ok <- validate_domain_filters(final_uri, opts), + {:ok, result} <- build_result(url, final_url, response, opts) do + maybe_store_cache(url, opts, result) + {:ok, result} + end + + {_request, %Req.TransportError{} = exception} -> + {:error, Error.adapter_error("Web fetch request failed", %{error_code: :url_not_accessible, reason: exception})} + + {_request, %Req.TooManyRedirectsError{} = exception} -> + {:error, + Error.adapter_error("Web fetch exceeded redirect limit", %{error_code: :url_not_accessible, reason: exception})} + + {_request, %_{} = exception} -> + {:error, Error.adapter_error("Web fetch failed", %{error_code: :unavailable, reason: exception})} + end + end + + defp build_result(url, final_url, response, opts) do + content_type = response_content_type(response) + document_type = extractable_document_type(content_type, final_url, response.body) + + cond do + content_type in @html_content_types -> + build_html_result(url, final_url, response.body, content_type, opts) + + not is_nil(document_type) -> + build_document_result(url, final_url, response.body, content_type, document_type, opts) + + text_content_type?(content_type) -> + build_text_result(url, final_url, response.body, content_type, opts) + + true -> + {:error, + Error.adapter_error("Unsupported content type for web fetch", %{ + error_code: :unsupported_content_type, + content_type: content_type + })} + end + end + + defp build_html_result(url, final_url, body, content_type, opts) when is_binary(body) do + selector = opts[:selector] + + with {:ok, document} <- parse_document(body), + {:ok, html} <- select_html(document, body, selector), + {:ok, title} <- extract_title(document), + {:ok, content} <- format_html(html, opts[:format], opts) do + finalize_result(url, final_url, content, title, content_type, :html, opts) + end + end + + defp build_html_result(_url, _final_url, body, content_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for HTML fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_text_result(url, final_url, body, content_type, opts) when is_binary(body) do + with :ok <- validate_non_html_options(content_type, opts), + {:ok, content} <- format_text(body, opts[:format]) do + finalize_result(url, final_url, content, nil, content_type, :text, opts) + end + end + + defp build_text_result(_url, _final_url, body, content_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for text fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_document_result(url, final_url, body, content_type, document_type, opts) when is_binary(body) do + with :ok <- validate_non_html_options(content_type, opts), + {:ok, text, metadata} <- extract_document_content(body, final_url, content_type, document_type, opts) do + finalize_result( + url, + final_url, + text, + document_title(metadata, final_url), + content_type, + document_type, + opts, + metadata + ) + end + end + + defp build_document_result(_url, _final_url, body, content_type, _document_type, _opts) do + {:error, + Error.adapter_error("Unexpected response body for document fetch", %{ + error_code: :unavailable, + content_type: content_type, + body: body + })} + end + + defp build_response(opts, attrs) do + passages = maybe_build_passages(attrs.content, attrs.title, attrs.final_url, opts[:citations]) + + %{ + url: attrs.url, + final_url: attrs.final_url, + title: attrs.title, + content: attrs.content, + format: opts[:format], + content_type: attrs.content_type, + document_type: attrs.document_type, + retrieved_at: retrieved_at(), + estimated_tokens: estimate_tokens(attrs.content), + original_estimated_tokens: attrs.original_estimated_tokens, + truncated: attrs.truncated, + filtered: attrs.filtered, + focus_matches: attrs.focus_matches, + cached: false, + citations: %{enabled: opts[:citations]}, + passages: passages + } + |> maybe_put_metadata(attrs.metadata) + end + + defp finalize_result(url, final_url, content, title, content_type, document_type, opts, metadata \\ nil) do + with {:ok, filtered_content, filtered, focus_matches} <- maybe_filter_content(content, opts), + {final_content, truncated, original_estimated_tokens} <- + maybe_truncate(filtered_content, opts[:max_content_tokens]) do + attrs = %{ + url: url, + final_url: final_url, + content: final_content, + title: title, + content_type: content_type, + document_type: document_type, + truncated: truncated, + filtered: filtered, + focus_matches: focus_matches, + original_estimated_tokens: original_estimated_tokens, + metadata: metadata + } + + {:ok, build_response(opts, attrs)} + end + end + + defp validate_non_html_options(content_type, opts) do + cond do + opts[:selector] -> + {:error, + Error.invalid_error("Selector filtering is only supported for HTML content", %{ + error_code: :invalid_input, + selector: opts[:selector], + content_type: content_type + })} + + opts[:format] == :html -> + {:error, + Error.invalid_error("HTML output is only supported for HTML content", %{ + error_code: :invalid_input, + format: :html, + content_type: content_type + })} + + true -> + :ok + end + end + + defp normalize_opts(opts) do + format = opts[:format] || :markdown + citations = normalize_citations(opts[:citations]) + focus_terms = normalize_focus_terms(opts[:focus_terms]) + + with {:ok, configured_extractous_opts} <- normalize_extractous_opts(config(:extractous, [])), + {:ok, request_extractous_opts} <- normalize_extractous_opts(Keyword.get(opts, :extractous, [])), + {:ok, selector} <- normalize_selector(opts[:selector]), + {:ok, focus_window} <- normalize_integer_opt(:focus_window, Keyword.get(opts, :focus_window, 0), min: 0), + {:ok, timeout} <- + normalize_integer_opt(:timeout, Keyword.get(opts, :timeout, config(:timeout, @default_timeout)), min: 1), + {:ok, max_redirects} <- + normalize_integer_opt(:max_redirects, Keyword.get(opts, :max_redirects, @default_max_redirects), min: 0), + {:ok, cache_ttl_ms} <- + normalize_integer_opt( + :cache_ttl_ms, + Keyword.get(opts, :cache_ttl_ms, config(:cache_ttl_ms, @default_cache_ttl_ms)), + min: 0 + ), + {:ok, max_content_tokens} <- + normalize_optional_integer_opt(:max_content_tokens, opts[:max_content_tokens], min: 1), + {:ok, max_url_length} <- normalize_optional_integer_opt(:max_url_length, opts[:max_url_length], min: 1), + {:ok, cache} <- normalize_boolean_opt(:cache, Keyword.get(opts, :cache, true)), + {:ok, require_known_url} <- + normalize_boolean_opt(:require_known_url, Keyword.get(opts, :require_known_url, false)) do + cond do + format not in @supported_formats -> + {:error, + Error.invalid_error("Unsupported web fetch format", %{ + error_code: :invalid_input, + format: format, + supported_formats: @supported_formats + })} + + present_domain_rules?(opts[:allowed_domains]) and present_domain_rules?(opts[:blocked_domains]) -> + {:error, + Error.invalid_error("Use either allowed_domains or blocked_domains, not both", %{ + error_code: :invalid_input + })} + + format == :html and focus_terms != [] -> + {:error, + Error.invalid_error("Focused filtering is only supported for markdown and text output", %{ + error_code: :invalid_input, + format: format + })} + + true -> + normalized = + opts + |> Keyword.put(:format, format) + |> Keyword.put(:selector, selector) + |> Keyword.put(:citations, citations) + |> Keyword.put(:focus_terms, focus_terms) + |> Keyword.put(:focus_window, focus_window) + |> Keyword.put(:timeout, timeout) + |> Keyword.put(:max_redirects, max_redirects) + |> Keyword.put(:cache, cache) + |> Keyword.put(:cache_ttl_ms, cache_ttl_ms) + |> Keyword.put(:require_known_url, require_known_url) + |> Keyword.put(:extractous, merge_extractous_opts(configured_extractous_opts, request_extractous_opts)) + |> maybe_put(:max_content_tokens, max_content_tokens) + |> maybe_put(:max_url_length, max_url_length) + |> Keyword.put_new(:known_urls, []) + + {:ok, normalized} + end + end + end + + defp validate_url(url, opts) do + normalized_url = String.trim(url) + max_url_length = opts[:max_url_length] || @default_max_url_length + + with :ok <- validate_url_length(normalized_url, max_url_length), + {:ok, uri} <- parse_fetch_uri(normalized_url), + :ok <- validate_uri_host(uri) do + {:ok, URI.to_string(uri), normalize_uri(uri)} + end + end + + defp validate_known_url(url, opts) do + known_urls = + opts[:known_urls] + |> List.wrap() + |> Enum.map(&normalize_known_url/1) + |> Enum.reject(&is_nil/1) + + if Keyword.get(opts, :require_known_url, false) do + if url in known_urls do + :ok + else + {:error, + Error.invalid_error("Web fetch URL must already be present in tool context", %{ + error_code: :url_not_allowed, + url: url + })} + end + else + :ok + end + end + + defp validate_domain_filters(%URI{} = uri, opts) do + with {:ok, allowed_rules} <- normalize_domain_rules(opts[:allowed_domains]), + {:ok, blocked_rules} <- normalize_domain_rules(opts[:blocked_domains]) do + cond do + allowed_rules != [] and not Enum.any?(allowed_rules, &rule_matches?(&1, uri)) -> + {:error, + Error.invalid_error("URL is not permitted by allowed_domains", %{ + error_code: :url_not_allowed, + url: URI.to_string(uri) + })} + + blocked_rules != [] and Enum.any?(blocked_rules, &rule_matches?(&1, uri)) -> + {:error, + Error.invalid_error("URL is blocked by blocked_domains", %{ + error_code: :url_not_allowed, + url: URI.to_string(uri) + })} + + true -> + :ok + end + end + end + + defp normalize_domain_rules(nil), do: {:ok, []} + + defp normalize_domain_rules(rules) do + rules + |> List.wrap() + |> Enum.reduce_while({:ok, []}, fn rule, {:ok, acc} -> + case normalize_domain_rule(rule) do + {:ok, normalized} -> {:cont, {:ok, [normalized | acc]}} + {:error, reason} -> {:halt, {:error, reason}} + end + end) + |> case do + {:ok, normalized} -> {:ok, Enum.reverse(normalized)} + error -> error + end + end + + defp normalize_domain_rule(rule) when is_binary(rule) do + normalized = String.trim(rule) + + cond do + normalized == "" -> + {:error, Error.invalid_error("Domain rules cannot be empty", %{error_code: :invalid_input})} + + String.contains?(normalized, "://") -> + {:error, + Error.invalid_error("Domain rules must not include URL schemes", %{ + error_code: :invalid_input, + rule: normalized + })} + + true -> + uri = URI.parse("https://" <> normalized) + host = String.downcase(uri.host || "") + path = uri.path || "/" + + cond do + host == "" -> + {:error, + Error.invalid_error("Domain rule must include a host", %{error_code: :invalid_input, rule: normalized})} + + not ascii_only?(host) -> + {:error, + Error.invalid_error("Domain rules must use ASCII hosts", %{ + error_code: :invalid_input, + rule: normalized + })} + + true -> + {:ok, %{host: host, path: normalize_rule_path(path)}} + end + end + end + + defp normalize_domain_rule(rule) do + {:error, Error.invalid_error("Domain rule must be a string", %{error_code: :invalid_input, rule: rule})} + end + + defp rule_matches?(%{host: host, path: path}, %URI{host: uri_host} = uri) do + uri_host = String.downcase(uri_host) + request_path = normalize_rule_path(uri.path || "/") + + host_matches? = uri_host == host or String.ends_with?(uri_host, "." <> host) + path_matches? = path == "/" or String.starts_with?(request_path, path) + + host_matches? and path_matches? + end + + defp normalize_final_url(%Req.Request{url: %URI{} = uri}) do + normalized = normalize_uri(uri) + {:ok, URI.to_string(normalized), normalized} + end + + defp validate_http_status(%Req.Response{status: status}, _url) when status in 200..299, do: :ok + + defp validate_http_status(%Req.Response{status: 429}, _url) do + {:error, Error.adapter_error("Web fetch rate limited", %{error_code: :too_many_requests, status: 429})} + end + + defp validate_http_status(%Req.Response{status: status}, url) do + {:error, + Error.adapter_error("Web fetch returned an HTTP error", %{ + error_code: :url_not_accessible, + status: status, + url: url + })} + end + + defp parse_document(body) do + case Floki.parse_document(body) do + {:ok, document} -> + {:ok, document} + + {:error, reason} -> + {:error, Error.adapter_error("Failed to parse fetched HTML", %{error_code: :unavailable, reason: reason})} + end + end + + defp select_html(_document, body, nil), do: {:ok, body} + defp select_html(document, _body, ""), do: select_html(document, nil, nil) + + defp select_html(document, _body, selector) do + nodes = Floki.find(document, selector) + + if nodes == [] do + {:error, + Error.invalid_error("Selector did not match any elements in fetched HTML", %{ + error_code: :invalid_input, + selector: selector + })} + else + {:ok, Floki.raw_html(nodes)} + end + end + + defp extract_title(document) do + title = + document + |> Floki.find("title") + |> Floki.text(sep: " ") + |> String.trim() + |> blank_to_nil() + + {:ok, title} + end + + defp format_html(html, :html, _opts), do: {:ok, html} + + defp format_html(html, :text, _opts) do + with {:ok, fragment} <- parse_fragment(html) do + {:ok, fragment |> Floki.text(sep: "\n") |> String.trim()} + end + end + + defp format_html(html, :markdown, _opts) do + {:ok, Html2Markdown.convert(html) |> String.trim()} + rescue + error -> + {:error, + Error.adapter_error("Failed to convert fetched HTML to markdown", %{error_code: :unavailable, reason: error})} + end + + defp format_text(text, :text), do: {:ok, String.trim(text)} + defp format_text(text, :markdown), do: {:ok, String.trim(text)} + + defp format_text(_text, :html) do + {:error, + Error.invalid_error("HTML output is only supported for HTML content", %{ + error_code: :invalid_input + })} + end + + defp parse_fragment(html) do + case Floki.parse_fragment(html) do + {:ok, fragment} -> + {:ok, fragment} + + {:error, reason} -> + {:error, + Error.adapter_error("Failed to parse fetched HTML fragment", %{error_code: :unavailable, reason: reason})} + end + end + + defp maybe_filter_content(content, opts) do + case opts[:focus_terms] do + [] -> + {:ok, content, false, 0} + + terms -> + sections = split_sections(content) + matching_indexes = matching_section_indexes(sections, terms) + window = max(opts[:focus_window] || 0, 0) + kept_indexes = expand_focus_window(matching_indexes, window, length(sections)) + filtered_content = render_section_slice(sections, kept_indexes) + + {:ok, filtered_content, true, length(matching_indexes)} + end + end + + defp maybe_truncate(content, nil), do: {content, false, estimate_tokens(content)} + + defp maybe_truncate(content, max_content_tokens) when is_integer(max_content_tokens) and max_content_tokens > 0 do + original_estimated_tokens = estimate_tokens(content) + + if original_estimated_tokens <= max_content_tokens do + {content, false, original_estimated_tokens} + else + char_limit = max_content_tokens * 4 + truncated = String.slice(content, 0, char_limit) |> String.trim() + {truncated, true, original_estimated_tokens} + end + end + + defp maybe_truncate(content, _other), do: {content, false, estimate_tokens(content)} + + defp maybe_build_passages(_content, _title, _url, false), do: [] + + defp maybe_build_passages(content, title, url, true) do + content + |> split_sections() + |> Enum.reject(&(&1 == "")) + |> Enum.reduce({[], 0, 0}, fn section, {passages, cursor, index} -> + start_char = cursor + end_char = start_char + String.length(section) + + passage = %{ + index: index, + start_char: start_char, + end_char: end_char, + text: section, + title: title, + url: url + } + + {[passage | passages], end_char + 2, index + 1} + end) + |> elem(0) + |> Enum.reverse() + |> Enum.take(50) + end + + defp split_sections(content) do + content + |> String.split(~r/\n\s*\n+/, trim: true) + |> case do + [] -> [String.trim(content)] + sections -> Enum.map(sections, &String.trim/1) + end + end + + defp extract_document_content(bytes, final_url, content_type, document_type, opts) do + case ExtractousEx.extract_from_bytes(bytes, opts[:extractous]) do + {:ok, %{content: content, metadata: metadata}} when is_binary(content) -> + {:ok, String.trim(content), normalize_metadata(metadata)} + + {:error, reason} -> + {:error, + Error.adapter_error("ExtractousEx failed while extracting document content", %{ + error_code: :unavailable, + url: final_url, + content_type: content_type, + document_type: document_type, + reason: reason + })} + end + rescue + error -> + {:error, + Error.adapter_error("ExtractousEx failed while extracting document content", %{ + error_code: :unavailable, + url: final_url, + content_type: content_type, + document_type: document_type, + reason: error + })} + end + + defp fetch_cached(url, opts) do + if opts[:cache] do + ensure_cache_table!() + lookup_cached_result(cache_key(url, opts), System.system_time(:millisecond)) + else + :miss + end + end + + defp lookup_cached_result(key, now) do + case :ets.lookup(@cache_table, key) do + [{_key, expires_at, result}] -> handle_cached_result(key, expires_at, result, now) + [] -> :miss + end + end + + defp handle_cached_result(_key, expires_at, result, now) when expires_at > now do + {:ok, Map.put(result, :cached, true)} + end + + defp handle_cached_result(key, _expires_at, _result, _now) do + :ets.delete(@cache_table, key) + :miss + end + + defp maybe_store_cache(url, opts, result) do + if opts[:cache] do + ensure_cache_table!() + + expires_at = System.system_time(:millisecond) + max(opts[:cache_ttl_ms], 0) + :ets.insert(@cache_table, {cache_key(url, opts), expires_at, result}) + end + + :ok + end + + defp ensure_cache_table! do + case :ets.whereis(@cache_table) do + :undefined -> + try do + :ets.new(@cache_table, [:named_table, :set, :public, read_concurrency: true, write_concurrency: true]) + rescue + ArgumentError -> @cache_table + end + + table -> + table + end + end + + defp cache_key(url, opts) do + {:jido_browser_web_fetch, url, opts[:format], opts[:selector], opts[:allowed_domains], opts[:blocked_domains], + opts[:focus_terms], opts[:focus_window], opts[:max_content_tokens], opts[:citations], opts[:extractous]} + end + + defp request_headers do + [ + {"accept", + "text/html,application/xhtml+xml,text/plain,application/json,application/pdf," <> + "application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document," <> + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," <> + "application/vnd.openxmlformats-officedocument.presentationml.presentation,*/*;q=0.1"}, + {"user-agent", user_agent()} + ] + end + + defp user_agent do + vsn = + case Application.spec(:jido_browser, :vsn) do + nil -> "dev" + value -> List.to_string(value) + end + + "jido_browser/#{vsn}" + end + + defp response_content_type(response) do + response + |> Req.Response.get_header("content-type") + |> List.first() + |> case do + nil -> infer_content_type(response.body) + content_type -> content_type |> String.split(";") |> hd() |> String.trim() |> String.downcase() + end + end + + defp infer_content_type(body) when is_binary(body) do + cond do + String.starts_with?(body, "%PDF-") -> + "application/pdf" + + likely_text?(body) -> + "text/plain" + + true -> + "application/octet-stream" + end + end + + defp infer_content_type(_body), do: "application/octet-stream" + + defp text_content_type?(content_type) do + content_type in @text_content_types or String.starts_with?(content_type, "text/") + end + + defp retrieved_at do + DateTime.utc_now() + |> DateTime.truncate(:second) + |> DateTime.to_iso8601() + end + + defp estimate_tokens(content) when is_binary(content) do + div(String.length(content) + 3, 4) + end + + defp estimate_tokens(_content), do: 0 + + defp normalize_citations(%{enabled: enabled}), do: enabled == true + defp normalize_citations(enabled), do: enabled == true + + defp present_domain_rules?(rules), do: rules not in [nil, []] + + defp normalize_focus_terms(nil), do: [] + + defp normalize_focus_terms(terms) do + terms + |> List.wrap() + |> Enum.map(fn + term when is_binary(term) -> String.trim(term) + term -> to_string(term) + end) + |> Enum.reject(&(&1 == "")) + |> Enum.uniq() + end + + defp normalize_extractous_opts(nil), do: {:ok, []} + + defp normalize_extractous_opts(opts) when is_list(opts) do + if Keyword.keyword?(opts) do + {:ok, canonicalize_keyword_list(opts)} + else + {:error, + Error.invalid_error("Extractous options must be a keyword list", %{ + error_code: :invalid_input, + extractous: opts + })} + end + end + + defp normalize_extractous_opts(opts) do + {:error, + Error.invalid_error("Extractous options must be a keyword list", %{ + error_code: :invalid_input, + extractous: opts + })} + end + + defp normalize_selector(nil), do: {:ok, nil} + + defp normalize_selector(selector) when is_binary(selector) do + selector + |> String.trim() + |> case do + "" -> {:ok, nil} + value -> {:ok, value} + end + end + + defp normalize_selector(selector) do + {:error, + Error.invalid_error("Selector must be a string", %{ + error_code: :invalid_input, + selector: selector + })} + end + + defp validate_url_length("", _max_url_length) do + {:error, Error.invalid_error("URL cannot be empty", %{error_code: :invalid_input})} + end + + defp validate_url_length(normalized_url, max_url_length) do + if String.length(normalized_url) > max_url_length do + {:error, + Error.invalid_error("URL exceeds maximum length", %{ + error_code: :url_too_long, + max_url_length: max_url_length + })} + else + :ok + end + end + + defp parse_fetch_uri(normalized_url) do + uri = URI.parse(normalized_url) + + if uri.scheme in ["http", "https"] do + {:ok, uri} + else + {:error, + Error.invalid_error("Web fetch only supports http and https URLs", %{ + error_code: :invalid_input, + scheme: uri.scheme + })} + end + end + + defp validate_uri_host(%URI{host: host}) when host in [nil, ""] do + {:error, Error.invalid_error("URL must include a host", %{error_code: :invalid_input})} + end + + defp validate_uri_host(%URI{host: host}) do + if ascii_only?(host) do + :ok + else + {:error, + Error.invalid_error("Web fetch only accepts ASCII hostnames", %{ + error_code: :url_not_allowed, + host: host + })} + end + end + + defp normalize_integer_opt(_name, value, min: min) when is_integer(value) and value >= min, do: {:ok, value} + + defp normalize_integer_opt(name, value, min: min) do + {:error, + Error.invalid_error("#{name} must be an integer greater than or equal to #{min}", %{ + error_code: :invalid_input, + option: name, + value: value + })} + end + + defp normalize_optional_integer_opt(_name, nil, _opts), do: {:ok, nil} + defp normalize_optional_integer_opt(name, value, opts), do: normalize_integer_opt(name, value, opts) + + defp normalize_boolean_opt(_name, value) when is_boolean(value), do: {:ok, value} + + defp normalize_boolean_opt(name, value) do + {:error, + Error.invalid_error("#{name} must be a boolean", %{ + error_code: :invalid_input, + option: name, + value: value + })} + end + + defp canonicalize_keyword_list(keyword_list) do + keyword_list + |> Enum.map(fn {key, value} = pair -> + if is_list(value) and Keyword.keyword?(value) do + {key, canonicalize_keyword_list(value)} + else + pair + end + end) + |> Enum.sort_by(fn {key, _value} -> to_string(key) end) + end + + defp merge_extractous_opts(left, right) do + Keyword.merge(left, right, fn _key, left_value, right_value -> + if Keyword.keyword?(left_value) and Keyword.keyword?(right_value) do + merge_extractous_opts(left_value, right_value) + else + right_value + end + end) + end + + defp normalize_known_url(url) when is_binary(url) do + url + |> String.trim() + |> case do + "" -> nil + value -> value + end + end + + defp normalize_known_url(_), do: nil + + defp normalize_uri(%URI{} = uri) do + %{uri | host: String.downcase(uri.host || ""), fragment: nil} + end + + defp normalize_rule_path(nil), do: "/" + defp normalize_rule_path(""), do: "/" + defp normalize_rule_path(path), do: if(String.starts_with?(path, "/"), do: path, else: "/" <> path) + + defp extractable_document_type(content_type, final_url, body) do + Map.get(@document_content_types, content_type) || + infer_document_type_from_body(body) || + if(ambiguous_binary_content_type?(content_type), do: infer_document_type_from_url(final_url), else: nil) + end + + defp infer_document_type_from_url(url) do + url + |> URI.parse() + |> Map.get(:path, "") + |> Path.extname() + |> String.trim_leading(".") + |> String.downcase() + |> case do + "" -> nil + extension -> Map.get(@document_extensions, extension) + end + end + + defp infer_document_type_from_body(body) when is_binary(body) do + if String.starts_with?(body, "%PDF-"), do: :pdf, else: nil + end + + defp infer_document_type_from_body(_body), do: nil + + defp document_title(metadata, url) do + metadata + |> metadata_title() + |> blank_to_nil() + |> case do + nil -> title_from_url(url) + title -> title + end + end + + defp metadata_title(metadata) when is_map(metadata) do + Enum.find_value([:title, "title", "dc:title", :"dc:title"], fn key -> + metadata + |> Map.get(key) + |> metadata_value_to_string() + |> blank_to_nil() + end) + end + + defp metadata_value_to_string(nil), do: nil + defp metadata_value_to_string(value) when is_binary(value), do: String.trim(value) + + defp metadata_value_to_string(value) when is_list(value), + do: value |> Enum.map_join(" ", &to_string/1) |> String.trim() + + defp metadata_value_to_string(value) when is_atom(value), do: value |> Atom.to_string() |> String.trim() + defp metadata_value_to_string(value) when is_number(value), do: value |> to_string() |> String.trim() + defp metadata_value_to_string(_value), do: nil + + defp normalize_metadata(metadata) when is_map(metadata), do: metadata + + defp maybe_put_metadata(response, metadata) when metadata in [%{}, nil], do: response + defp maybe_put_metadata(response, metadata), do: Map.put(response, :metadata, metadata) + defp maybe_put(opts, _key, nil), do: opts + defp maybe_put(opts, key, value), do: Keyword.put(opts, key, value) + + defp matching_section_indexes(sections, terms) do + downcased_terms = Enum.map(terms, &String.downcase/1) + + sections + |> Enum.with_index() + |> Enum.flat_map(fn {section, index} -> + if section_matches_term?(section, downcased_terms), do: [index], else: [] + end) + end + + defp section_matches_term?(section, downcased_terms) do + lowered = String.downcase(section) + Enum.any?(downcased_terms, &String.contains?(lowered, &1)) + end + + defp expand_focus_window(matching_indexes, window, section_count) do + matching_indexes + |> Enum.flat_map(fn index -> (index - window)..(index + window) end) + |> Enum.filter(&(&1 >= 0 and &1 < section_count)) + |> Enum.uniq() + |> Enum.sort() + end + + defp render_section_slice(sections, indexes) do + indexes + |> Enum.map(&Enum.at(sections, &1)) + |> Enum.reject(&(&1 == "")) + |> Enum.join("\n\n") + |> String.trim() + end + + defp title_from_url(url) do + path = URI.parse(url).path || "" + + case path do + "" -> nil + "/" -> nil + value -> value |> Path.basename() |> String.trim("/") |> blank_to_nil() + end + end + + defp blank_to_nil(nil), do: nil + defp blank_to_nil(""), do: nil + defp blank_to_nil(value), do: value + + defp ascii_only?(value) when is_binary(value) do + String.printable?(value) and String.match?(value, ~r/^[\x00-\x7F]+$/) + end + + defp ambiguous_binary_content_type?(content_type) do + content_type in [ + "application/octet-stream", + "binary/octet-stream", + "application/download", + "application/x-download", + "application/zip", + "application/x-zip-compressed" + ] + end + + defp likely_text?(body) when is_binary(body) do + String.valid?(body) and not String.contains?(body, <<0>>) + end + + defp config(key, default) do + :jido_browser + |> Application.get_env(:web_fetch, []) + |> Keyword.get(key, default) + end +end diff --git a/mix.exs b/mix.exs index b7ce8ca..9fad805 100644 --- a/mix.exs +++ b/mix.exs @@ -69,7 +69,9 @@ defmodule Jido.Browser.MixProject do {:req, "~> 0.5"}, {:jason, "~> 1.4"}, {:uniq, "~> 0.6"}, + {:floki, "~> 0.38"}, {:html2markdown, "~> 0.3"}, + {:extractous_ex, "~> 0.2"}, # Dev/Test {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, @@ -111,7 +113,8 @@ defmodule Jido.Browser.MixProject do Core: [ Jido.Browser, Jido.Browser.Session, - Jido.Browser.Plugin + Jido.Browser.Plugin, + Jido.Browser.WebFetch ], Adapters: [ Jido.Browser.Adapter, @@ -154,7 +157,8 @@ defmodule Jido.Browser.MixProject do "Content Extraction": [ Jido.Browser.Actions.Snapshot, Jido.Browser.Actions.Screenshot, - Jido.Browser.Actions.ExtractContent + Jido.Browser.Actions.ExtractContent, + Jido.Browser.Actions.WebFetch ], Advanced: [ Jido.Browser.Actions.Evaluate diff --git a/mix.lock b/mix.lock index aaf36a7..e91947c 100644 --- a/mix.lock +++ b/mix.lock @@ -1,6 +1,7 @@ %{ "abacus": {:hex, :abacus, "2.1.0", "b6db5c989ba3d9dd8c36d1cb269e2f0058f34768d47c67eb8ce06697ecb36dd4", [:mix], [], "hexpm", "255de08b02884e8383f1eed8aa31df884ce0fb5eb394db81ff888089f2a1bbff"}, "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, + "castore": {:hex, :castore, "1.0.18", "5e43ef0ec7d31195dfa5a65a86e6131db999d074179d2ba5a8de11fe14570f55", [:mix], [], "hexpm", "f393e4fe6317829b158fb74d86eb681f737d2fe326aa61ccf6293c4104957e34"}, "certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"}, "credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"}, "crontab": {:hex, :crontab, "1.2.0", "503611820257939d5d0fd272eb2b454f48a470435a809479ddc2c40bb515495c", [:mix], [{:ecto, "~> 1.0 or ~> 2.0 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "ebd7ef4d831e1b20fa4700f0de0284a04cac4347e813337978e25b4cc5cc2207"}, @@ -12,6 +13,7 @@ "erlex": {:hex, :erlex, "0.2.8", "cd8116f20f3c0afe376d1e8d1f0ae2452337729f68be016ea544a72f767d9c12", [:mix], [], "hexpm", "9d66ff9fedf69e49dc3fd12831e12a8a37b76f8651dd21cd45fcf5561a8a7590"}, "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"}, "excoveralls": {:hex, :excoveralls, "0.18.5", "e229d0a65982613332ec30f07940038fe451a2e5b29bce2a5022165f0c9b157e", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "523fe8a15603f86d64852aab2abe8ddbd78e68579c8525ae765facc5eae01562"}, + "extractous_ex": {:hex, :extractous_ex, "0.2.1", "c9f7fd58b1d3b0d7eda9e219b1ed534a5b25e485884405d3ceee878e67248df2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:rustler, "~> 0.37", [hex: :rustler, repo: "hexpm", optional: false]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "8c1a3c74105448545a8478c3610fc920b2da418d47eae656853dc3e881adebd0"}, "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"}, "floki": {:hex, :floki, "0.38.0", "62b642386fa3f2f90713f6e231da0fa3256e41ef1089f83b6ceac7a3fd3abf33", [:mix], [], "hexpm", "a5943ee91e93fb2d635b612caf5508e36d37548e84928463ef9dd986f0d1abd9"}, @@ -51,6 +53,8 @@ "private": {:hex, :private, "0.1.2", "da4add9f36c3818a9f849840ca43016c8ae7f76d7a46c3b2510f42dcc5632932", [:mix], [], "hexpm", "22ee01c3f450cf8d135da61e10ec59dde006238fab1ea039014791fc8f3ff075"}, "recase": {:hex, :recase, "0.8.1", "ab98cd35857a86fa5ca99036f575241d71d77d9c2ab0c39aacf1c9b61f6f7d1d", [:mix], [], "hexpm", "9fd8d63e7e43bd9ea385b12364e305778b2bbd92537e95c4b2e26fc507d5e4c2"}, "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"}, + "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.4", "700a878312acfac79fb6c572bb8b57f5aae05fe1cf70d34b5974850bbf2c05bf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "3b33d99b540b15f142ba47944f7a163a25069f6d608783c321029bc1ffb09514"}, "splode": {:hex, :splode, "0.3.0", "ff8effecc509a51245df2f864ec78d849248647c37a75886033e3b1a53ca9470", [:mix], [], "hexpm", "73cfd0892d7316d6f2c93e6e8784bd6e137b2aa38443de52fd0a25171d106d81"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"}, diff --git a/test/jido_browser/composite_actions_and_installer_test.exs b/test/jido_browser/composite_actions_and_installer_test.exs index dddf9d5..52b4633 100644 --- a/test/jido_browser/composite_actions_and_installer_test.exs +++ b/test/jido_browser/composite_actions_and_installer_test.exs @@ -5,6 +5,7 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do alias Jido.Browser.Actions.ReadPage alias Jido.Browser.Actions.SearchWeb alias Jido.Browser.Actions.SnapshotUrl + alias Jido.Browser.Actions.WebFetch alias Jido.Browser.Installer alias Jido.Browser.Session @@ -176,6 +177,64 @@ defmodule Jido.Browser.CompositeActionsAndInstallerTest do end end + describe "WebFetch.run/2" do + test "passes provenance options through to the fetch API" do + expect(Jido.Browser, :web_fetch, fn "https://example.com/guide", opts -> + assert opts[:require_known_url] == true + assert "https://example.com/guide" in opts[:known_urls] + assert opts[:allowed_domains] == ["example.com"] + + {:ok, + %{ + url: "https://example.com/guide", + final_url: "https://example.com/guide", + title: "Guide", + content: "Fetched guide content", + format: :markdown, + content_type: "text/html", + document_type: :html, + retrieved_at: "2026-03-21T00:00:00Z", + estimated_tokens: 5, + original_estimated_tokens: 5, + truncated: false, + filtered: false, + focus_matches: 0, + cached: false, + citations: %{enabled: false}, + passages: [] + }} + end) + + context = %{skill_state: %{seen_urls: ["https://example.com/guide"], web_fetch_uses: 0}} + + assert {:ok, result} = + WebFetch.run( + %{ + url: "https://example.com/guide", + require_known_url: true, + allowed_domains: ["example.com"] + }, + context + ) + + assert result.status == "success" + assert result.url == "https://example.com/guide" + end + + test "returns max_uses_exceeded before calling the fetch API" do + context = %{skill_state: %{web_fetch_uses: 2}} + + assert {:error, error} = + WebFetch.run( + %{url: "https://example.com/guide", max_uses: 2}, + context + ) + + assert %Jido.Browser.Error.InvalidError{} = error + assert error.details.error_code == :max_uses_exceeded + end + end + describe "Installer" do test "target returns a supported platform atom" do assert Installer.target() in [ diff --git a/test/jido_browser/plugin_test.exs b/test/jido_browser/plugin_test.exs index 8636afd..ef16ef9 100644 --- a/test/jido_browser/plugin_test.exs +++ b/test/jido_browser/plugin_test.exs @@ -23,9 +23,9 @@ defmodule Jido.Browser.PluginTest do assert "automation" in tags end - test "has 37 actions" do + test "has 38 actions" do actions = Plugin.actions() - assert length(actions) == 37 + assert length(actions) == 38 end test "includes all expected action modules" do @@ -73,13 +73,14 @@ defmodule Jido.Browser.PluginTest do # Advanced assert Jido.Browser.Actions.Evaluate in actions + assert Jido.Browser.Actions.WebFetch in actions end end describe "signal_routes/1" do - test "returns 37 routes" do + test "returns 38 routes" do routes = Plugin.signal_routes(%{}) - assert length(routes) == 37 + assert length(routes) == 38 end test "maps browser.navigate to Navigate action" do @@ -122,6 +123,8 @@ defmodule Jido.Browser.PluginTest do assert state.adapter == Jido.Browser.Adapters.AgentBrowser assert state.last_url == nil assert state.last_title == nil + assert state.seen_urls == [] + assert state.web_fetch_uses == 0 end test "accepts headless config override" do @@ -154,7 +157,7 @@ defmodule Jido.Browser.PluginTest do test "returns list of signal patterns" do patterns = Plugin.signal_patterns() assert is_list(patterns) - assert length(patterns) == 37 + assert length(patterns) == 38 end test "all patterns have browser. prefix" do @@ -173,6 +176,7 @@ defmodule Jido.Browser.PluginTest do assert "browser.save_state" in patterns assert "browser.tab_list" in patterns assert "browser.console" in patterns + assert "browser.web_fetch" in patterns end end @@ -188,6 +192,36 @@ defmodule Jido.Browser.PluginTest do assert Plugin.transform_result(:some_action, result, %{}) == result end + test "tracks discovered URLs and fetch usage for web fetch results" do + context = %{skill_state: %{seen_urls: ["https://seed.example"], web_fetch_uses: 1}} + + result = + Plugin.transform_result( + Jido.Browser.Actions.WebFetch, + {:ok, %{url: "https://example.com", final_url: "https://example.com/final", status: "success"}}, + context + ) + + assert {:ok, _result, state_updates} = result + + assert Enum.sort(state_updates.seen_urls) == + Enum.sort(["https://seed.example", "https://example.com", "https://example.com/final"]) + + assert state_updates.web_fetch_uses == 2 + end + + test "tracks URLs returned by search results" do + result = + Plugin.transform_result( + Jido.Browser.Actions.SearchWeb, + {:ok, %{results: [%{url: "https://elixir-lang.org"}]}}, + %{skill_state: %{}} + ) + + assert {:ok, _result, state_updates} = result + assert state_updates.seen_urls == ["https://elixir-lang.org"] + end + test "enhances error results when session available" do context = %{ skill_state: %{ diff --git a/test/jido_browser/web_fetch_test.exs b/test/jido_browser/web_fetch_test.exs new file mode 100644 index 0000000..931ada8 --- /dev/null +++ b/test/jido_browser/web_fetch_test.exs @@ -0,0 +1,293 @@ +defmodule Jido.Browser.WebFetchTest do + use ExUnit.Case, async: false + use Mimic + + alias Jido.Browser.Error + alias Jido.Browser.WebFetch + + setup :set_mimic_global + + setup_all do + Mimic.copy(Req) + Mimic.copy(ExtractousEx) + :ok + end + + setup do + WebFetch.clear_cache() + :ok + end + + describe "web_fetch/2" do + test "fetches HTML content with selector extraction and citation passages" do + expect(Req, :run, fn opts -> + assert opts[:url] == "https://example.com/article" + assert opts[:decode_body] == false + + request = Req.Request.new(url: "https://example.com/article") + + response = + %Req.Response{ + status: 200, + headers: %{"content-type" => ["text/html; charset=utf-8"]}, + body: """ + +
Alpha paragraph.
+Beta paragraph.
+