From 7abcda13c2bff7515d21704865c4b1375e8b8fe0 Mon Sep 17 00:00:00 2001 From: Matt Husby Date: Sun, 25 Aug 2024 16:39:48 -0400 Subject: [PATCH 1/3] Document type and github loader --- lib/document.ex | 48 ++++++++ lib/document/loader.ex | 5 + lib/document/loaders/github.ex | 167 +++++++++++++++++++++++++++ test/document/text_document_test.exs | 35 ++++++ 4 files changed, 255 insertions(+) create mode 100644 lib/document.ex create mode 100644 lib/document/loader.ex create mode 100644 lib/document/loaders/github.ex create mode 100644 test/document/text_document_test.exs diff --git a/lib/document.ex b/lib/document.ex new file mode 100644 index 00000000..ab710629 --- /dev/null +++ b/lib/document.ex @@ -0,0 +1,48 @@ +defmodule LangChain.Document do + use Ecto.Schema + import Ecto.Changeset + require Logger + alias __MODULE__ + + @type t :: %__MODULE__{} + + @primary_key false + embedded_schema do + field :content, :string + field :metadata, :map + field :type, :string + end + + @create_fields [:content, :metadata, :type] + @required_fields [:content, :type] + + @doc """ + Build a new document and return an `:ok`/`:error` tuple with the result. + """ + @spec new(attrs :: map()) :: {:ok, t()} | {:error, Ecto.Changeset.t()} + def new(attrs \\ %{}) do + %Document{} + |> cast(attrs, @create_fields) + |> common_validations() + |> apply_action(:insert) + end + + @doc """ + Build a new document and error out if the changeset is invalid + """ + @spec new!(attrs :: map()) :: t() + def new!(attrs \\ %{}) do + case new(attrs) do + {:ok, doc} -> + doc + + {:error, changeset} -> + raise LangChainError, changeset + end + end + + defp common_validations(changeset) do + changeset + |> validate_required(@required_fields) + end +end diff --git a/lib/document/loader.ex b/lib/document/loader.ex new file mode 100644 index 00000000..09743dad --- /dev/null +++ b/lib/document/loader.ex @@ -0,0 +1,5 @@ +defmodule LangChain.Document.Loader do + alias LangChain.Document + + @callback load(options :: map()) :: %Document{} +end diff --git a/lib/document/loaders/github.ex b/lib/document/loaders/github.ex new file mode 100644 index 00000000..d2b2cd8b --- /dev/null +++ b/lib/document/loaders/github.ex @@ -0,0 +1,167 @@ +defmodule LangChain.Document.Loaders.Github do + @moduledoc """ + Currently this module only supports grabbing issues. + + Extending this to support other resources (like PRs, commits, etc) will require + a little more work, but at the moment I am not using those resources. + """ + + @behaviour LangChain.Document.Loader + + use Ecto.Schema + require Logger + import Ecto.Changeset + alias __MODULE__ + alias LangChain.Config + alias LangChain.Document + alias LangChain.LangChainError + + # allow up to 1 minute for response. + @receive_timeout 60_000 + + @primary_key false + embedded_schema do + # API endpoint to use. Defaults to Github's API + field :endpoint, :string, default: "https://api.github.com" + field :api_key, :string + field :receive_timeout, :integer, default: @receive_timeout + end + + @type t :: %Github{} + + @create_fields [ + :endpoint, + :api_key, + :receive_timeout + ] + @required_fields [:endpoint, :receive_timeout] + + @spec load(t(), map()) :: [Document.t()] | [] + def load(%Github{} = github, %{type: :issue} = options) do + make_request(github, options[:repo]) + |> to_documents() + end + + def load(options) do + raise LangChainError, "Unsupported type: #{inspect(options[:type])}" + end + + @spec to_documents(issues :: [map()]) :: [Document.t()] + def to_documents(issues) do + Enum.map(issues, fn issue -> + %Document{ + content: issue.body, + metadata: %{ + id: issue.id, + title: issue.title + }, + type: "github_issue" + } + end) + end + + @spec make_request(t(), String.t()) :: [map()] | no_return() + def make_request(github, repo) do + make_request(github, repo, 1, 3, []) + end + + @spec make_request(t(), String.t(), integer(), integer(), [map()]) :: [map()] | no_return() + def make_request(_gihub, _repo, _page, 0, _acc) do + raise LangChainError, "Retries exceeded. Connection failed." + end + + def make_request(%Github{} = github, repo, page, retry_count, acc) do + req = + Req.new( + url: "#{github.endpoint}/repos/#{repo}/issues?page=#{page}", + headers: headers(get_api_key(github)), + receive_timeout: github.receive_timeout, + retry: :transient, + max_retries: 3, + retry_delay: fn attempt -> 300 * attempt end + ) + + req + |> Req.get() + # parse the body and return it as parsed structs + |> case do + {:ok, %Req.Response{body: data, headers: _headers} = _response} -> + case process_response(data) do + {:error, reason} -> + {:error, reason} + + result -> + # @TODO check the headers and see if we need to do some pagination + # if so, call this function recursively with the next page + result + end + + {:error, %Req.TransportError{reason: :timeout}} -> + {:error, "Request timed out"} + + {:error, %Req.TransportError{reason: :closed}} -> + # Force a retry by making a recursive call decrementing the counter + Logger.debug(fn -> "Mint connection closed: retry count = #{inspect(retry_count)}" end) + make_request(github, repo, page, retry_count - 1, acc) + + other -> + Logger.error("Unexpected and unhandled API response! #{inspect(other)}") + other + end + end + + @spec get_api_key(t()) :: String.t() + defp get_api_key(%Github{api_key: api_key}) do + api_key || Config.resolve(:github_key, "") + end + + @doc """ + Setup a Github client configuration. + """ + @spec new(attrs :: map()) :: {:ok, t} | {:error, Ecto.Changeset.t()} + def new(%{} = attrs \\ %{}) do + %Github{} + |> cast(attrs, @create_fields) + |> common_validation() + |> apply_action(:insert) + end + + @doc """ + Setup a Guthub client configuration and return it or raise an error if invalid. + """ + @spec new!(attrs :: map()) :: t() | no_return() + def new!(attrs \\ %{}) do + case new(attrs) do + {:ok, chain} -> + chain + + {:error, changeset} -> + raise LangChainError, changeset + end + end + + defp common_validation(changeset) do + changeset + |> validate_required(@required_fields) + end + + defp headers("") do + %{} + end + + defp headers(api_key) do + %{ + "Authorization" => "Bearer #{api_key}" + } + end + + def process_response(response) do + Enum.map(response, fn issue -> + %{ + :id => issue["id"], + :title => issue["title"], + :body => issue["body"] + } + end) + end +end diff --git a/test/document/text_document_test.exs b/test/document/text_document_test.exs new file mode 100644 index 00000000..af4892fc --- /dev/null +++ b/test/document/text_document_test.exs @@ -0,0 +1,35 @@ +defmodule LangChain.Document.TextDocuemntTest do + use ExUnit.Case + doctest LangChain.Document + alias LangChain.Document + + describe "new/1" do + test "works with basic attrs" do + assert {:ok, %Document{} = document} = + Document.new(%{ + "content" => "Here's some content", + "type" => "plain_text", + "metadata" => %{"source" => "https://example.com"} + }) + + assert document.type == "plain_text" + assert document.content == "Here's some content" + assert document.metadata["source"] == "https://example.com" + end + + test "returns error when invalid" do + assert {:error, changeset} = Document.new(%{"content" => nil}) + refute changeset.valid? + assert {"can't be blank", _} = changeset.errors[:content] + assert {"can't be blank", _} = changeset.errors[:type] + end + end + + describe "new!/1" do + test "throws when invalid" do + assert_raise LangChainError, fn -> + Document.new!(%{"content" => nil}) + end + end + end +end From 80f0a22d2d4aa15c3c47ead4cedd6f485655c9ba Mon Sep 17 00:00:00 2001 From: Matt Husby Date: Sun, 25 Aug 2024 16:54:55 -0400 Subject: [PATCH 2/3] I lost some alias's somewhere :-) --- lib/document.ex | 1 + test/document/text_document_test.exs | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/document.ex b/lib/document.ex index ab710629..c79a2d1f 100644 --- a/lib/document.ex +++ b/lib/document.ex @@ -3,6 +3,7 @@ defmodule LangChain.Document do import Ecto.Changeset require Logger alias __MODULE__ + alias LangChain.LangChainError @type t :: %__MODULE__{} diff --git a/test/document/text_document_test.exs b/test/document/text_document_test.exs index af4892fc..4bfbc9ac 100644 --- a/test/document/text_document_test.exs +++ b/test/document/text_document_test.exs @@ -2,6 +2,7 @@ defmodule LangChain.Document.TextDocuemntTest do use ExUnit.Case doctest LangChain.Document alias LangChain.Document + alias LangChain.LangChainError describe "new/1" do test "works with basic attrs" do From f83d91d3c5ab46452c3bfcb258165a3f3117c410 Mon Sep 17 00:00:00 2001 From: Matt Husby Date: Fri, 10 Jan 2025 19:41:23 -0500 Subject: [PATCH 3/3] Refactor GitHub loader and add tests --- lib/document/loaders/github.ex | 13 +--- test/document/loaders/github_test.exs | 87 +++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 test/document/loaders/github_test.exs diff --git a/lib/document/loaders/github.ex b/lib/document/loaders/github.ex index d2b2cd8b..29067a20 100644 --- a/lib/document/loaders/github.ex +++ b/lib/document/loaders/github.ex @@ -83,18 +83,11 @@ defmodule LangChain.Document.Loaders.Github do req |> Req.get() - # parse the body and return it as parsed structs |> case do {:ok, %Req.Response{body: data, headers: _headers} = _response} -> - case process_response(data) do - {:error, reason} -> - {:error, reason} - - result -> - # @TODO check the headers and see if we need to do some pagination - # if so, call this function recursively with the next page - result - end + # @TODO check the headers and see if we need to do some pagination + # if so, call this function recursively with the next page + process_response(data) {:error, %Req.TransportError{reason: :timeout}} -> {:error, "Request timed out"} diff --git a/test/document/loaders/github_test.exs b/test/document/loaders/github_test.exs new file mode 100644 index 00000000..3629caa8 --- /dev/null +++ b/test/document/loaders/github_test.exs @@ -0,0 +1,87 @@ +defmodule LangChain.Document.Loaders.GithubTest do + use ExUnit.Case, async: true + + alias LangChain.Document.Loaders.Github + alias LangChain.Document + alias LangChain.LangChainError + + describe "new/1" do + test "creates a new Github struct with default values" do + assert {:ok, %Github{endpoint: "https://api.github.com", receive_timeout: 60_000}} = + Github.new(%{}) + end + + test "creates a new Github struct with custom values" do + attrs = %{ + endpoint: "https://custom.github.com", + api_key: "test_key", + receive_timeout: 30_000 + } + + assert {:ok, + %Github{ + endpoint: "https://custom.github.com", + api_key: "test_key", + receive_timeout: 30_000 + }} = + Github.new(attrs) + end + end + + describe "load/1 with unsupported type" do + test "raises LangChainError when type is unsupported" do + options = %{type: :unsupported_type} + + assert_raise LangChainError, "Unsupported type: :unsupported_type", fn -> + Github.load(options) + end + end + end + + describe "new!/1" do + test "creates a new Github struct with default values" do + assert %Github{endpoint: "https://api.github.com", receive_timeout: 60_000} = + Github.new!(%{}) + end + end + + describe "to_documents/1" do + test "converts a list of issues into Document structs" do + issues = [ + %{id: 1, title: "Issue 1", body: "Body 1"}, + %{id: 2, title: "Issue 2", body: "Body 2"} + ] + + documents = Github.to_documents(issues) + + assert [ + %Document{ + content: "Body 1", + metadata: %{id: 1, title: "Issue 1"}, + type: "github_issue" + }, + %Document{ + content: "Body 2", + metadata: %{id: 2, title: "Issue 2"}, + type: "github_issue" + } + ] = documents + end + end + + describe "process_response/1" do + test "processes a list of GitHub issues from the API response" do + response = [ + %{"id" => 1, "title" => "Issue 1", "body" => "Body 1"}, + %{"id" => 2, "title" => "Issue 2", "body" => "Body 2"} + ] + + processed_issues = Github.process_response(response) + + assert [ + %{id: 1, title: "Issue 1", body: "Body 1"}, + %{id: 2, title: "Issue 2", body: "Body 2"} + ] = processed_issues + end + end +end