Skip to content

Commit 292c9f2

Browse files
committed
feat: Add support for pgvector's vector data type
1 parent af77af4 commit 292c9f2

File tree

7 files changed

+113
-8
lines changed

7 files changed

+113
-8
lines changed

docker-compose.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
version: "2.1"
44
services:
55
postgres:
6-
image: docker.io/postgres:latest
6+
image: ankane/pgvector:latest
77
command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf
88
environment:
99
POSTGRES_USER: postgres
@@ -13,16 +13,19 @@ services:
1313
POSTGRES_INITDB_ARGS: --auth-host=cert
1414
# Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568
1515
volumes:
16+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
1617
- ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client.
1718
- ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate.
1819
- ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server.
1920
- ./ssl/pg_hba.conf:/var/lib/postgresql/pg_hba.conf # Configuration file to allow connection over SSL.
2021
ports:
2122
- "5432:5432"
2223
postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23
23-
image: docker.io/postgres:latest
24+
image: ankane/pgvector:latest
2425
environment:
2526
POSTGRES_PASSWORD: postgres
27+
volumes:
28+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
2629
ports:
2730
- 5433:5432
2831
ssh:
@@ -37,17 +40,20 @@ services:
3740
- PASSWORD_ACCESS=false
3841
- USER_NAME=melty
3942
volumes:
43+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
4044
- ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro
4145
ports:
4246
- "127.0.0.1:2223:2222"
4347
networks:
4448
- inner
4549
postgresdb:
46-
image: postgres:13.0
50+
image: ankane/pgvector:latest
4751
environment:
4852
POSTGRES_USER: postgres
4953
POSTGRES_PASSWORD: postgres
5054
POSTGRES_DB: main
55+
volumes:
56+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
5157
networks:
5258
inner:
5359
ipv4_address: 10.5.0.5

poetry.lock

Lines changed: 55 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ packages = [
3333
python = "<3.12,>=3.8.1"
3434
requests = "^2.25.1"
3535
singer-sdk = ">=0.28,<0.34"
36+
pgvector = { version="^0.2.4", optional = true }
3637
psycopg2-binary = "2.9.9"
3738
sqlalchemy = ">=2.0,<3.0"
3839
sshtunnel = "0.4.0"
@@ -50,6 +51,9 @@ types-simplejson = "^3.19.0.2"
5051
types-sqlalchemy = "^1.4.53.38"
5152
types-jsonschema = "^4.19.0.3"
5253

54+
[tool.poetry.extras]
55+
pgvector = ["pgvector"]
56+
5357
[tool.mypy]
5458
exclude = "tests"
5559

target_postgres/connector.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,19 @@ def pick_individual_type(jsonschema_type: dict):
277277
if "object" in jsonschema_type["type"]:
278278
return JSONB()
279279
if "array" in jsonschema_type["type"]:
280+
# FIXME: This currently uses a non-conformant
281+
# definition for the Singer SCHEMA. Example:
282+
# {"type": "array",
283+
# "items": {"type": "number"},
284+
# "storage": {"type": "vector", "dim": 4}}
285+
if (
286+
"storage" in jsonschema_type
287+
and "type" in jsonschema_type["storage"]
288+
and jsonschema_type["storage"]["type"] == "vector"
289+
):
290+
from pgvector.sqlalchemy import Vector
291+
292+
return Vector(jsonschema_type["storage"]["dim"])
280293
return ARRAY(JSONB())
281294
if jsonschema_type.get("format") == "date-time":
282295
return TIMESTAMP()
@@ -310,6 +323,13 @@ def pick_best_sql_type(sql_type_array: list):
310323
NOTYPE,
311324
]
312325

326+
try:
327+
from pgvector.sqlalchemy import Vector
328+
329+
precedence_order.append(Vector)
330+
except ImportError:
331+
pass
332+
313333
for sql_type in precedence_order:
314334
for obj in sql_type_array:
315335
if isinstance(obj, sql_type):
array_float_vector.singer

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}}}}
2+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}}
3+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}}
4+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}}
5+
{"type": "STATE", "value": {"array_float_vector": 3}}

target_postgres/tests/init.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CREATE EXTENSION IF NOT EXISTS vector;

target_postgres/tests/test_target_postgres.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,25 @@ def test_array_boolean(postgres_target):
458458
)
459459

460460

461+
def test_array_float_vector(postgres_target):
    """Load the ``array_float_vector`` stream and check its data and schema.

    The test is skipped when the optional ``pgvector`` package cannot be
    imported. The ``value`` column is expected to be created with the
    pgvector SQLAlchemy ``Vector`` type.
    """
    vector_module = pytest.importorskip("pgvector.sqlalchemy")
    singer_file_to_target("array_float_vector.singer", postgres_target)

    # The expected vector is compared in its bracketed string form.
    expected_first_row = {"id": 1, "value": "[1.1,2.1,1.1,1.3]"}
    # NOTE(review): 3 presumably is the expected row count (the data file
    # carries three RECORD messages) -- confirm against verify_data.
    verify_data(postgres_target, "array_float_vector", 3, "id", expected_first_row)

    expected_columns = {
        "id": {"type": BIGINT},
        "value": {"type": vector_module.Vector},
    }
    verify_schema(
        postgres_target,
        "array_float_vector",
        check_columns=expected_columns,
    )
478+
479+
461480
def test_array_number(postgres_target):
462481
file_name = "array_number.singer"
463482
singer_file_to_target(file_name, postgres_target)

0 commit comments

Comments
 (0)