Skip to content

Commit abcb147

Browse files
committed
feat: Add support for pgvector's vector data type
1 parent af77af4 commit abcb147

File tree

10 files changed

+141
-15
lines changed

10 files changed

+141
-15
lines changed

.github/workflows/ci_workflow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
pipx install poetry
3737
- name: Install dependencies
3838
run: |
39-
poetry install
39+
poetry install --all-extras
4040
- name: Run pytest
4141
run: |
4242
poetry run pytest --capture=no

README.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ tap-carbon-intensity | target-postgres --config /path/to/target-postgres-config.
102102

103103
```bash
104104
pipx install poetry
105-
poetry install
105+
poetry install --all-extras
106106
pipx install pre-commit
107107
pre-commit install
108108
```
@@ -152,6 +152,8 @@ develop your own Singer taps and targets.
152152

153153
## Data Types
154154

155+
### Mapping
156+
155157
The table below shows how this target maps jsonschema datatypes to Postgres datatypes.
156158

157159
| jsonschema | Postgres |
@@ -202,7 +204,20 @@ The below table shows how this tap will map between jsonschema datatypes and Pos
202204

203205
Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
204206

205-
If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority.
207+
When using [pgvector], the following type mapping applies in addition to the table above.
208+
209+
| jsonschema | Postgres |
210+
|------------------------------------------------|----------|
211+
| array (with additional SCHEMA annotations [1]) | vector |
212+
213+
[1] For example, `"storage": {"type": "vector", "dim": 4}`, where `dim` is the dimension passed to the `vector` column type.
214+
215+
### Resolution Order
216+
217+
If a column has multiple jsonschema types, there is a priority list for
218+
resolving the best type candidate, from the highest priority to the
219+
lowest priority.
220+
206221
- ARRAY(JSONB)
207222
- JSONB
208223
- TEXT
@@ -215,3 +230,9 @@ If a column has multiple jsonschema types, the following order is using to order
215230
- INTEGER
216231
- BOOLEAN
217232
- NOTYPE
233+
234+
When using [pgvector], the `pgvector.sqlalchemy.Vector` type is added to the bottom
235+
of the list, i.e. it has the lowest priority.
236+
237+
238+
[pgvector]: https://github.com/pgvector/pgvector

docker-compose.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
version: "2.1"
44
services:
55
postgres:
6-
image: docker.io/postgres:latest
6+
image: ankane/pgvector:latest
77
command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf
88
environment:
99
POSTGRES_USER: postgres
@@ -13,16 +13,19 @@ services:
1313
POSTGRES_INITDB_ARGS: --auth-host=cert
1414
# Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568
1515
volumes:
16+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
1617
- ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client.
1718
- ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate.
1819
- ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server.
1920
- ./ssl/pg_hba.conf:/var/lib/postgresql/pg_hba.conf # Configuration file to allow connection over SSL.
2021
ports:
2122
- "5432:5432"
2223
postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23
23-
image: docker.io/postgres:latest
24+
image: ankane/pgvector:latest
2425
environment:
2526
POSTGRES_PASSWORD: postgres
27+
volumes:
28+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
2629
ports:
2730
- 5433:5432
2831
ssh:
@@ -37,17 +40,20 @@ services:
3740
- PASSWORD_ACCESS=false
3841
- USER_NAME=melty
3942
volumes:
43+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
4044
- ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro
4145
ports:
4246
- "127.0.0.1:2223:2222"
4347
networks:
4448
- inner
4549
postgresdb:
46-
image: postgres:13.0
50+
image: ankane/pgvector:latest
4751
environment:
4852
POSTGRES_USER: postgres
4953
POSTGRES_PASSWORD: postgres
5054
POSTGRES_DB: main
55+
volumes:
56+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
5157
networks:
5258
inner:
5359
ipv4_address: 10.5.0.5

poetry.lock

Lines changed: 55 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ packages = [
3333
python = "<3.12,>=3.8.1"
3434
requests = "^2.25.1"
3535
singer-sdk = ">=0.28,<0.34"
36+
pgvector = { version="^0.2.4", optional = true }
3637
psycopg2-binary = "2.9.9"
3738
sqlalchemy = ">=2.0,<3.0"
3839
sshtunnel = "0.4.0"
@@ -50,6 +51,9 @@ types-simplejson = "^3.19.0.2"
5051
types-sqlalchemy = "^1.4.53.38"
5152
types-jsonschema = "^4.19.0.3"
5253

54+
[tool.poetry.extras]
55+
pgvector = ["pgvector"]
56+
5357
[tool.mypy]
5458
exclude = "tests"
5559

target_postgres/connector.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,19 @@ def pick_individual_type(jsonschema_type: dict):
277277
if "object" in jsonschema_type["type"]:
278278
return JSONB()
279279
if "array" in jsonschema_type["type"]:
280+
# FIXME: This currently uses a non-conformant
281+
# definition for the Singer SCHEMA. Example:
282+
# {"type": "array",
283+
# "items": {"type": "number"},
284+
# "storage": {"type": "vector", "dim": 4}}
285+
if (
286+
"storage" in jsonschema_type
287+
and "type" in jsonschema_type["storage"]
288+
and jsonschema_type["storage"]["type"] == "vector"
289+
):
290+
from pgvector.sqlalchemy import Vector
291+
292+
return Vector(jsonschema_type["storage"]["dim"])
280293
return ARRAY(JSONB())
281294
if jsonschema_type.get("format") == "date-time":
282295
return TIMESTAMP()
@@ -310,6 +323,13 @@ def pick_best_sql_type(sql_type_array: list):
310323
NOTYPE,
311324
]
312325

326+
try:
327+
from pgvector.sqlalchemy import Vector
328+
329+
precedence_order.append(Vector)
330+
except ImportError:
331+
pass
332+
313333
for sql_type in precedence_order:
314334
for obj in sql_type_array:
315335
if isinstance(obj, sql_type):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}}}}
2+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}}
3+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}}
4+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}}
5+
{"type": "STATE", "value": {"array_float_vector": 3}}

target_postgres/tests/init.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CREATE EXTENSION IF NOT EXISTS vector;

target_postgres/tests/test_target_postgres.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,25 @@ def test_array_boolean(postgres_target):
458458
)
459459

460460

461+
def test_array_float_vector(postgres_target):
462+
pgvector_sa = pytest.importorskip("pgvector.sqlalchemy")
463+
file_name = "array_float_vector.singer"
464+
singer_file_to_target(file_name, postgres_target)
465+
row = {
466+
"id": 1,
467+
"value": "[1.1,2.1,1.1,1.3]",
468+
}
469+
verify_data(postgres_target, "array_float_vector", 3, "id", row)
470+
verify_schema(
471+
postgres_target,
472+
"array_float_vector",
473+
check_columns={
474+
"id": {"type": BIGINT},
475+
"value": {"type": pgvector_sa.Vector},
476+
},
477+
)
478+
479+
461480
def test_array_number(postgres_target):
462481
file_name = "array_number.singer"
463482
singer_file_to_target(file_name, postgres_target)

tox.ini

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ isolated_build = true
99
allowlist_externals = poetry
1010

1111
commands =
12-
poetry install -v
12+
poetry install --all-extras -v
1313
poetry run pytest
1414
poetry run black --check target_postgres/
1515
poetry run flake8 target_postgres
@@ -21,22 +21,22 @@ commands =
2121
# To execute, run `tox -e pytest`
2222
envlist = py37, py38, py39
2323
commands =
24-
poetry install -v
24+
poetry install --all-extras -v
2525
poetry run pytest
2626

2727
[testenv:format]
2828
# Attempt to auto-resolve lint errors before they are raised.
2929
# To execute, run `tox -e format`
3030
commands =
31-
poetry install -v
31+
poetry install --all-extras -v
3232
poetry run black target_postgres/
3333
poetry run isort target_postgres
3434

3535
[testenv:lint]
3636
# Raise an error if lint and style standards are not met.
3737
# To execute, run `tox -e lint`
3838
commands =
39-
poetry install -v
39+
poetry install --all-extras -v
4040
poetry run black --check --diff target_postgres/
4141
poetry run isort --check target_postgres
4242
poetry run flake8 target_postgres

0 commit comments

Comments
 (0)