Skip to content

Commit c15e8aa

Browse files
committed
feat: Add support for pgvector's vector data type
1 parent af77af4 commit c15e8aa

File tree

10 files changed

+174
-15
lines changed

10 files changed

+174
-15
lines changed

.github/workflows/ci_workflow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
pipx install poetry
3737
- name: Install dependencies
3838
run: |
39-
poetry install
39+
poetry install --all-extras
4040
- name: Run pytest
4141
run: |
4242
poetry run pytest --capture=no

README.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ tap-carbon-intensity | target-postgres --config /path/to/target-postgres-config.
102102

103103
```bash
104104
pipx install poetry
105-
poetry install
105+
poetry install --all-extras
106106
pipx install pre-commit
107107
pre-commit install
108108
```
@@ -152,6 +152,8 @@ develop your own Singer taps and targets.
152152

153153
## Data Types
154154

155+
### Mapping
156+
155157
The table below shows how this target maps jsonschema datatypes to Postgres datatypes.
156158

157159
| jsonschema | Postgres |
@@ -202,7 +204,20 @@ The below table shows how this tap will map between jsonschema datatypes and Pos
202204

203205
Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
204206

205-
If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority.
207+
When using [pgvector], the following type mapping applies in addition to the table above.
208+
209+
| jsonschema | Postgres |
210+
|------------------------------------------------|----------|
211+
| array (with additional SCHEMA annotations [1]) | vector |
212+
213+
[1] `"additionalProperties": {"storage": {"type": "vector", "dim": 4}}`
214+
215+
### Resolution Order
216+
217+
If a column has multiple jsonschema types, there is a priority list for
218+
resolving the best type candidate, from the highest priority to the
219+
lowest priority.
220+
206221
- ARRAY(JSONB)
207222
- JSONB
208223
- TEXT
@@ -215,3 +230,9 @@ If a column has multiple jsonschema types, the following order is using to order
215230
- INTEGER
216231
- BOOLEAN
217232
- NOTYPE
233+
234+
When using [pgvector], the `pgvector.sqlalchemy.Vector` type is added to the bottom
235+
of the list.
236+
237+
238+
[pgvector]: https://github.com/pgvector/pgvector

docker-compose.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
version: "2.1"
44
services:
55
postgres:
6-
image: docker.io/postgres:latest
6+
image: ankane/pgvector:latest
77
command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf
88
environment:
99
POSTGRES_USER: postgres
@@ -13,16 +13,19 @@ services:
1313
POSTGRES_INITDB_ARGS: --auth-host=cert
1414
# Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568
1515
volumes:
16+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
1617
- ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client.
1718
- ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate.
1819
- ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server.
1920
- ./ssl/pg_hba.conf:/var/lib/postgresql/pg_hba.conf # Configuration file to allow connection over SSL.
2021
ports:
2122
- "5432:5432"
2223
postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23
23-
image: docker.io/postgres:latest
24+
image: ankane/pgvector:latest
2425
environment:
2526
POSTGRES_PASSWORD: postgres
27+
volumes:
28+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
2629
ports:
2730
- 5433:5432
2831
ssh:
@@ -37,17 +40,20 @@ services:
3740
- PASSWORD_ACCESS=false
3841
- USER_NAME=melty
3942
volumes:
43+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
4044
- ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro
4145
ports:
4246
- "127.0.0.1:2223:2222"
4347
networks:
4448
- inner
4549
postgresdb:
46-
image: postgres:13.0
50+
image: ankane/pgvector:latest
4751
environment:
4852
POSTGRES_USER: postgres
4953
POSTGRES_PASSWORD: postgres
5054
POSTGRES_DB: main
55+
volumes:
56+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
5157
networks:
5258
inner:
5359
ipv4_address: 10.5.0.5

poetry.lock

Lines changed: 55 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ packages = [
3333
python = "<3.12,>=3.8.1"
3434
requests = "^2.25.1"
3535
singer-sdk = ">=0.28,<0.34"
36+
pgvector = { version="^0.2.4", optional = true }
3637
psycopg2-binary = "2.9.9"
3738
sqlalchemy = ">=2.0,<3.0"
3839
sshtunnel = "0.4.0"
@@ -50,6 +51,9 @@ types-simplejson = "^3.19.0.2"
5051
types-sqlalchemy = "^1.4.53.38"
5152
types-jsonschema = "^4.19.0.3"
5253

54+
[tool.poetry.extras]
55+
pgvector = ["pgvector"]
56+
5357
[tool.mypy]
5458
exclude = "tests"
5559

target_postgres/connector.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,53 @@ def pick_individual_type(jsonschema_type: dict):
277277
if "object" in jsonschema_type["type"]:
278278
return JSONB()
279279
if "array" in jsonschema_type["type"]:
280+
# Select between different kinds of `ARRAY` data types.
281+
#
282+
# This currently leverages an unspecified definition for the Singer SCHEMA,
283+
# using the `additionalProperties` attribute to convey additional type
284+
# information, agnostic of the target database.
285+
#
286+
# In this case, it is about telling different kinds of `ARRAY` types apart:
287+
# Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
288+
# alternatively, it can be a "vector" kind `ARRAY` of floating point
289+
# numbers, effectively what pgvector is storing in its `VECTOR` type.
290+
#
291+
# Still, `type: "vector"` is only a surrogate label here, because other
292+
# database systems may use different types for implementing the same thing,
293+
# and need to translate accordingly.
294+
"""
295+
Schema override rule in `meltano.yml`:
296+
297+
type: "array"
298+
items:
299+
type: "number"
300+
additionalProperties:
301+
storage:
302+
type: "vector"
303+
dim: 4
304+
305+
Produced schema annotation in `catalog.json`:
306+
307+
{"type": "array",
308+
"items": {"type": "number"},
309+
"additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
310+
"""
311+
if (
312+
"additionalProperties" in jsonschema_type
313+
and "storage" in jsonschema_type["additionalProperties"]
314+
):
315+
storage_properties = jsonschema_type["additionalProperties"]["storage"]
316+
if (
317+
"type" in storage_properties
318+
and storage_properties["type"] == "vector"
319+
):
320+
# On PostgreSQL/pgvector, use the corresponding type definition
321+
# from its SQLAlchemy dialect.
322+
from pgvector.sqlalchemy import (
323+
Vector, # type: ignore[import-untyped]
324+
)
325+
326+
return Vector(storage_properties["dim"])
280327
return ARRAY(JSONB())
281328
if jsonschema_type.get("format") == "date-time":
282329
return TIMESTAMP()
@@ -310,6 +357,13 @@ def pick_best_sql_type(sql_type_array: list):
310357
NOTYPE,
311358
]
312359

360+
try:
361+
from pgvector.sqlalchemy import Vector
362+
363+
precedence_order.append(Vector)
364+
except ImportError:
365+
pass
366+
313367
for sql_type in precedence_order:
314368
for obj in sql_type_array:
315369
if isinstance(obj, sql_type):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}}}}
2+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}}
3+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}}
4+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}}
5+
{"type": "STATE", "value": {"array_float_vector": 3}}

target_postgres/tests/init.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CREATE EXTENSION IF NOT EXISTS vector;

target_postgres/tests/test_target_postgres.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,24 @@ def test_array_boolean(postgres_target):
458458
)
459459

460460

461+
def test_array_float_vector(postgres_target):
462+
file_name = "array_float_vector.singer"
463+
singer_file_to_target(file_name, postgres_target)
464+
row = {
465+
"id": 1,
466+
"value": [Decimal("1.1"), Decimal("2.1"), Decimal("1.1"), Decimal("1.3")],
467+
}
468+
verify_data(postgres_target, "array_float_vector", 3, "id", row)
469+
verify_schema(
470+
postgres_target,
471+
"array_float_vector",
472+
check_columns={
473+
"id": {"type": BIGINT},
474+
"value": {"type": ARRAY},
475+
},
476+
)
477+
478+
461479
def test_array_number(postgres_target):
462480
file_name = "array_number.singer"
463481
singer_file_to_target(file_name, postgres_target)

tox.ini

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ isolated_build = true
99
allowlist_externals = poetry
1010

1111
commands =
12-
poetry install -v
12+
poetry install --all-extras -v
1313
poetry run pytest
1414
poetry run black --check target_postgres/
1515
poetry run flake8 target_postgres
@@ -21,22 +21,22 @@ commands =
2121
# To execute, run `tox -e pytest`
2222
envlist = py37, py38, py39
2323
commands =
24-
poetry install -v
24+
poetry install --all-extras -v
2525
poetry run pytest
2626

2727
[testenv:format]
2828
# Attempt to auto-resolve lint errors before they are raised.
2929
# To execute, run `tox -e format`
3030
commands =
31-
poetry install -v
31+
poetry install --all-extras -v
3232
poetry run black target_postgres/
3333
poetry run isort target_postgres
3434

3535
[testenv:lint]
3636
# Raise an error if lint and style standards are not met.
3737
# To execute, run `tox -e lint`
3838
commands =
39-
poetry install -v
39+
poetry install --all-extras -v
4040
poetry run black --check --diff target_postgres/
4141
poetry run isort --check target_postgres
4242
poetry run flake8 target_postgres

0 commit comments

Comments
 (0)