Skip to content

Commit 0f0f4fa

Browse files
committed
feat: Add support for pgvector's vector data type
1 parent daeb62d commit 0f0f4fa

File tree

10 files changed

+187
-17
lines changed

10 files changed

+187
-17
lines changed

.github/workflows/ci_workflow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
pipx install poetry
3737
- name: Install dependencies
3838
run: |
39-
poetry install
39+
poetry install --all-extras
4040
- name: Run pytest
4141
run: |
4242
poetry run pytest --capture=no

README.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ tap-carbon-intensity | target-postgres --config /path/to/target-postgres-config.
102102

103103
```bash
104104
pipx install poetry
105-
poetry install
105+
poetry install --all-extras
106106
pipx install pre-commit
107107
pre-commit install
108108
```
@@ -152,6 +152,8 @@ develop your own Singer taps and targets.
152152

153153
## Data Types
154154

155+
### Mapping
156+
155157
The table below shows how this target maps jsonschema datatypes to Postgres datatypes.
156158

157159
| jsonschema | Postgres |
@@ -202,7 +204,20 @@ The below table shows how this tap will map between jsonschema datatypes and Pos
202204

203205
Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
204206

205-
If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority.
207+
When using [pgvector], the following type mapping applies in addition to the table above.
208+
209+
| jsonschema | Postgres |
210+
|------------------------------------------------|----------|
211+
| array (with additional SCHEMA annotations [1]) | vector |
212+
213+
[1] `"additionalProperties": {"storage": {"type": "vector", "dim": 4}}`
214+
215+
### Resolution Order
216+
217+
If a column has multiple jsonschema types, there is a priority list for
218+
resolving the best type candidate, from the highest priority to the
219+
lowest priority.
220+
206221
- ARRAY(JSONB)
207222
- JSONB
208223
- TEXT
@@ -215,3 +230,9 @@ If a column has multiple jsonschema types, the following order is using to order
215230
- INTEGER
216231
- BOOLEAN
217232
- NOTYPE
233+
234+
When using [pgvector], the `pgvector.sqlalchemy.Vector` type is added to the bottom
235+
of the list.
236+
237+
238+
[pgvector]: https://github.com/pgvector/pgvector

docker-compose.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
version: "2.1"
44
services:
55
postgres:
6-
image: docker.io/postgres:latest
6+
image: ankane/pgvector:latest
77
command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf
88
environment:
99
POSTGRES_USER: postgres
@@ -13,16 +13,19 @@ services:
1313
POSTGRES_INITDB_ARGS: --auth-host=cert
1414
# Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568
1515
volumes:
16+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
1617
- ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client.
1718
- ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate.
1819
- ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server.
1920
- ./ssl/pg_hba.conf:/var/lib/postgresql/pg_hba.conf # Configuration file to allow connection over SSL.
2021
ports:
2122
- "5432:5432"
2223
postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23
23-
image: docker.io/postgres:latest
24+
image: ankane/pgvector:latest
2425
environment:
2526
POSTGRES_PASSWORD: postgres
27+
volumes:
28+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
2629
ports:
2730
- 5433:5432
2831
ssh:
@@ -37,17 +40,20 @@ services:
3740
- PASSWORD_ACCESS=false
3841
- USER_NAME=melty
3942
volumes:
43+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
4044
- ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro
4145
ports:
4246
- "127.0.0.1:2223:2222"
4347
networks:
4448
- inner
4549
postgresdb:
46-
image: postgres:13.0
50+
image: ankane/pgvector:latest
4751
environment:
4852
POSTGRES_USER: postgres
4953
POSTGRES_PASSWORD: postgres
5054
POSTGRES_DB: main
55+
volumes:
56+
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
5157
networks:
5258
inner:
5359
ipv4_address: 10.5.0.5

poetry.lock

Lines changed: 55 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ packages = [
3333
python = "<3.12,>=3.8.1"
3434
requests = "^2.25.1"
3535
singer-sdk = ">=0.28,<0.34"
36+
pgvector = { version="^0.2.4", optional = true }
3637
psycopg2-binary = "2.9.9"
3738
sqlalchemy = ">=2.0,<3.0"
3839
sshtunnel = "0.4.0"
@@ -50,11 +51,17 @@ types-simplejson = "^3.19.0.2"
5051
types-sqlalchemy = "^1.4.53.38"
5152
types-jsonschema = "^4.19.0.3"
5253

54+
[tool.poetry.extras]
55+
pgvector = ["pgvector"]
56+
5357
[tool.mypy]
5458
exclude = "tests"
5559

5660
[[tool.mypy.overrides]]
57-
module = ["sshtunnel"]
61+
module = [
62+
"pgvector.sqlalchemy",
63+
"sshtunnel",
64+
]
5865
ignore_missing_imports = true
5966

6067
[tool.isort]

target_postgres/connector.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,14 @@ def prepare_table( # type: ignore[override]
114114
connection=connection,
115115
)
116116
return table
117+
# To make table reflection work properly with pgvector,
118+
# the module needs to be imported beforehand.
119+
try:
120+
from pgvector.sqlalchemy import Vector # noqa: F401
121+
except ImportError:
122+
self.logger.debug(
123+
"Unable to handle pgvector's `Vector` type. Please install `pgvector`."
124+
)
117125
meta.reflect(connection, only=[table_name])
118126
table = meta.tables[
119127
full_table_name
@@ -277,6 +285,51 @@ def pick_individual_type(jsonschema_type: dict):
277285
if "object" in jsonschema_type["type"]:
278286
return JSONB()
279287
if "array" in jsonschema_type["type"]:
288+
# Select between different kinds of `ARRAY` data types.
289+
#
290+
# This currently leverages an unspecified definition for the Singer SCHEMA,
291+
# using the `additionalProperties` attribute to convey additional type
292+
# information, agnostic of the target database.
293+
#
294+
# In this case, it is about telling different kinds of `ARRAY` types apart:
295+
# Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
296+
# alternatively, it can be a "vector" kind `ARRAY` of floating point
297+
# numbers, effectively what pgvector is storing in its `VECTOR` type.
298+
#
299+
# Still, `type: "vector"` is only a surrogate label here, because other
300+
# database systems may use different types for implementing the same thing,
301+
# and need to translate accordingly.
302+
"""
303+
Schema override rule in `meltano.yml`:
304+
305+
type: "array"
306+
items:
307+
type: "number"
308+
additionalProperties:
309+
storage:
310+
type: "vector"
311+
dim: 4
312+
313+
Produced schema annotation in `catalog.json`:
314+
315+
{"type": "array",
316+
"items": {"type": "number"},
317+
"additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
318+
"""
319+
if (
320+
"additionalProperties" in jsonschema_type
321+
and "storage" in jsonschema_type["additionalProperties"]
322+
):
323+
storage_properties = jsonschema_type["additionalProperties"]["storage"]
324+
if (
325+
"type" in storage_properties
326+
and storage_properties["type"] == "vector"
327+
):
328+
# On PostgreSQL/pgvector, use the corresponding type definition
329+
# from its SQLAlchemy dialect.
330+
from pgvector.sqlalchemy import Vector
331+
332+
return Vector(storage_properties["dim"])
280333
return ARRAY(JSONB())
281334
if jsonschema_type.get("format") == "date-time":
282335
return TIMESTAMP()
@@ -310,6 +363,13 @@ def pick_best_sql_type(sql_type_array: list):
310363
NOTYPE,
311364
]
312365

366+
try:
367+
from pgvector.sqlalchemy import Vector
368+
369+
precedence_order.append(Vector)
370+
except ImportError:
371+
pass
372+
313373
for sql_type in precedence_order:
314374
for obj in sql_type_array:
315375
if isinstance(obj, sql_type):
@@ -516,7 +576,7 @@ def _adapt_column_type( # type: ignore[override]
516576
return
517577

518578
# Not the same type, generic type or compatible types
519-
# calling merge_sql_types for assistnace
579+
# calling merge_sql_types for assistance.
520580
compatible_sql_type = self.merge_sql_types([current_type, sql_type])
521581

522582
if str(compatible_sql_type) == str(current_type):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "additionalProperties": {"storage": {"type": "vector", "dim": 4}}}}}}
2+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}}
3+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}}
4+
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}}
5+
{"type": "STATE", "value": {"array_float_vector": 3}}

target_postgres/tests/init.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CREATE EXTENSION IF NOT EXISTS vector;

target_postgres/tests/test_target_postgres.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,26 @@ def test_array_boolean(postgres_target, helper):
471471
)
472472

473473

474+
def test_array_float_vector(postgres_target, helper):
475+
pgvector_sa = pytest.importorskip("pgvector.sqlalchemy")
476+
477+
file_name = "array_float_vector.singer"
478+
singer_file_to_target(file_name, postgres_target)
479+
row = {
480+
"id": 1,
481+
"value": "[1.1,2.1,1.1,1.3]",
482+
}
483+
helper.verify_data("array_float_vector", 3, "id", row)
484+
485+
helper.verify_schema(
486+
"array_float_vector",
487+
check_columns={
488+
"id": {"type": BIGINT},
489+
"value": {"type": pgvector_sa.Vector},
490+
},
491+
)
492+
493+
474494
def test_array_number(postgres_target, helper):
475495
file_name = "array_number.singer"
476496
singer_file_to_target(file_name, postgres_target)

tox.ini

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ isolated_build = true
99
allowlist_externals = poetry
1010

1111
commands =
12-
poetry install -v
12+
poetry install --all-extras -v
1313
poetry run pytest
1414
poetry run black --check target_postgres/
1515
poetry run flake8 target_postgres
@@ -21,22 +21,22 @@ commands =
2121
# To execute, run `tox -e pytest`
2222
envlist = py37, py38, py39
2323
commands =
24-
poetry install -v
24+
poetry install --all-extras -v
2525
poetry run pytest
2626

2727
[testenv:format]
2828
# Attempt to auto-resolve lint errors before they are raised.
2929
# To execute, run `tox -e format`
3030
commands =
31-
poetry install -v
31+
poetry install --all-extras -v
3232
poetry run black target_postgres/
3333
poetry run isort target_postgres
3434

3535
[testenv:lint]
3636
# Raise an error if lint and style standards are not met.
3737
# To execute, run `tox -e lint`
3838
commands =
39-
poetry install -v
39+
poetry install --all-extras -v
4040
poetry run black --check --diff target_postgres/
4141
poetry run isort --check target_postgres
4242
poetry run flake8 target_postgres

0 commit comments

Comments
 (0)