@@ -277,6 +277,53 @@ def pick_individual_type(jsonschema_type: dict):
277
277
if "object" in jsonschema_type ["type" ]:
278
278
return JSONB ()
279
279
if "array" in jsonschema_type ["type" ]:
280
+ # Select between different kinds of `ARRAY` data types.
281
+ #
282
+ # This currently leverages an unspecified definition for the Singer SCHEMA,
283
+ # using the `additionalProperties` attribute to convey additional type
284
+ # information, agnostic of the target database.
285
+ #
286
+ # In this case, it is about telling different kinds of `ARRAY` types apart:
287
+ # Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
288
+ # alternatively, it can be a "vector"-kind `ARRAY` of floating-point
289
+ # numbers, effectively what pgvector stores in its `VECTOR` type.
290
+ #
291
+ # Still, `type: "vector"` is only a surrogate label here, because other
292
+ # database systems may use different types for implementing the same thing,
293
+ # and need to translate accordingly.
294
+ """
295
+ Schema override rule in `meltano.yml`:
296
+
297
+ type: "array"
298
+ items:
299
+ type: "number"
300
+ additionalProperties:
301
+ storage:
302
+ type: "vector"
303
+ dim: 4
304
+
305
+ Produced schema annotation in `catalog.json`:
306
+
307
+ {"type": "array",
308
+ "items": {"type": "number"},
309
+ "additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
310
+ """
311
+ if (
312
+ "additionalProperties" in jsonschema_type
313
+ and "storage" in jsonschema_type ["additionalProperties" ]
314
+ ):
315
+ storage_properties = jsonschema_type ["additionalProperties" ]["storage" ]
316
+ if (
317
+ "type" in storage_properties
318
+ and storage_properties ["type" ] == "vector"
319
+ ):
320
+ # On PostgreSQL/pgvector, use the corresponding type definition
321
+ # from its SQLAlchemy dialect.
322
+ from pgvector .sqlalchemy import (
323
+ Vector , # type: ignore[import-untyped]
324
+ )
325
+
326
+ return Vector (storage_properties ["dim" ])
280
327
return ARRAY (JSONB ())
281
328
if jsonschema_type .get ("format" ) == "date-time" :
282
329
return TIMESTAMP ()
@@ -310,6 +357,13 @@ def pick_best_sql_type(sql_type_array: list):
310
357
NOTYPE ,
311
358
]
312
359
360
+ try :
361
+ from pgvector .sqlalchemy import Vector
362
+
363
+ precedence_order .append (Vector )
364
+ except ImportError :
365
+ pass
366
+
313
367
for sql_type in precedence_order :
314
368
for obj in sql_type_array :
315
369
if isinstance (obj , sql_type ):
0 commit comments