Skip to content

Commit 73a59c8

Browse files
1 parent 98cef12 commit 73a59c8

File tree

6 files changed

+260
-6
lines changed

6 files changed

+260
-6
lines changed

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,18 @@ If a column has multiple jsonschema types, the following order is used to order
302302
- BOOLEAN
303303
- NOTYPE
304304

305+
### `x-sql-datatype` extension
306+
307+
This target supports the [`x-sql-datatype` extension](https://sdk.meltano.com/en/latest/guides/sql-target.html#use-the-x-sql-datatype-json-schema-extension) to the JSON schema. This extension allows you to specify the Postgres data type that should be used for a given field. This can be useful when the default mapping is not what you want.
308+
309+
<!-- insert a table with the mapping -->
310+
311+
| `x-sql-datatype` | Postgres | Description |
312+
| :--------------- | :------- | :----------------------------------------------------------------- |
313+
| smallint | smallint | small-range integer (-32768 to +32767) |
314+
| integer | integer | typical choice for integer (-2147483648 to +2147483647) |
315+
| bigint | bigint | large-range integer (-9223372036854775808 to +9223372036854775807) |
316+
305317
### Using the Singer catalog to narrow down the Postgres data types
306318

307319
You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap:
@@ -320,6 +332,20 @@ plugins:
320332
maximum: 1000
321333
```
322334

335+
Or to use the `x-sql-datatype` extension:
336+
337+
```yaml
338+
# meltano.yml
339+
plugins:
340+
extractors:
341+
- name: tap-my-tap
342+
schema:
343+
some_stream_id:
344+
my_column:
345+
type: integer
346+
x-sql-datatype: smallint
347+
```
348+
323349
## Content Encoding Support
324350

325351
Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types.

meltano.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ project_id: target-postgres
55
plugins:
66
extractors:
77
- name: tap-smoke-test
8-
namespace: tap_smoke_test
8+
variant: meltano
99
pip_url: git+https://github.com/meltano/tap-smoke-test.git
1010
executable: tap-smoke-test
1111
config:
@@ -19,6 +19,11 @@ plugins:
1919
__key_properties__: [id]
2020
page_views:
2121
__key_properties__: [vistor_id]
22+
schema:
23+
animals:
24+
views:
25+
type: integer
26+
x-sql-datatype: smallint
2227
- name: tap-github
2328
variant: meltanolabs
2429
pip_url: git+https://github.com/MeltanoLabs/tap-github.git

plugins/extractors/tap-github--meltanolabs.lock

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
"label": "GitHub",
77
"docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs",
88
"repo": "https://github.com/MeltanoLabs/tap-github",
9-
"pip_url": "git+https://github.com/MeltanoLabs/tap-github.git",
9+
"pip_url": "meltanolabs-tap-github",
1010
"description": "Code hosting platform",
1111
"logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png",
1212
"capabilities": [
1313
"about",
14+
"batch",
1415
"catalog",
1516
"discover",
1617
"schema-flattening",
@@ -41,22 +42,90 @@
4142
"label": "Additional Auth Tokens",
4243
"description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits."
4344
},
45+
{
46+
"name": "auth_app_keys",
47+
"kind": "array",
48+
"label": "Auth App Keys",
49+
"description": "List of GitHub App credentials to authenticate with. Each credential can be constructed by combining an App ID and App private key into the format `:app_id:;;-----BEGIN RSA PRIVATE KEY----- _YOUR_P_KEY_ -----END RSA PRIVATE KEY-----`."
50+
},
4451
{
4552
"name": "auth_token",
46-
"kind": "password",
53+
"kind": "string",
4754
"label": "Auth Token",
48-
"description": "GitHub token to authenticate with."
55+
"description": "GitHub token to authenticate with.",
56+
"sensitive": true
57+
},
58+
{
59+
"name": "batch_config.encoding.compression",
60+
"kind": "options",
61+
"label": "Batch Compression Format",
62+
"description": "Compression format to use for batch files.",
63+
"options": [
64+
{
65+
"label": "GZIP",
66+
"value": "gzip"
67+
},
68+
{
69+
"label": "None",
70+
"value": "none"
71+
}
72+
]
73+
},
74+
{
75+
"name": "batch_config.encoding.format",
76+
"kind": "options",
77+
"label": "Batch Encoding Format",
78+
"description": "Format to use for batch files.",
79+
"options": [
80+
{
81+
"label": "JSONL",
82+
"value": "jsonl"
83+
},
84+
{
85+
"label": "Parquet",
86+
"value": "parquet"
87+
}
88+
]
89+
},
90+
{
91+
"name": "batch_config.storage.prefix",
92+
"kind": "string",
93+
"label": "Batch Storage Prefix",
94+
"description": "Prefix to use when writing batch files."
95+
},
96+
{
97+
"name": "batch_config.storage.root",
98+
"kind": "string",
99+
"label": "Batch Storage Root",
100+
"description": "Root path to use when writing batch files."
101+
},
102+
{
103+
"name": "expiry_time_buffer",
104+
"kind": "integer",
105+
"label": "Expiry Time Buffer"
106+
},
107+
{
108+
"name": "faker_config.locale",
109+
"kind": "array",
110+
"label": "Faker Locale",
111+
"description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization"
112+
},
113+
{
114+
"name": "faker_config.seed",
115+
"kind": "string",
116+
"label": "Faker Seed",
117+
"description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator"
49118
},
50119
{
51120
"name": "flattening_enabled",
52121
"kind": "boolean",
53-
"label": "Flattening Enabled",
122+
"label": "Enable Schema Flattening",
54123
"description": "'True' to enable schema flattening and automatically expand nested properties."
55124
},
56125
{
57126
"name": "flattening_max_depth",
58127
"kind": "integer",
59-
"label": "Flattening Max Depth",
128+
"label": "Max Flattening Depth",
60129
"description": "The max depth to flatten schemas."
61130
},
62131
{
@@ -110,6 +179,27 @@
110179
"kind": "object",
111180
"label": "Stream Maps"
112181
},
182+
{
183+
"name": "stream_options.milestones.state",
184+
"kind": "options",
185+
"value": "open",
186+
"label": "Stream Options Milestones State",
187+
"description": "Configures which states are of interest. Must be one of [open, closed, all], defaults to open.",
188+
"options": [
189+
{
190+
"label": "Open",
191+
"value": "open"
192+
},
193+
{
194+
"label": "Closed",
195+
"value": "closed"
196+
},
197+
{
198+
"label": "All",
199+
"value": "all"
200+
}
201+
]
202+
},
113203
{
114204
"name": "user_agent",
115205
"kind": "string",
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
{
2+
"plugin_type": "extractors",
3+
"name": "tap-smoke-test",
4+
"namespace": "tap_smoke_test",
5+
"variant": "meltano",
6+
"label": "Smoke Test",
7+
"docs": "https://hub.meltano.com/extractors/tap-smoke-test--meltano",
8+
"repo": "https://github.com/meltano/tap-smoke-test",
9+
"pip_url": "git+https://github.com/meltano/tap-smoke-test.git",
10+
"executable": "tap-smoke-test",
11+
"description": "Generates sample data to be used for testing.",
12+
"logo_url": "https://hub.meltano.com/assets/logos/extractors/smoke-test.png",
13+
"capabilities": [
14+
"about",
15+
"batch",
16+
"catalog",
17+
"discover",
18+
"schema-flattening",
19+
"state",
20+
"stream-maps"
21+
],
22+
"settings_group_validation": [
23+
[
24+
"streams"
25+
]
26+
],
27+
"settings": [
28+
{
29+
"name": "batch_config.encoding.compression",
30+
"kind": "options",
31+
"label": "Batch Compression Format",
32+
"description": "Compression format to use for batch files.",
33+
"options": [
34+
{
35+
"label": "GZIP",
36+
"value": "gzip"
37+
},
38+
{
39+
"label": "None",
40+
"value": "none"
41+
}
42+
]
43+
},
44+
{
45+
"name": "batch_config.encoding.format",
46+
"kind": "options",
47+
"label": "Batch Encoding Format",
48+
"description": "Format to use for batch files.",
49+
"options": [
50+
{
51+
"label": "JSONL",
52+
"value": "jsonl"
53+
},
54+
{
55+
"label": "Parquet",
56+
"value": "parquet"
57+
}
58+
]
59+
},
60+
{
61+
"name": "batch_config.storage.prefix",
62+
"kind": "string",
63+
"label": "Batch Storage Prefix",
64+
"description": "Prefix to use when writing batch files."
65+
},
66+
{
67+
"name": "batch_config.storage.root",
68+
"kind": "string",
69+
"label": "Batch Storage Root",
70+
"description": "Root path to use when writing batch files."
71+
},
72+
{
73+
"name": "faker_config.locale",
74+
"kind": "array",
75+
"label": "Faker Locale",
76+
"description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization"
77+
},
78+
{
79+
"name": "faker_config.seed",
80+
"kind": "string",
81+
"label": "Faker Seed",
82+
"description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator"
83+
},
84+
{
85+
"name": "flattening_enabled",
86+
"kind": "boolean",
87+
"label": "Enable Schema Flattening",
88+
"description": "'True' to enable schema flattening and automatically expand nested properties."
89+
},
90+
{
91+
"name": "flattening_max_depth",
92+
"kind": "integer",
93+
"label": "Max Flattening Depth",
94+
"description": "The max depth to flatten schemas."
95+
},
96+
{
97+
"name": "schema_inference_record_count",
98+
"kind": "integer",
99+
"value": 5,
100+
"label": "Schema Inference Record Count",
101+
"description": "How many records of the source data should be used for schema inference/construction."
102+
},
103+
{
104+
"name": "stream_map_config",
105+
"kind": "object",
106+
"label": "User Stream Map Configuration",
107+
"description": "User-defined config values to be used within map expressions."
108+
},
109+
{
110+
"name": "stream_maps",
111+
"kind": "object",
112+
"label": "Stream Maps",
113+
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)."
114+
},
115+
{
116+
"name": "streams",
117+
"kind": "array",
118+
"label": "Streams",
119+
"description": "An array of objects containing:\n* `stream_name`: The name of the stream.\n* `input_filename`: Path to a jsonl file containing records to use for mock data.\n* `client_exception`: (Default False) Whether we should simulate failing by having the client raise an exception.\n* `schema_gen_exception`: (Default False) Whether we should simulate failing by raising an exception during schema inference.\n* `loop_count`: (Default 1) The number of times we should playback the input file.\n\nFor example:\n\n```yaml\nstreams:\n- stream_name: animals\n input_filename: https://raw.githubusercontent.com/meltano/tap-smoke-test/main/demo-data/animals-data.jsonl\n```\n"
120+
}
121+
]
122+
}

target_postgres/connector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,9 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL:
310310
to_sql.register_format_handler("hostname", TEXT)
311311
to_sql.register_format_handler("ipv4", TEXT)
312312
to_sql.register_format_handler("ipv6", TEXT)
313+
to_sql.register_sql_datatype_handler("smallint", SMALLINT)
314+
to_sql.register_sql_datatype_handler("integer", INTEGER)
315+
to_sql.register_sql_datatype_handler("bigint", BIGINT)
313316
return to_sql
314317

315318
def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine:

target_postgres/tests/test_types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ def test_datetime_string(self, to_postgres: JSONSchemaToPostgres):
9393
BIGINT,
9494
id="bigint",
9595
),
96+
pytest.param(
97+
{
98+
"type": "integer",
99+
"x-sql-datatype": "smallint",
100+
},
101+
SMALLINT,
102+
id="x-sql-datatype-smallint",
103+
),
96104
],
97105
)
98106
def test_integers(

0 commit comments

Comments
 (0)