From dc9798d6bcac9babc7f61bdef89d0304aac9c732 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Thu, 26 Jun 2025 13:30:52 -0700 Subject: [PATCH 01/34] chore: Add sift-stream-bindings as a dep --- python/lib/sift_py/grpc/transport.py | 13 ++++++++++--- python/pyproject.toml | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py index b6aff1438..d2db9245d 100644 --- a/python/lib/sift_py/grpc/transport.py +++ b/python/lib/sift_py/grpc/transport.py @@ -40,8 +40,12 @@ def get_ssl_credentials(cert_via_openssl: bool) -> grpc.ChannelCredentials: ssl_context = ssl.create_default_context() certs_der = ssl_context.get_ca_certs(binary_form=True) - certs_x509 = [crypto.load_certificate(crypto.FILETYPE_ASN1, x) for x in certs_der] - certs_pem = [crypto.dump_certificate(crypto.FILETYPE_PEM, x) for x in certs_x509] + certs_x509 = [ + crypto.load_certificate(crypto.FILETYPE_ASN1, x) for x in certs_der + ] + certs_pem = [ + crypto.dump_certificate(crypto.FILETYPE_PEM, x) for x in certs_x509 + ] certs_bytes = b"".join(certs_pem) return grpc.ssl_channel_credentials(certs_bytes) @@ -225,7 +229,10 @@ def _compute_keep_alive_channel_opts(config: KeepaliveConfig) -> List[Tuple[str, ("grpc.keepalive_time_ms", config["keepalive_time_ms"]), ("grpc.keepalive_timeout_ms", config["keepalive_timeout_ms"]), ("grpc.http2.max_pings_without_data", config["max_pings_without_data"]), - ("grpc.keepalive_permit_without_calls", config["keepalive_permit_without_calls"]), + ( + "grpc.keepalive_permit_without_calls", + config["keepalive_permit_without_calls"], + ), ] diff --git a/python/pyproject.toml b/python/pyproject.toml index a05067997..90e14ef67 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pydantic_core~=2.3", "requests~=2.25", "requests-toolbelt~=1.0", + "sift-stream-bindings>=0.1", "alive-progress~=3.0", # May move these to optional dependencies in the future. "pandas-stubs~=2.0", From b0a3e2387cef0534e84d05ba101c73d880d1c546 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Fri, 27 Jun 2025 15:27:06 -0700 Subject: [PATCH 02/34] feat: Wrap grpc.Channel with a class that includes config --- python/lib/sift_py/grpc/transport.py | 41 +++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py index d2db9245d..02d244a6f 100644 --- a/python/lib/sift_py/grpc/transport.py +++ b/python/lib/sift_py/grpc/transport.py @@ -21,7 +21,30 @@ from sift_py.grpc._retry import RetryPolicy from sift_py.grpc.keepalive import DEFAULT_KEEPALIVE_CONFIG, KeepaliveConfig -SiftChannel: TypeAlias = grpc.Channel + +class SiftChannelWithConfig: + """ + A wrapper around grpc.Channel that includes the configuration used to create it. + This allows access to the original config for debugging or other purposes. 
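+
+    A minimal usage sketch (hypothetical URI and API key; in practice
+    `use_sift_channel` below constructs this wrapper for you):
+
+        config: SiftChannelConfig = {"uri": "sift.example.com", "apikey": "my-key"}
+        channel = use_sift_channel(config)
+        uri = channel.config["uri"]  # the original config stays accessible
+        channel.close()              # everything else is delegated to grpc.Channel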
+ """ + + def __init__(self, config: SiftChannelConfig, channel: grpc.Channel): + self._channel = channel + self.config = config + + def __getattr__(self, name): + # Delegate all other attributes to the underlying channel + return getattr(self._channel, name) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Close the underlying channel + self._channel.close() + + +SiftChannel: TypeAlias = SiftChannelWithConfig SiftAsyncChannel: TypeAlias = grpc_aio.Channel @@ -40,12 +63,8 @@ def get_ssl_credentials(cert_via_openssl: bool) -> grpc.ChannelCredentials: ssl_context = ssl.create_default_context() certs_der = ssl_context.get_ca_certs(binary_form=True) - certs_x509 = [ - crypto.load_certificate(crypto.FILETYPE_ASN1, x) for x in certs_der - ] - certs_pem = [ - crypto.dump_certificate(crypto.FILETYPE_PEM, x) for x in certs_x509 - ] + certs_x509 = [crypto.load_certificate(crypto.FILETYPE_ASN1, x) for x in certs_der] + certs_pem = [crypto.dump_certificate(crypto.FILETYPE_PEM, x) for x in certs_x509] certs_bytes = b"".join(certs_pem) return grpc.ssl_channel_credentials(certs_bytes) @@ -71,14 +90,16 @@ def use_sift_channel( cert_via_openssl = config.get("cert_via_openssl", False) if not use_ssl: - return _use_insecure_sift_channel(config, metadata) + channel = _use_insecure_sift_channel(config, metadata) + return SiftChannelWithConfig(config, channel) credentials = get_ssl_credentials(cert_via_openssl) options = _compute_channel_options(config) api_uri = _clean_uri(config["uri"], use_ssl) channel = grpc.secure_channel(api_uri, credentials, options) interceptors = _compute_sift_interceptors(config, metadata) - return grpc.intercept_channel(channel, *interceptors) + intercepted_channel = grpc.intercept_channel(channel, *interceptors) + return SiftChannelWithConfig(config, intercepted_channel) def use_sift_async_channel( @@ -104,7 +125,7 @@ def use_sift_async_channel( def _use_insecure_sift_channel( config: SiftChannelConfig, metadata: Optional[Dict[str, Any]] = None -) -> SiftChannel: +) -> grpc.Channel: """ FOR DEVELOPMENT PURPOSES ONLY """ From 0bfaa9bd26e4e07065fa521f276d2d201c8dad3b Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Fri, 27 Jun 2025 17:33:47 -0700 Subject: [PATCH 03/34] feat: Initialize builder in IngestionService --- python/lib/sift_py/ingestion/_internal/ingest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 8df404478..8a23dad16 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -13,6 +13,7 @@ from sift.ingest.v1.ingest_pb2_grpc import IngestServiceStub from sift.ingestion_configs.v2.ingestion_configs_pb2 import ChannelConfig as ChannelConfigPb from sift.ingestion_configs.v2.ingestion_configs_pb2 import IngestionConfig +from sift_stream_bindings import SiftStreamBuilderPy from sift_py.grpc.transport import SiftChannel from sift_py.ingestion._internal.error import IngestionValidationError @@ -81,6 +82,7 @@ def __init__( rule.asset_names.append(config.asset_name) self.rule_service.create_or_update_rules(config.rules) + self.builder = SiftStreamBuilderPy(channel.config.get("uri"), channel.config.get("apikey")) self.rules = config.rules self.asset_name = config.asset_name self.transport_channel = channel From 311542497f22e1a8b24fad6eb736443fa12d5171 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Fri, 18 Jul 2025 16:08:57 -0700 Subject: [PATCH 04/34] feat: Added sift 
stream helpers --- .../lib/sift_py/ingestion/_internal/ingest.py | 13 +- .../lib/sift_py/ingestion/_internal/stream.py | 316 ++++++++++++++++++ 2 files changed, 324 insertions(+), 5 deletions(-) create mode 100644 python/lib/sift_py/ingestion/_internal/stream.py diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 8a23dad16..7496a4dd5 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -13,7 +13,6 @@ from sift.ingest.v1.ingest_pb2_grpc import IngestServiceStub from sift.ingestion_configs.v2.ingestion_configs_pb2 import ChannelConfig as ChannelConfigPb from sift.ingestion_configs.v2.ingestion_configs_pb2 import IngestionConfig -from sift_stream_bindings import SiftStreamBuilderPy from sift_py.grpc.transport import SiftChannel from sift_py.ingestion._internal.error import IngestionValidationError @@ -24,6 +23,7 @@ get_ingestion_config_flows, ) from sift_py.ingestion._internal.run import create_run, get_run_id_by_name +from sift_py.ingestion._internal.stream import get_builder, get_run_form, stream_requests from sift_py.ingestion.channel import ( ChannelConfig, ChannelValue, @@ -82,7 +82,7 @@ def __init__( rule.asset_names.append(config.asset_name) self.rule_service.create_or_update_rules(config.rules) - self.builder = SiftStreamBuilderPy(channel.config.get("uri"), channel.config.get("apikey")) + self.builder = get_builder(channel) self.rules = config.rules self.asset_name = config.asset_name self.transport_channel = channel @@ -96,7 +96,7 @@ def ingest(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion. """ - self.ingest_service_stub.IngestWithConfigDataStream(iter(requests)) + stream_requests(self.builder, requests, self.run_id) def ingest_flows(self, *flows: FlowOrderedChannelValues): """ @@ -112,7 +112,7 @@ def ingest_flows(self, *flows: FlowOrderedChannelValues): req = self.create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - self.ingest_service_stub.IngestWithConfigDataStream(iter(requests)) + stream_requests(self.builder, requests, self.run_id) def try_ingest_flows(self, *flows: Flow): """ @@ -128,7 +128,7 @@ def try_ingest_flows(self, *flows: Flow): req = self.try_create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - self.ingest_service_stub.IngestWithConfigDataStream(iter(requests)) + stream_requests(self.builder, requests, self.run_id) def attach_run( self, @@ -161,12 +161,15 @@ def attach_run( metadata=metadata, ) + self.builder.run = get_run_form(run_name, description or "", tags or []) + def detach_run(self): """ Detach run from this period of ingestion. Subsequent data ingested won't be associated with the run being detached. 
""" self.run_id = None + self.builder.run = None def try_create_ingestion_request_ordered_values( self, diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py new file mode 100644 index 000000000..e507c8150 --- /dev/null +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -0,0 +1,316 @@ +from typing import List +from urllib.parse import urlparse, urlunparse + +from sift_stream_bindings import ( + ChannelBitFieldElementPy, + ChannelConfigPy, + ChannelDataTypePy, + ChannelEnumTypePy, + ChannelValuePy, + FlowConfigPy, + IngestionConfigFormPy, + IngestWithConfigDataStreamRequestPy, + RunFormPy, + TimeValuePy, +) + +from sift_py.ingestion.channel import SiftChannel +from sift_py.ingestion.config.telemetry import TelemetryConfig +from sift_py.ingestion.stream.builder import SiftStreamBuilderPy + +""" +TODO: + - helper to fetch ingestion config id via client key + - stream helper: build, send, finish + - helper to convert FlowOrderedChannelValues to IngestWithConfigDataStreamRequestPy + - create_ingestion_request helper, IngestWithConfigDataStreamRequestPy? + - helper to convert List of IngestWithConfigDataChannelValue to IngestWithConfigDataStreamRequestPy? +""" + + +def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> SiftStreamBuilderPy: + """ + Get a builder for a stream. + + Args: + channel: The channel to get a builder for + ingestion_config: The ingestion config to use for the builder + + Returns: + SiftStreamBuilderPy: The builder for the channel + """ + uri = channel.config.get("uri") + apikey = channel.config.get("apikey") + + if not uri or not apikey: + raise ValueError(f"Channel config is missing uri or apikey: {channel.config}") + + parsed = urlparse(uri) + # If no scheme is provided, default to https + if not parsed.scheme: + # Reconstruct URL with https scheme + parsed = parsed._replace(scheme="https") + uri = urlunparse(parsed) + # If scheme is http, upgrade to https for security + elif parsed.scheme == "http": + parsed = parsed._replace(scheme="https") + uri = urlunparse(parsed) + + builder = SiftStreamBuilderPy(uri, apikey) + builder.ingestion_config = telemetry_config_to_ingestion_config_py(ingestion_config) + builder.enable_tls = channel.config.get("use_ssl", True) + return builder + + +def stream_requests( + builder: SiftStreamBuilderPy, + requests: List, + run_id: str = "", +) -> None: + """ + Stream requests using the stream bindings. + + Args: + channel: The SiftChannel to use for streaming + ingestion_config: The TelemetryConfig for the ingestion + requests: List of IngestWithConfigDataStreamRequest protobuf objects + run_id: Optional run ID to associate with the requests + """ + # Convert protobuf requests to Python binding requests + py_requests = [ingest_request_to_ingest_request_py(request, run_id) for request in requests] + + # Create stream and send requests + sift_stream = builder.build() + sift_stream.send_requests(iter(py_requests)) + sift_stream.finish() + + +def telemetry_config_to_ingestion_config_py( + telemetry_config: TelemetryConfig, +) -> IngestionConfigFormPy: + """ + Convert a TelemetryConfig to an IngestionConfigFormPy. 
+ + Args: + telemetry_config: The TelemetryConfig to convert + + Returns: + IngestionConfigFormPy: The converted ingestion config + """ + # Convert flows + flow_configs_py = [] + + for flow_config in telemetry_config.flows: + # Convert channels in this flow + channel_configs_py = [] + + for channel_config in flow_config.channels: + # Convert enum types + enum_types_py = [] + for enum_type in channel_config.enum_types: + enum_types_py.append( + ChannelEnumTypePy( + name=enum_type.name, + key=enum_type.key, + ) + ) + + # Convert bit field elements + bit_field_elements_py = [] + for bit_field_element in channel_config.bit_field_elements: + bit_field_elements_py.append( + ChannelBitFieldElementPy( + name=bit_field_element.name, + index=bit_field_element.index, + bit_count=bit_field_element.bit_count, + ) + ) + + # Convert data type + data_type_py = convert_channel_data_type(channel_config.data_type) + + # Create channel config + channel_config_py = ChannelConfigPy( + name=channel_config.name, + data_type=data_type_py, + unit=channel_config.unit or "", + description=channel_config.description or "", + enum_types=enum_types_py, + bit_field_elements=bit_field_elements_py, + ) + + channel_configs_py.append(channel_config_py) + + # Create flow config + flow_config_py = FlowConfigPy( + name=flow_config.name, + channels=channel_configs_py, + ) + + flow_configs_py.append(flow_config_py) + + # Create ingestion config + ingestion_config_py = IngestionConfigFormPy( + asset_name=telemetry_config.asset_name, + client_key=telemetry_config.ingestion_client_key, + flows=flow_configs_py, + ) + + return ingestion_config_py + + +def convert_channel_data_type(data_type) -> ChannelDataTypePy: + """ + Convert a ChannelDataType to ChannelDataTypePy. + + Args: + data_type: The ChannelDataType to convert + + Returns: + ChannelDataTypePy: The converted data type + """ + # Import here to avoid circular imports + from sift_py.ingestion.channel import ChannelDataType + + if data_type == ChannelDataType.DOUBLE: + return ChannelDataTypePy.Double + elif data_type == ChannelDataType.STRING: + return ChannelDataTypePy.String + elif data_type == ChannelDataType.ENUM: + return ChannelDataTypePy.Enum + elif data_type == ChannelDataType.BIT_FIELD: + return ChannelDataTypePy.BitField + elif data_type == ChannelDataType.BOOL: + return ChannelDataTypePy.Bool + elif data_type == ChannelDataType.FLOAT: + return ChannelDataTypePy.Float + elif data_type == ChannelDataType.INT_32: + return ChannelDataTypePy.Int32 + elif data_type == ChannelDataType.UINT_32: + return ChannelDataTypePy.Uint32 + elif data_type == ChannelDataType.INT_64: + return ChannelDataTypePy.Int64 + elif data_type == ChannelDataType.UINT_64: + return ChannelDataTypePy.Uint64 + elif data_type == ChannelDataType.BYTES: + return ChannelDataTypePy.Bytes + else: + return ChannelDataTypePy.Unspecified + + +def get_run_form(run_name: str, run_description: str, run_tags: List[str]) -> RunFormPy: + """ + Get a run form. + + Args: + run_name: The name of the run + run_description: The description of the run + run_tags: The tags of the run + + Returns: + RunFormPy: The run form + """ + return RunFormPy( + name=run_name, + description=run_description, + client_key=run_name, + tags=run_tags, + ) + + +def ingest_request_to_ingest_request_py( + request, + run_id, +) -> IngestWithConfigDataStreamRequestPy: + """ + Convert an IngestWithConfigDataStreamRequest to IngestWithConfigDataStreamRequestPy. 
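+
+    A small sketch (assumes `request` is a populated protobuf request and
+    `run_id` comes from the attached run):
+
+        request_py = ingest_request_to_ingest_request_py(request, run_id)
+        # timestamp -> TimeValuePy.from_timestamp(seconds, nanos);
+        # channel values are converted one-to-one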
+ + Args: + request: The IngestWithConfigDataStreamRequest to convert + run_id: The run ID to use + + Returns: + IngestWithConfigDataStreamRequestPy: The converted request + """ + timestamp_py = None + if request.HasField("timestamp"): + timestamp_py = TimeValuePy.from_timestamp( + request.timestamp.seconds, request.timestamp.nanos + ) + + channel_values_py = [ + convert_channel_value_to_channel_value_py(channel_value) + for channel_value in request.channel_values + ] + + return IngestWithConfigDataStreamRequestPy( + ingestion_config_id=request.ingestion_config_id, + flow=request.flow, + timestamp=timestamp_py, + channel_values=channel_values_py, + run_id=run_id, + end_stream_on_validation_error=request.end_stream_on_validation_error, + organization_id=request.organization_id, + ) + + +def convert_channel_value_to_channel_value_py(channel_value) -> ChannelValuePy: + """ + Convert an IngestWithConfigDataChannelValue to ChannelValuePy. + + Args: + channel_value: The IngestWithConfigDataChannelValue to convert + + Returns: + ChannelValuePy: The converted channel value + """ + # Import here to avoid circular imports + from sift.ingest.v1.ingest_pb2 import IngestWithConfigDataChannelValue + + if not isinstance(channel_value, IngestWithConfigDataChannelValue): + raise ValueError(f"Expected IngestWithConfigDataChannelValue, got {type(channel_value)}") + + # Extract the value from the oneof field + # Note: We need a channel name, but the protobuf doesn't contain it + # This is a limitation - we'll use a placeholder name + channel_name = "unknown_channel" # This is a limitation of the conversion + + if channel_value.HasField("string"): + return ChannelValuePy.string(channel_name, channel_value.string) + elif channel_value.HasField("double"): + return ChannelValuePy.double(channel_name, channel_value.double) + elif channel_value.HasField("float"): + return ChannelValuePy.float(channel_name, channel_value.float) + elif channel_value.HasField("bool"): + return ChannelValuePy.bool(channel_name, channel_value.bool) + elif channel_value.HasField("int32"): + return ChannelValuePy.int32(channel_name, channel_value.int32) + elif channel_value.HasField("uint32"): + return ChannelValuePy.uint32(channel_name, channel_value.uint32) + elif channel_value.HasField("int64"): + return ChannelValuePy.int64(channel_name, channel_value.int64) + elif channel_value.HasField("uint64"): + return ChannelValuePy.uint64(channel_name, channel_value.uint64) + elif channel_value.HasField("enum"): + # For enum values, we need to create a ChannelEnumTypePy + enum_type = ChannelEnumTypePy(name=f"enum_{channel_value.enum}", key=channel_value.enum) + return ChannelValuePy.enum_value(channel_name, enum_type) + elif channel_value.HasField("bit_field"): + # For bit field values, we need to create ChannelBitFieldElementPy list + # This is a simplified conversion - in practice you'd need the actual bit field definition + bit_field_elements = [] + for i, byte in enumerate(channel_value.bit_field): + if byte != 0: + bit_field_elements.append( + ChannelBitFieldElementPy(name=f"bit_{i}", index=i, bit_count=1) + ) + return ChannelValuePy.bitfield(channel_name, bit_field_elements) + elif channel_value.HasField("bytes"): + # For bytes values, we'll convert to a string representation + return ChannelValuePy.string(channel_name, str(channel_value.bytes)) + elif channel_value.HasField("empty"): + # For empty values, we'll return a default value + return ChannelValuePy.string(channel_name, "") + else: + # No field set, return empty string + return 
ChannelValuePy.string(channel_name, "") From 9230bdda5b8fe78e9732dbb476ee6f132e5e26cd Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Fri, 18 Jul 2025 16:52:00 -0700 Subject: [PATCH 05/34] fix: Bug fixes & async --- .../lib/sift_py/ingestion/_internal/ingest.py | 2 +- .../lib/sift_py/ingestion/_internal/stream.py | 53 +++++++++++-------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 7496a4dd5..7f16d4998 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -82,7 +82,7 @@ def __init__( rule.asset_names.append(config.asset_name) self.rule_service.create_or_update_rules(config.rules) - self.builder = get_builder(channel) + self.builder = get_builder(channel, config) self.rules = config.rules self.asset_name = config.asset_name self.transport_channel = channel diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index e507c8150..64e010f75 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,5 +1,5 @@ +import asyncio from typing import List -from urllib.parse import urlparse, urlunparse from sift_stream_bindings import ( ChannelBitFieldElementPy, @@ -11,12 +11,12 @@ IngestionConfigFormPy, IngestWithConfigDataStreamRequestPy, RunFormPy, + SiftStreamBuilderPy, TimeValuePy, ) -from sift_py.ingestion.channel import SiftChannel +from sift_py.grpc.transport import SiftChannel from sift_py.ingestion.config.telemetry import TelemetryConfig -from sift_py.ingestion.stream.builder import SiftStreamBuilderPy """ TODO: @@ -45,16 +45,9 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift if not uri or not apikey: raise ValueError(f"Channel config is missing uri or apikey: {channel.config}") - parsed = urlparse(uri) - # If no scheme is provided, default to https - if not parsed.scheme: - # Reconstruct URL with https scheme - parsed = parsed._replace(scheme="https") - uri = urlunparse(parsed) - # If scheme is http, upgrade to https for security - elif parsed.scheme == "http": - parsed = parsed._replace(scheme="https") - uri = urlunparse(parsed) + if not uri.startswith("https://"): + uri = f"https://{uri}" + print(f"Using URI: {uri}") builder = SiftStreamBuilderPy(uri, apikey) builder.ingestion_config = telemetry_config_to_ingestion_config_py(ingestion_config) @@ -62,17 +55,16 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift return builder -def stream_requests( +async def stream_requests_async( builder: SiftStreamBuilderPy, requests: List, run_id: str = "", ) -> None: """ - Stream requests using the stream bindings. + Stream requests using the stream bindings asynchronously. 
Args: - channel: The SiftChannel to use for streaming - ingestion_config: The TelemetryConfig for the ingestion + builder: The SiftStreamBuilderPy to use for streaming requests: List of IngestWithConfigDataStreamRequest protobuf objects run_id: Optional run ID to associate with the requests """ @@ -80,9 +72,26 @@ def stream_requests( py_requests = [ingest_request_to_ingest_request_py(request, run_id) for request in requests] # Create stream and send requests - sift_stream = builder.build() - sift_stream.send_requests(iter(py_requests)) - sift_stream.finish() + sift_stream = await builder.build() + sift_stream = await sift_stream.send_requests(py_requests) + await sift_stream.finish() + + +def stream_requests( + builder: SiftStreamBuilderPy, + requests: List, + run_id: str = "", +) -> None: + """ + Stream requests using the stream bindings synchronously. + + Args: + builder: The SiftStreamBuilderPy to use for streaming + requests: List of IngestWithConfigDataStreamRequest protobuf objects + run_id: Optional run ID to associate with the requests + """ + # Run the async function in a new event loop + asyncio.run(stream_requests_async(builder, requests, run_id)) def telemetry_config_to_ingestion_config_py( @@ -220,7 +229,7 @@ def get_run_form(run_name: str, run_description: str, run_tags: List[str]) -> Ru def ingest_request_to_ingest_request_py( request, - run_id, + run_id: str = "", ) -> IngestWithConfigDataStreamRequestPy: """ Convert an IngestWithConfigDataStreamRequest to IngestWithConfigDataStreamRequestPy. @@ -248,7 +257,7 @@ def ingest_request_to_ingest_request_py( flow=request.flow, timestamp=timestamp_py, channel_values=channel_values_py, - run_id=run_id, + run_id=run_id or "", end_stream_on_validation_error=request.end_stream_on_validation_error, organization_id=request.organization_id, ) From 000733f3985a75443d32af2e76ee65c68e1dcc5c Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Mon, 21 Jul 2025 13:07:09 -0700 Subject: [PATCH 06/34] feat: Pipe client-key for runs to attach_run methods --- .../lib/sift_py/ingestion/_internal/ingest.py | 4 +- python/lib/sift_py/ingestion/_internal/run.py | 2 + .../lib/sift_py/ingestion/_internal/stream.py | 54 ++++++++++++++++++- python/lib/sift_py/ingestion/service.py | 3 +- 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 7f16d4998..79af2f215 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -134,6 +134,7 @@ def attach_run( self, channel: SiftChannel, run_name: str, + client_key: Optional[str] = None, description: Optional[str] = None, organization_id: Optional[str] = None, tags: Optional[List[str]] = None, @@ -155,13 +156,14 @@ def attach_run( self.run_id = create_run( channel=channel, run_name=run_name, + run_client_key=client_key or "", description=description or "", organization_id=organization_id or "", tags=tags or [], metadata=metadata, ) - self.builder.run = get_run_form(run_name, description or "", tags or []) + self.builder.run = get_run_form(run_name, description or "", client_key or "", tags or []) def detach_run(self): """ diff --git a/python/lib/sift_py/ingestion/_internal/run.py b/python/lib/sift_py/ingestion/_internal/run.py index 4ea6efb80..604f83ba4 100644 --- a/python/lib/sift_py/ingestion/_internal/run.py +++ b/python/lib/sift_py/ingestion/_internal/run.py @@ -32,6 +32,7 @@ def get_run_id_by_name( def create_run( channel: SiftChannel, run_name: 
str, + run_client_key: str, description: str, organization_id: str, tags: List[str], @@ -43,6 +44,7 @@ def create_run( req = CreateRunRequest( name=run_name, + client_key=run_client_key, description=description, organization_id=organization_id, tags=tags, diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 64e010f75..16f9f2f6a 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,4 +1,5 @@ import asyncio +import re from typing import List from sift_stream_bindings import ( @@ -28,6 +29,48 @@ """ +def _sanitize_client_key(client_key: str) -> str: + """ + Validate and sanitize a client key to meet Sift constraints. + + Client key must be 3-128 characters, start and end with alphanumeric, + and contain only [a-zA-Z0-9_~.-] + + Args: + client_key: The client key to validate + + Returns: + str: A valid client key + + Raises: + ValueError: If the client key cannot be made valid + """ + # TODO: Test + if not client_key: + raise ValueError("Client key cannot be empty") + + # Remove any characters that don't match the allowed pattern + sanitized = re.sub(r"[^a-zA-Z0-9_~.-]", "_", client_key) + + # Ensure it starts with alphanumeric + if sanitized and not sanitized[0].isalnum(): + sanitized = "a" + sanitized + + # Ensure it ends with alphanumeric + if sanitized and not sanitized[-1].isalnum(): + sanitized = sanitized + "0" + + # Check length constraints + if len(sanitized) < 3: + # Pad with alphanumeric characters to meet minimum length + sanitized = sanitized + "00"[: 3 - len(sanitized)] + elif len(sanitized) > 128: + # Truncate to 128 characters, ensuring it ends with alphanumeric + sanitized = sanitized[:126] + "0" + + return sanitized + + def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> SiftStreamBuilderPy: """ Get a builder for a stream. @@ -207,22 +250,29 @@ def convert_channel_data_type(data_type) -> ChannelDataTypePy: return ChannelDataTypePy.Unspecified -def get_run_form(run_name: str, run_description: str, run_tags: List[str]) -> RunFormPy: +def get_run_form( + run_name: str, run_description: str, client_key: str, run_tags: List[str] +) -> RunFormPy: """ Get a run form. Args: run_name: The name of the run run_description: The description of the run + client_key: The client key to use (if empty, run_name will be used and validated) run_tags: The tags of the run Returns: RunFormPy: The run form """ + # Use provided client_key or sanitize run_name as fallback + if not client_key: + client_key = _sanitize_client_key(run_name) + return RunFormPy( name=run_name, description=run_description, - client_key=run_name, + client_key=client_key, tags=run_tags, ) diff --git a/python/lib/sift_py/ingestion/service.py b/python/lib/sift_py/ingestion/service.py index 773dc07f9..2a9b9ddb9 100644 --- a/python/lib/sift_py/ingestion/service.py +++ b/python/lib/sift_py/ingestion/service.py @@ -63,6 +63,7 @@ def attach_run( self, channel: SiftChannel, run_name: str, + client_key: Optional[str] = None, description: Optional[str] = None, organization_id: Optional[str] = None, tags: Optional[List[str]] = None, @@ -75,7 +76,7 @@ def attach_run( Include `force_new=True` to force the creation of a new run, which will allow creation of a new run using an existing name. 
""" super().attach_run( - channel, run_name, description, organization_id, tags, metadata, force_new + channel, run_name, client_key, description, organization_id, tags, metadata, force_new ) def detach_run(self): From 5540cd61360b9a9be953b5a4f5b39dd0a81856d3 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Tue, 22 Jul 2025 15:45:20 -0700 Subject: [PATCH 07/34] tmp: experiments --- .../lib/sift_py/ingestion/_internal/ingest.py | 18 ++++- .../lib/sift_py/ingestion/_internal/stream.py | 71 ++++++++++++------- python/lib/sift_py/ingestion/buffer.py | 2 +- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 79af2f215..2ad30e3f6 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -96,7 +96,23 @@ def ingest(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion. """ - stream_requests(self.builder, requests, self.run_id) + stream_requests(self.builder, *requests, self.run_id) + + def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): + """ + Perform data ingestion asynchronously in a background thread. + This allows multiple ingest calls to run in parallel. + """ + import threading + + thread = threading.Thread( + target=stream_requests, + args=(self.builder, *requests), + kwargs={"run_id": self.run_id or ""}, + daemon=True, + ) + thread.start() + return thread def ingest_flows(self, *flows: FlowOrderedChannelValues): """ diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 16f9f2f6a..ae517240c 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,7 +1,9 @@ import asyncio import re +from queue import Queue from typing import List +from sift.ingest.v1.ingest_pb2 import IngestWithConfigDataStreamRequest from sift_stream_bindings import ( ChannelBitFieldElementPy, ChannelConfigPy, @@ -22,10 +24,6 @@ """ TODO: - helper to fetch ingestion config id via client key - - stream helper: build, send, finish - - helper to convert FlowOrderedChannelValues to IngestWithConfigDataStreamRequestPy - - create_ingestion_request helper, IngestWithConfigDataStreamRequestPy? - - helper to convert List of IngestWithConfigDataChannelValue to IngestWithConfigDataStreamRequestPy? """ @@ -89,7 +87,7 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift raise ValueError(f"Channel config is missing uri or apikey: {channel.config}") if not uri.startswith("https://"): - uri = f"https://{uri}" + uri = f"http://{uri}" print(f"Using URI: {uri}") builder = SiftStreamBuilderPy(uri, apikey) @@ -99,42 +97,58 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift async def stream_requests_async( - builder: SiftStreamBuilderPy, - requests: List, - run_id: str = "", -) -> None: - """ - Stream requests using the stream bindings asynchronously. 
- - Args: - builder: The SiftStreamBuilderPy to use for streaming - requests: List of IngestWithConfigDataStreamRequest protobuf objects - run_id: Optional run ID to associate with the requests - """ - # Convert protobuf requests to Python binding requests - py_requests = [ingest_request_to_ingest_request_py(request, run_id) for request in requests] - - # Create stream and send requests - sift_stream = await builder.build() - sift_stream = await sift_stream.send_requests(py_requests) - await sift_stream.finish() + builder: SiftStreamBuilderPy, run_id: str, *requests: IngestWithConfigDataStreamRequest +): + async def ingestion_thread(): + # Create stream and send requests + sift_stream = await builder.build() + try: + while not data_queue.empty(): + item = data_queue.get() + sift_stream = await sift_stream.send_requests(item) + await sift_stream.finish() + except Exception as e: + # Ensure stream is finished even if there's an error + try: + await sift_stream.finish() + except: + pass + raise e + + # Create a dedicated queue for this batch of requests + data_queue = Queue() + + # Put each request individually into the queue, filtering out None values + processed_requests = [] + for request in requests: + processed_request = ingest_request_to_ingest_request_py(request, run_id) + if processed_request is not None: + processed_requests.append(processed_request) + data_queue.put(processed_requests) + + print(f"Processing {len(requests)} requests in queue") + + # Process this batch + await ingestion_thread() def stream_requests( builder: SiftStreamBuilderPy, - requests: List, + *requests: IngestWithConfigDataStreamRequest, run_id: str = "", ) -> None: """ Stream requests using the stream bindings synchronously. + Each call to this function creates its own queue and stream, allowing multiple + batches to be processed concurrently when called from different threads. 
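+
+    A hypothetical caller-managed threading sketch (assumes `import threading`
+    on the caller's side; mirrors `ingest_async` in ingest.py above):
+
+        t = threading.Thread(
+            target=stream_requests,
+            args=(builder, *reqs),
+            kwargs={"run_id": run_id},
+            daemon=True,
+        )
+        t.start()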
Args: builder: The SiftStreamBuilderPy to use for streaming requests: List of IngestWithConfigDataStreamRequest protobuf objects run_id: Optional run ID to associate with the requests """ - # Run the async function in a new event loop - asyncio.run(stream_requests_async(builder, requests, run_id)) + print(f"Starting stream requests for {len(requests)} requests") + asyncio.run(stream_requests_async(builder, run_id, *requests)) def telemetry_config_to_ingestion_config_py( @@ -291,6 +305,9 @@ def ingest_request_to_ingest_request_py( Returns: IngestWithConfigDataStreamRequestPy: The converted request """ + if request is None: + return None + timestamp_py = None if request.HasField("timestamp"): timestamp_py = TimeValuePy.from_timestamp( diff --git a/python/lib/sift_py/ingestion/buffer.py b/python/lib/sift_py/ingestion/buffer.py index 193d5aec5..1ca76bc9d 100644 --- a/python/lib/sift_py/ingestion/buffer.py +++ b/python/lib/sift_py/ingestion/buffer.py @@ -161,7 +161,7 @@ def flush(self): def _flush(self): if len(self._buffer) > 0: - self._ingestion_service.ingest(*self._buffer) + self._ingestion_service.ingest_async(*self._buffer) self._buffer.clear() def _start_flush_timer(self): From bc9969cd964d9a369804bcd0f2e6faebb663d194 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Tue, 22 Jul 2025 17:51:29 -0700 Subject: [PATCH 08/34] placeholder for messy testing --- .../lib/sift_py/ingestion/_internal/stream.py | 23 +++++++++ python/lib/sift_py/ingestion/buffer.py | 4 ++ python/lib/sift_py/ingestion/service.py | 49 +++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index ae517240c..8e5ef4aa9 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -151,6 +151,29 @@ def stream_requests( asyncio.run(stream_requests_async(builder, run_id, *requests)) +def stream_requests_test( + builder: SiftStreamBuilderPy, + *requests: IngestWithConfigDataStreamRequest, + run_id: str = "", +) -> None: + """ + Stream requests using the stream bindings synchronously. 
+ """ + + async def ingestion_test(): + sift_stream = await builder.build() + for request in processed_requests: + sift_stream = await sift_stream.send_requests(request) + await sift_stream.finish() + + print(f"Starting stream requests test for {len(requests)} requests") + processed_requests = [] + for request in requests: + processed_request = ingest_request_to_ingest_request_py(request, run_id) + if processed_request is not None: + processed_requests.append(processed_request) + + def telemetry_config_to_ingestion_config_py( telemetry_config: TelemetryConfig, ) -> IngestionConfigFormPy: diff --git a/python/lib/sift_py/ingestion/buffer.py b/python/lib/sift_py/ingestion/buffer.py index 1ca76bc9d..ee55a9f9d 100644 --- a/python/lib/sift_py/ingestion/buffer.py +++ b/python/lib/sift_py/ingestion/buffer.py @@ -71,9 +71,13 @@ def __exit__( else: self.flush() + # Wait for async ingestion threads to complete before re-raising + self._ingestion_service.wait_for_async_ingestion(timeout=30.0) raise exc_val else: self.flush() + # Wait for async ingestion threads to complete before exiting + self._ingestion_service.wait_for_async_ingestion(timeout=30.0) return True diff --git a/python/lib/sift_py/ingestion/service.py b/python/lib/sift_py/ingestion/service.py index 2a9b9ddb9..044add347 100644 --- a/python/lib/sift_py/ingestion/service.py +++ b/python/lib/sift_py/ingestion/service.py @@ -248,3 +248,52 @@ def try_create_flows(self, *flow_configs: FlowConfig): See `try_create_flows`. """ super().try_create_flow(*flow_configs) + + def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: + """ + Wait for all async ingestion threads to complete. + + This method is useful for ensuring all data has been sent before shutting down + or when you need to guarantee that all async ingestion operations have finished. + + Args: + timeout: Maximum time to wait in seconds. If None, wait indefinitely. + + Returns: + bool: True if all threads completed within timeout, False otherwise. + + Example: + ```python + # Start some async ingestion + ingestion_service.ingest_async(request1, request2) + ingestion_service.ingest_async(request3, request4) + + # Wait for all to complete (with 30 second timeout) + if not ingestion_service.wait_for_async_ingestion(timeout=30.0): + print("Some ingestion threads did not complete in time") + ``` + """ + return super().wait_for_async_ingestion(timeout) + + def get_async_thread_count(self) -> int: + """ + Get the number of currently running async ingestion threads. + + This method is useful for monitoring the state of async ingestion operations + and can help with debugging or understanding the current load. + + Returns: + int: Number of active async threads. 
+ + Example: + ```python + # Start some async ingestion + ingestion_service.ingest_async(request1, request2) + ingestion_service.ingest_async(request3, request4) + + # Check how many threads are still running + active_threads = ingestion_service.get_async_thread_count() + print(f"Currently {active_threads} async ingestion threads running") + ``` + """ + return super().get_async_thread_count() From d81e636babae2e5b8db4869806b5b967e16a59a3 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Wed, 23 Jul 2025 13:40:24 -0700 Subject: [PATCH 09/34] placeholder --- .../lib/sift_py/ingestion/_internal/ingest.py | 85 +++++++++++++++++-- .../lib/sift_py/ingestion/_internal/stream.py | 23 ----- python/lib/sift_py/ingestion/service.py | 49 ----------- 3 files changed, 76 insertions(+), 81 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 2ad30e3f6..074362be9 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import threading +import time from collections.abc import Callable from datetime import datetime from typing import Any, Dict, List, Optional, Union, cast @@ -23,7 +25,11 @@ get_ingestion_config_flows, ) from sift_py.ingestion._internal.run import create_run, get_run_id_by_name -from sift_py.ingestion._internal.stream import get_builder, get_run_form, stream_requests +from sift_py.ingestion._internal.stream import ( + get_builder, + get_run_form, + stream_requests, +) from sift_py.ingestion.channel import ( ChannelConfig, ChannelValue, @@ -52,6 +58,8 @@ class _IngestionServiceImpl: ingest_service_stub: IngestServiceStub rule_service: RuleService + _async_threads: List[threading.Thread] + _threads_lock: threading.Lock def __init__( self, @@ -92,28 +100,87 @@ def __init__( self.ingest_service_stub = IngestServiceStub(channel) self.config = config + # Thread tracking for async ingestion + self._async_threads = [] + self._threads_lock = threading.Lock() + def ingest(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion. """ - stream_requests(self.builder, *requests, self.run_id) + self._ingest_async(*requests) - def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): + def _ingest_async(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion asynchronously in a background thread. This allows multiple ingest calls to run in parallel. """ - import threading + + def _ingest_and_cleanup(): + try: + stream_requests(self.builder, *requests, self.run_id) + finally: + # Remove this thread from tracking when it completes + with self._threads_lock: + if threading.current_thread() in self._async_threads: + self._async_threads.remove(threading.current_thread()) thread = threading.Thread( - target=stream_requests, - args=(self.builder, *requests), - kwargs={"run_id": self.run_id or ""}, + target=_ingest_and_cleanup, daemon=True, ) + + # Track the thread + with self._threads_lock: + self._async_threads.append(thread) + thread.start() return thread + def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: + """ + Wait for all async ingestion threads to complete. + + Args: + timeout: Maximum time to wait in seconds. If None, wait indefinitely. + + Returns: + bool: True if all threads completed within timeout, False otherwise. 
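+
+        A usage sketch (the 30 s timeout is an arbitrary choice):
+
+            service.ingest(request1, request2)  # kicks off a background thread
+            if not service.wait_for_async_ingestion(timeout=30.0):
+                print("some async ingestion threads did not complete in time")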
+ """ + with self._threads_lock: + threads_to_wait = self._async_threads.copy() + + if not threads_to_wait: + return True + + # Wait for each thread with the remaining timeout + start_time = time.time() if timeout is not None else None + + for thread in threads_to_wait: + if timeout is not None: + remaining_timeout = timeout - (time.time() - start_time) + if remaining_timeout <= 0: + return False + thread.join(timeout=remaining_timeout) + if thread.is_alive(): + return False + else: + thread.join() + + return True + + def get_async_thread_count(self) -> int: + """ + Get the number of currently running async ingestion threads. + + Returns: + int: Number of active async threads. + """ + with self._threads_lock: + # Clean up any completed threads + self._async_threads = [t for t in self._async_threads if t.is_alive()] + return len(self._async_threads) + def ingest_flows(self, *flows: FlowOrderedChannelValues): """ Combines the requests creation step and ingestion into a single call. @@ -128,7 +195,7 @@ def ingest_flows(self, *flows: FlowOrderedChannelValues): req = self.create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - stream_requests(self.builder, requests, self.run_id) + self._ingest_async(*requests) def try_ingest_flows(self, *flows: Flow): """ @@ -144,7 +211,7 @@ def try_ingest_flows(self, *flows: Flow): req = self.try_create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - stream_requests(self.builder, requests, self.run_id) + self._ingest_async(*requests) def attach_run( self, diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 8e5ef4aa9..ae517240c 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -151,29 +151,6 @@ def stream_requests( asyncio.run(stream_requests_async(builder, run_id, *requests)) -def stream_requests_test( - builder: SiftStreamBuilderPy, - *requests: IngestWithConfigDataStreamRequest, - run_id: str = "", -) -> None: - """ - Stream requests using the stream bindings synchronously. - """ - - async def ingestion_test(): - sift_stream = await builder.build() - for request in processed_requests: - sift_stream = await sift_stream.send_requests(request) - await sift_stream.finish() - - print(f"Starting stream requests test for {len(requests)} requests") - processed_requests = [] - for request in requests: - processed_request = ingest_request_to_ingest_request_py(request, run_id) - if processed_request is not None: - processed_requests.append(processed_request) - - def telemetry_config_to_ingestion_config_py( telemetry_config: TelemetryConfig, ) -> IngestionConfigFormPy: diff --git a/python/lib/sift_py/ingestion/service.py b/python/lib/sift_py/ingestion/service.py index 044add347..2a9b9ddb9 100644 --- a/python/lib/sift_py/ingestion/service.py +++ b/python/lib/sift_py/ingestion/service.py @@ -248,52 +248,3 @@ def try_create_flows(self, *flow_configs: FlowConfig): See `try_create_flows`. """ super().try_create_flow(*flow_configs) - - def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: - """ - Wait for all async ingestion threads to complete. - - This method is useful for ensuring all data has been sent before shutting down - or when you need to guarantee that all async ingestion operations have finished. - - Args: - timeout: Maximum time to wait in seconds. If None, wait indefinitely. 
- - Returns: - bool: True if all threads completed within timeout, False otherwise. - - Example: - ```python - # Start some async ingestion - ingestion_service.ingest_async(request1, request2) - ingestion_service.ingest_async(request3, request4) - - # Wait for all to complete (with 30 second timeout) - if not ingestion_service.wait_for_async_ingestion(timeout=30.0): - print("Some ingestion threads did not complete in time") - ``` - """ - return super().wait_for_async_ingestion(timeout) - - def get_async_thread_count(self) -> int: - """ - Get the number of currently running async ingestion threads. - - This method is useful for monitoring the state of async ingestion operations - and can help with debugging or understanding the current load. - - Returns: - int: Number of active async threads. - - Example: - ```python - # Start some async ingestion - ingestion_service.ingest_async(request1, request2) - ingestion_service.ingest_async(request3, request4) - - # Check how many threads are still running - active_threads = ingestion_service.get_async_thread_count() - print(f"Currently {active_threads} async ingestion threads running") - ``` - """ - return super().get_async_thread_count() From d8c39ce5c49a3fde753ed7f7f9921d4e5c21d335 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Thu, 24 Jul 2025 14:45:42 -0700 Subject: [PATCH 10/34] fix: Fix runs, bitfields, clean up --- .../lib/sift_py/ingestion/_internal/ingest.py | 19 ++-- python/lib/sift_py/ingestion/_internal/run.py | 23 ++-- .../lib/sift_py/ingestion/_internal/stream.py | 105 ++++-------------- python/lib/sift_py/ingestion/service.py | 4 +- 4 files changed, 49 insertions(+), 102 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 074362be9..f42bf92f8 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -108,9 +108,9 @@ def ingest(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion. """ - self._ingest_async(*requests) + self.ingest_async(*requests) - def _ingest_async(self, *requests: IngestWithConfigDataStreamRequest): + def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): """ Perform data ingestion asynchronously in a background thread. This allows multiple ingest calls to run in parallel. @@ -195,7 +195,7 @@ def ingest_flows(self, *flows: FlowOrderedChannelValues): req = self.create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - self._ingest_async(*requests) + self.ingest_async(*requests) def try_ingest_flows(self, *flows: Flow): """ @@ -211,18 +211,18 @@ def try_ingest_flows(self, *flows: Flow): req = self.try_create_ingestion_request(flow_name, timestamp, channel_values) requests.append(req) - self._ingest_async(*requests) + self.ingest_async(*requests) def attach_run( self, channel: SiftChannel, run_name: str, - client_key: Optional[str] = None, description: Optional[str] = None, organization_id: Optional[str] = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Union[str, float, bool]]] = None, force_new: bool = False, + client_key: Optional[str] = None, ): """ Retrieve an existing run or create one to use during this period of ingestion. 
@@ -239,14 +239,19 @@ def attach_run( self.run_id = create_run( channel=channel, run_name=run_name, - run_client_key=client_key or "", + run_client_key=client_key, description=description or "", organization_id=organization_id or "", tags=tags or [], metadata=metadata, ) - self.builder.run = get_run_form(run_name, description or "", client_key or "", tags or []) + self.builder.run = get_run_form( + run_name=run_name, + run_description=description or "", + client_key=client_key, + run_tags=tags, + ) def detach_run(self): """ diff --git a/python/lib/sift_py/ingestion/_internal/run.py b/python/lib/sift_py/ingestion/_internal/run.py index 604f83ba4..bf313ccd2 100644 --- a/python/lib/sift_py/ingestion/_internal/run.py +++ b/python/lib/sift_py/ingestion/_internal/run.py @@ -32,23 +32,28 @@ def get_run_id_by_name( def create_run( channel: SiftChannel, run_name: str, - run_client_key: str, description: str, organization_id: str, tags: List[str], metadata: Optional[Dict[str, Union[str, float, bool]]] = None, + run_client_key: Optional[str] = None, ) -> str: svc = RunServiceStub(channel) _metadata = metadata_dict_to_pb(metadata) if metadata else None - req = CreateRunRequest( - name=run_name, - client_key=run_client_key, - description=description, - organization_id=organization_id, - tags=tags, - metadata=_metadata, - ) + kwargs = { + "name": run_name, + "description": description, + "organization_id": organization_id, + "tags": tags, + "metadata": _metadata, + } + if run_client_key: + kwargs["client_key"] = run_client_key + + print(f"Creating run with kwargs: {kwargs}") + + req = CreateRunRequest(**kwargs) res = cast(CreateRunResponse, svc.CreateRun(req)) return res.run.run_id diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index ae517240c..a3f6ae22c 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,7 +1,6 @@ import asyncio -import re from queue import Queue -from typing import List +from typing import List, Optional from sift.ingest.v1.ingest_pb2 import IngestWithConfigDataStreamRequest from sift_stream_bindings import ( @@ -21,53 +20,6 @@ from sift_py.grpc.transport import SiftChannel from sift_py.ingestion.config.telemetry import TelemetryConfig -""" -TODO: - - helper to fetch ingestion config id via client key -""" - - -def _sanitize_client_key(client_key: str) -> str: - """ - Validate and sanitize a client key to meet Sift constraints. 
- - Client key must be 3-128 characters, start and end with alphanumeric, - and contain only [a-zA-Z0-9_~.-] - - Args: - client_key: The client key to validate - - Returns: - str: A valid client key - - Raises: - ValueError: If the client key cannot be made valid - """ - # TODO: Test - if not client_key: - raise ValueError("Client key cannot be empty") - - # Remove any characters that don't match the allowed pattern - sanitized = re.sub(r"[^a-zA-Z0-9_~.-]", "_", client_key) - - # Ensure it starts with alphanumeric - if sanitized and not sanitized[0].isalnum(): - sanitized = "a" + sanitized - - # Ensure it ends with alphanumeric - if sanitized and not sanitized[-1].isalnum(): - sanitized = sanitized + "0" - - # Check length constraints - if len(sanitized) < 3: - # Pad with alphanumeric characters to meet minimum length - sanitized = sanitized + "00"[: 3 - len(sanitized)] - elif len(sanitized) > 128: - # Truncate to 128 characters, ensuring it ends with alphanumeric - sanitized = sanitized[:126] + "0" - - return sanitized - def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> SiftStreamBuilderPy: """ @@ -86,9 +38,11 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift if not uri or not apikey: raise ValueError(f"Channel config is missing uri or apikey: {channel.config}") - if not uri.startswith("https://"): - uri = f"http://{uri}" - print(f"Using URI: {uri}") + if not uri.startswith("http"): + if "localhost" in uri: + uri = f"http://{uri}" + else: + uri = f"https://{uri}" builder = SiftStreamBuilderPy(uri, apikey) builder.ingestion_config = telemetry_config_to_ingestion_config_py(ingestion_config) @@ -265,7 +219,7 @@ def convert_channel_data_type(data_type) -> ChannelDataTypePy: def get_run_form( - run_name: str, run_description: str, client_key: str, run_tags: List[str] + run_name: str, run_description: str, client_key: Optional[str] = None, run_tags: List[str] = [] ) -> RunFormPy: """ Get a run form. 
@@ -279,14 +233,10 @@ def get_run_form( Returns: RunFormPy: The run form """ - # Use provided client_key or sanitize run_name as fallback - if not client_key: - client_key = _sanitize_client_key(run_name) - return RunFormPy( name=run_name, description=run_description, - client_key=client_key, + client_key=client_key or "", tags=run_tags, ) @@ -346,47 +296,34 @@ def convert_channel_value_to_channel_value_py(channel_value) -> ChannelValuePy: if not isinstance(channel_value, IngestWithConfigDataChannelValue): raise ValueError(f"Expected IngestWithConfigDataChannelValue, got {type(channel_value)}") - # Extract the value from the oneof field - # Note: We need a channel name, but the protobuf doesn't contain it - # This is a limitation - we'll use a placeholder name - channel_name = "unknown_channel" # This is a limitation of the conversion - if channel_value.HasField("string"): - return ChannelValuePy.string(channel_name, channel_value.string) + return ChannelValuePy.string("", channel_value.string) elif channel_value.HasField("double"): - return ChannelValuePy.double(channel_name, channel_value.double) + return ChannelValuePy.double("", channel_value.double) elif channel_value.HasField("float"): - return ChannelValuePy.float(channel_name, channel_value.float) + return ChannelValuePy.float("", channel_value.float) elif channel_value.HasField("bool"): - return ChannelValuePy.bool(channel_name, channel_value.bool) + return ChannelValuePy.bool("", channel_value.bool) elif channel_value.HasField("int32"): - return ChannelValuePy.int32(channel_name, channel_value.int32) + return ChannelValuePy.int32("", channel_value.int32) elif channel_value.HasField("uint32"): - return ChannelValuePy.uint32(channel_name, channel_value.uint32) + return ChannelValuePy.uint32("", channel_value.uint32) elif channel_value.HasField("int64"): - return ChannelValuePy.int64(channel_name, channel_value.int64) + return ChannelValuePy.int64("", channel_value.int64) elif channel_value.HasField("uint64"): - return ChannelValuePy.uint64(channel_name, channel_value.uint64) + return ChannelValuePy.uint64("", channel_value.uint64) elif channel_value.HasField("enum"): # For enum values, we need to create a ChannelEnumTypePy enum_type = ChannelEnumTypePy(name=f"enum_{channel_value.enum}", key=channel_value.enum) - return ChannelValuePy.enum_value(channel_name, enum_type) + return ChannelValuePy.enum_value("", enum_type) elif channel_value.HasField("bit_field"): - # For bit field values, we need to create ChannelBitFieldElementPy list - # This is a simplified conversion - in practice you'd need the actual bit field definition - bit_field_elements = [] - for i, byte in enumerate(channel_value.bit_field): - if byte != 0: - bit_field_elements.append( - ChannelBitFieldElementPy(name=f"bit_{i}", index=i, bit_count=1) - ) - return ChannelValuePy.bitfield(channel_name, bit_field_elements) + return ChannelValuePy.bitfield("", channel_value.bitfield) elif channel_value.HasField("bytes"): # For bytes values, we'll convert to a string representation - return ChannelValuePy.string(channel_name, str(channel_value.bytes)) + return ChannelValuePy.string("", str(channel_value.bytes)) elif channel_value.HasField("empty"): # For empty values, we'll return a default value - return ChannelValuePy.string(channel_name, "") + return ChannelValuePy.string("", "") else: # No field set, return empty string - return ChannelValuePy.string(channel_name, "") + return ChannelValuePy.string("", "") diff --git a/python/lib/sift_py/ingestion/service.py 
b/python/lib/sift_py/ingestion/service.py index 2a9b9ddb9..6eb82fd7e 100644 --- a/python/lib/sift_py/ingestion/service.py +++ b/python/lib/sift_py/ingestion/service.py @@ -63,12 +63,12 @@ def attach_run( self, channel: SiftChannel, run_name: str, - client_key: Optional[str] = None, description: Optional[str] = None, organization_id: Optional[str] = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Union[str, float, bool]]] = None, force_new: bool = False, + client_key: Optional[str] = None, ): """ Retrieve an existing run or create one to use during this period of ingestion. @@ -76,7 +76,7 @@ def attach_run( Include `force_new=True` to force the creation of a new run, which will allow creation of a new run using an existing name. """ super().attach_run( - channel, run_name, client_key, description, organization_id, tags, metadata, force_new + channel, run_name, description, organization_id, tags, metadata, force_new, client_key ) def detach_run(self): From 9fa13507a94abd652ac6c26aa75dcd4407b718c8 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Fri, 25 Jul 2025 14:38:33 -0700 Subject: [PATCH 11/34] just debugging --- .../lib/sift_py/ingestion/_internal/stream.py | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index a3f6ae22c..e6ab4d6cb 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,4 +1,5 @@ import asyncio +import random from queue import Queue from typing import List, Optional @@ -8,9 +9,9 @@ ChannelConfigPy, ChannelDataTypePy, ChannelEnumTypePy, - ChannelValuePy, FlowConfigPy, IngestionConfigFormPy, + IngestWithConfigDataChannelValuePy, IngestWithConfigDataStreamRequestPy, RunFormPy, SiftStreamBuilderPy, @@ -75,6 +76,9 @@ async def ingestion_thread(): # Put each request individually into the queue, filtering out None values processed_requests = [] for request in requests: + if not isinstance(request, IngestWithConfigDataStreamRequest): + print(f"Skipping request: {request} of type {type(request)}") + continue processed_request = ingest_request_to_ingest_request_py(request, run_id) if processed_request is not None: processed_requests.append(processed_request) @@ -236,7 +240,7 @@ def get_run_form( return RunFormPy( name=run_name, description=run_description, - client_key=client_key or "", + client_key=client_key or f"random_key_{str(random.randint(1000, 9999))}", tags=run_tags, ) @@ -255,6 +259,8 @@ def ingest_request_to_ingest_request_py( Returns: IngestWithConfigDataStreamRequestPy: The converted request """ + if isinstance(request, str): + print(f"Converting request: {request} of type {type(request)}") if request is None: return None @@ -280,50 +286,43 @@ def ingest_request_to_ingest_request_py( ) -def convert_channel_value_to_channel_value_py(channel_value) -> ChannelValuePy: +def convert_channel_value_to_channel_value_py(channel_value) -> IngestWithConfigDataChannelValuePy: """ - Convert an IngestWithConfigDataChannelValue to ChannelValuePy. + Convert an IngestWithConfigDataChannelValue to IngestWithConfigDataChannelValuePy. 
Args: channel_value: The IngestWithConfigDataChannelValue to convert Returns: - ChannelValuePy: The converted channel value + IngestWithConfigDataChannelValuePy: The converted channel value """ - # Import here to avoid circular imports - from sift.ingest.v1.ingest_pb2 import IngestWithConfigDataChannelValue - - if not isinstance(channel_value, IngestWithConfigDataChannelValue): - raise ValueError(f"Expected IngestWithConfigDataChannelValue, got {type(channel_value)}") - if channel_value.HasField("string"): - return ChannelValuePy.string("", channel_value.string) + return IngestWithConfigDataChannelValuePy.string(channel_value.string) elif channel_value.HasField("double"): - return ChannelValuePy.double("", channel_value.double) + return IngestWithConfigDataChannelValuePy.double(channel_value.double) elif channel_value.HasField("float"): - return ChannelValuePy.float("", channel_value.float) + return IngestWithConfigDataChannelValuePy.float(channel_value.float) elif channel_value.HasField("bool"): - return ChannelValuePy.bool("", channel_value.bool) + return IngestWithConfigDataChannelValuePy.bool(channel_value.bool) elif channel_value.HasField("int32"): - return ChannelValuePy.int32("", channel_value.int32) + return IngestWithConfigDataChannelValuePy.int32(channel_value.int32) elif channel_value.HasField("uint32"): - return ChannelValuePy.uint32("", channel_value.uint32) + return IngestWithConfigDataChannelValuePy.uint32(channel_value.uint32) elif channel_value.HasField("int64"): - return ChannelValuePy.int64("", channel_value.int64) + return IngestWithConfigDataChannelValuePy.int64(channel_value.int64) elif channel_value.HasField("uint64"): - return ChannelValuePy.uint64("", channel_value.uint64) + return IngestWithConfigDataChannelValuePy.uint64(channel_value.uint64) elif channel_value.HasField("enum"): # For enum values, we need to create a ChannelEnumTypePy - enum_type = ChannelEnumTypePy(name=f"enum_{channel_value.enum}", key=channel_value.enum) - return ChannelValuePy.enum_value("", enum_type) + return IngestWithConfigDataChannelValuePy.enum_value(channel_value.enum) elif channel_value.HasField("bit_field"): - return ChannelValuePy.bitfield("", channel_value.bitfield) + return IngestWithConfigDataChannelValuePy.bitfield(channel_value.bit_field) elif channel_value.HasField("bytes"): # For bytes values, we'll convert to a string representation - return ChannelValuePy.string("", str(channel_value.bytes)) + return IngestWithConfigDataChannelValuePy.string(str(channel_value.bytes)) elif channel_value.HasField("empty"): # For empty values, we'll return a default value - return ChannelValuePy.string("", "") + return IngestWithConfigDataChannelValuePy.empty() else: - # No field set, return empty string - return ChannelValuePy.string("", "") + # No field set, return empty value + return IngestWithConfigDataChannelValuePy.empty() From 7fb083328397ee01e6f872aefa62c47913145ac4 Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Tue, 29 Jul 2025 16:23:19 -0700 Subject: [PATCH 12/34] fix: Insecure channel return type --- python/lib/sift_py/grpc/transport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py index 02d244a6f..383eb93e0 100644 --- a/python/lib/sift_py/grpc/transport.py +++ b/python/lib/sift_py/grpc/transport.py @@ -125,7 +125,7 @@ def use_sift_async_channel( def _use_insecure_sift_channel( config: SiftChannelConfig, metadata: Optional[Dict[str, Any]] = None -) -> grpc.Channel: +) -> SiftChannel: """ 
FOR DEVELOPMENT PURPOSES ONLY """ From 57c845abf7f9aaa5d1110d2253beda0e89bba38f Mon Sep 17 00:00:00 2001 From: Ailin Yu Date: Tue, 29 Jul 2025 16:26:11 -0700 Subject: [PATCH 13/34] chore: buffered ingestion can call regular ingest now --- python/lib/sift_py/ingestion/buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/sift_py/ingestion/buffer.py b/python/lib/sift_py/ingestion/buffer.py index ee55a9f9d..0e9749dec 100644 --- a/python/lib/sift_py/ingestion/buffer.py +++ b/python/lib/sift_py/ingestion/buffer.py @@ -165,7 +165,7 @@ def flush(self): def _flush(self): if len(self._buffer) > 0: - self._ingestion_service.ingest_async(*self._buffer) + self._ingestion_service.ingest(*self._buffer) self._buffer.clear() def _start_flush_timer(self): From 5fd84367ef90b093fc48b6139c8d6d2dd81afe8a Mon Sep 17 00:00:00 2001 From: Ian Later Date: Wed, 6 Aug 2025 17:18:00 -0700 Subject: [PATCH 14/34] FD-83: Use longer lived threads for ingestion. --- .../ingestion_with_python_config/main.py | 6 + .../ingestion_with_python_config/simulator.py | 2 +- .../examples/ingestion_with_threading/main.py | 3 + .../sample_data/sample_logs.txt | 356 ++++++++++++++++++ .../ingestion_with_threading/simulator.py | 3 +- python/lib/sift_py/grpc/transport.py | 1 + .../lib/sift_py/ingestion/_internal/ingest.py | 67 ++-- .../ingestion/_internal/ingestion_config.py | 1 + .../lib/sift_py/ingestion/_internal/stream.py | 152 ++++++-- python/pyproject.toml | 2 +- 10 files changed, 512 insertions(+), 81 deletions(-) create mode 100644 python/examples/ingestion_with_threading/sample_data/sample_logs.txt diff --git a/python/examples/ingestion_with_python_config/main.py b/python/examples/ingestion_with_python_config/main.py index a27d288f9..fc7b62139 100644 --- a/python/examples/ingestion_with_python_config/main.py +++ b/python/examples/ingestion_with_python_config/main.py @@ -16,11 +16,14 @@ load_dotenv() apikey = os.getenv("SIFT_API_KEY") + apikey = "aA5ZGxfVIhc1DjNKs47HOhrlptC8QLBp3ms20NPp" if apikey is None: raise Exception("Missing 'SIFT_API_KEY' environment variable.") base_uri = os.getenv("BASE_URI") + if not base_uri.startswith("http"): + base_uri = f"http://localhost:50051" if base_uri is None: raise Exception("Missing 'BASE_URI' environment variable.") @@ -30,6 +33,9 @@ # Create a gRPC transport channel configured specifically for the Sift API sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey) + sift_channel_config["use_ssl"] = False + # sift_channel_config["cert_via_openssl"] = False + print(f"sift_channel_config: {sift_channel_config}") with use_sift_channel(sift_channel_config) as channel: # Create ingestion service using the telemetry config we loaded in diff --git a/python/examples/ingestion_with_python_config/simulator.py b/python/examples/ingestion_with_python_config/simulator.py index 46fd0a8ae..5671f7695 100644 --- a/python/examples/ingestion_with_python_config/simulator.py +++ b/python/examples/ingestion_with_python_config/simulator.py @@ -68,7 +68,7 @@ def run(self): logs_interval_s = 1 / LOGS_FREQUENCY_HZ partial_readings_with_log_interval_s = 1 / PARTIAL_READINGS_WITH_LOG_FREQUENCY_HZ - with self.ingestion_service.buffered_ingestion() as buffered_ingestion: + with self.ingestion_service.buffered_ingestion(buffer_size=10) as buffered_ingestion: while time.time() < end_time: current_time = time.time() diff --git a/python/examples/ingestion_with_threading/main.py b/python/examples/ingestion_with_threading/main.py index 3fc808dbf..135eacd10 100644 --- 
a/python/examples/ingestion_with_threading/main.py +++ b/python/examples/ingestion_with_threading/main.py @@ -43,6 +43,8 @@ def ingestion_thread(data_queue: Queue): raise Exception("Missing 'SIFT_API_KEY' environment variable.") base_uri = os.getenv("BASE_URI") + if not base_uri.startswith("http"): + base_uri = f"http://{base_uri}" if base_uri is None: raise Exception("Missing 'BASE_URI' environment variable.") @@ -52,6 +54,7 @@ def ingestion_thread(data_queue: Queue): # Create a gRPC transport channel configured specifically for the Sift API sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey) + sift_channel_config["use_ssl"] = False with use_sift_channel(sift_channel_config) as channel: # Create ingestion service using the telemetry config we loaded in diff --git a/python/examples/ingestion_with_threading/sample_data/sample_logs.txt b/python/examples/ingestion_with_threading/sample_data/sample_logs.txt new file mode 100644 index 000000000..1475b0961 --- /dev/null +++ b/python/examples/ingestion_with_threading/sample_data/sample_logs.txt @@ -0,0 +1,356 @@ +[sshd(pam_unix)[14281]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=217.60.212.66 user=guest +kernel:\\n audit(1122475266.4294965305:0): initialized +su(pam_unix)[2605]: session closed for user cyrus +syslog: klogd startup succeeded +kernel:\\n audit(1122475266.4294965305:0): initialized +sshd(pam_unix)[10035]: check pass; user unknown +bluetooth: sdpd startup succeeded +rpc.statd[1618]: Version 1.0.6 Starting +kernel:\\n There is already a security framework initialized + register_security failed. +gdm(pam_unix)[2803]: authentication failure; logname= uid=0 euid=0 tty=:0 ruser= rhost= +sshd(pam_unix)[28975]: check pass; user unknown +kernel:\\n usbcore: registered new driver hub +rpc.statd[1618]: Version 1.0.6 Starting +xinetd[26482]: warning: can't get client address: Connection reset by peer +kernel:\\n Intel machine check architecture supported. +kernel:\\n CPU: Intel Pentium III (Coppermine) stepping 06 +ftpd[24534]: connection from 217.187.83.139 () at Sun Jul 10 03:55:15 2005 +kernel:\\n CPU: Intel Pentium III (Coppermine) stepping 06 +kernel:\\n audit: initializing netlink socket (disabled) +kernel:\\n Linux version 2.6.5-1.358 (bhcompile@bugs.build.redhat.com) (gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7)) #1 Sat May 8 09:04:50 EDT 2004 +su(pam_unix)[1595]: session closed for user news +kernel:\\n audit(1122475266.4294965305:0): initialized +sshd(pam_unix)[24030]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=61-220-159-99.hinet-ip.hinet.net user=root +su(pam_unix)[2605]: session closed for user cyrus +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +gdm(pam_unix)[2803]: authentication failure; logname= uid=0 euid=0 tty=:0 ruser= rhost= +hcid[1690]: HCI daemon ver 2.4 started +kernel:\\n You can enable it with acpi=force +bluetooth: sdpd startup succeeded +kernel:\\n BIOS-provided physical RAM map: +kernel:\\n Memory: 125312k/129720k available (1540k kernel code +3860k reserved +599k data +144k init +0k highmem +kernel:\\n SELinux: Registering netfilter hooks +kernel:\\n PCI: Probing PCI hardware (bus 00) +kernel:\\n PCI: Invalid ACPI-PCI IRQ routing table +kernel:\\n Console: colour VGA+ 80x25 +kernel:\\n Calibrating delay loop... 
1445.88 BogoMIPS +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +cups: cupsd shutdown succeeded +ftpd[24378]: connection from 207.30.238.8 (host8.topspot.net) at Sun Jul 17 14:03:05 2005 +ftpd[16782]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +kernel:\\n Enabling unmasked SIMD FPU exception support... done. +kernel:\\n Initializing Cryptographic API +kernel:\\n Capability LSM initialized +kernel:\\n Enabling fast FPU save and restore... done. +kernel:\\n You can enable it with acpi=force +sshd(pam_unix)[5586]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=193.110.106.11 user=root +kernel:\\n Failure registering capabilities with the kernel +ftpd[24091]: connection from 206.196.21.129 (host129.206.196.21.maximumasp.com) at Sat Jul 9 22:53:19 2005 +xinetd[26482]: warning: can't get client address: Connection reset by peer +kernel:\\n BIOS-provided physical RAM map: +kernel:\\n Linux Plug and Play Support v0.97 (c) Adam Belay +sshd(pam_unix)[31848]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=82.77.200.128 user=root +kernel:\\n Intel machine check architecture supported. +kernel:\\n Security Scaffold v1.0.0 initialized +kernel:\\n DMI 2.3 present. +gdm(pam_unix)[2803]: authentication failure; logname= uid=0 euid=0 tty=:0 ruser= rhost= +network: Setting network parameters: succeeded +kernel:\\n PCI: Invalid ACPI-PCI IRQ routing table +kernel:\\n Checking 'hlt' instruction... OK. +ftpd[25239]: connection from 82.68.222.195 (82-68-222-195.dsl.in-addr.zen.co.uk) at Sun Jul 17 23:21:54 2005 +kernel:\\n apm: BIOS version 1.2 Flags 0x03 (Driver version 1.16ac) +kernel:\\n Transparent bridge - 0000:00:1e.0 +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +ftpd[16782]: connection from 84.102.20.2 () at Sun Jul 24 02:38:22 2005 +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +kernel:\\n BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved) +ftpd[12299]: connection from 211.42.188.206 () at Fri Jul 22 09:27:24 2005 +kernel:\\n audit(1122475266.4294965305:0): initialized +kernel:\\n POSIX conformance testing by UNIFIX +kernel:\\n Initializing CPU#0 +kernel:\\n Kernel command line: ro root=LABEL=/ rhgb quiet +xinetd[26482]: warning: can't get client address: Connection reset by peer +ftpd[16782]: connection from 84.102.20.2 () at Sun Jul 24 02:38:22 2005 +ftpd[12299]: connection from 211.42.188.206 () at Fri Jul 22 09:27:24 2005 +kernel:\\n Real Time Clock Driver v1.12 +ftpd[24378]: connection from 207.30.238.8 (host8.topspot.net) at Sun Jul 17 14:03:05 2005 +ftpd[26466]: getpeername (ftpd): Transport endpoint is not connected +sshd(pam_unix)[10035]: check pass; user unknown +kernel:\\n Console: colour VGA+ 80x25 +kernel:\\n POSIX conformance testing by UNIFIX +kernel:\\n ACPI disabled because your bios is from 2000 and too old +ftpd[26466]: getpeername (ftpd): Transport endpoint is not connected +kernel:\\n Initializing CPU#0 +kernel:\\n SELinux: Starting in permissive mode +kernel:\\n CPU: Intel Pentium III (Coppermine) stepping 06 +ftpd[16782]: connection from 84.102.20.2 () at Sun Jul 24 02:38:22 2005 +kernel:\\n VFS: Disk quotas dquot_6.5.1 +ftpd[17689]: connection from 212.65.68.82 () at Sat Jul 16 08:14:07 2005 +sshd(pam_unix)[23798]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=p15105218.pureserver.info user=root +ftpd[12299]: connection from 211.42.188.206 () at Fri Jul 22 09:27:24 2005 +sshd(pam_unix)[5586]: authentication failure; logname= uid=0 
euid=0 tty=NODEVssh ruser= rhost=193.110.106.11 user=root +kernel:\\n NET: Registered protocol family 16 +su(pam_unix)[16058]: session opened for user cyrus by (uid=0) +gdm-binary[2803]: Couldn't authenticate user +ftpd[24978]: connection from 206.47.209.10 () at Mon Jul 25 06:39:18 2005 +kernel:\\n HighMem zone: 0 pages + LIFO batch:1 +kernel:\\n Capability LSM initialized +kernel:\\n ACPI: ACPI tables contain no PCI IRQ routing entries +kernel:\\n audit(1122475266.4294965305:0): initialized +ftpd[16782]: connection from 84.102.20.2 () at Sun Jul 24 02:38:22 2005 +logrotate: ALERT exited abnormally with [1] +kernel:\\n DMI 2.3 present. +kernel:\\n zapping low mappings. +sdpd[1696]: sdpd v1.5 started +kernel:\\n CPU: Intel Pentium III (Coppermine) stepping 06 +ftpd[25648]: connection from 211.72.151.162 () at Mon Jul 18 03:26:49 2005 +kernel:\\n SELinux: Starting in permissive mode +syslogd 1.4.1: restart. +ftpd[24534]: connection from 217.187.83.139 () at Sun Jul 10 03:55:15 2005 +bluetooth: sdpd startup succeeded +gdm-binary[2803]: Couldn't authenticate user +kernel:\\n CPU 0 irqstacks + hard=02345000 soft=02344000 +ftpd[24978]: connection from 206.47.209.10 () at Mon Jul 25 06:39:18 2005 +kernel:\\n Transparent bridge - 0000:00:1e.0 +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +sshd(pam_unix)[31207]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=adsl-70-242-75-179.dsl.ksc2mo.swbell.net +kernel:\\n Using tsc for high-res timesource +network: Bringing up loopback interface: succeeded +kernel:\\n klogd 1.4.1 + log source = /proc/kmsg started. +kernel:\\n Enabling unmasked SIMD FPU exception support... done. +kernel:\\n SELinux: Registering netfilter hooks +network: Bringing up loopback interface: succeeded +kernel:\\n ACPI: Subsystem revision 20040326 +sdpd[1696]: sdpd v1.5 started +kernel:\\n CPU: L2 cache: 256K +sdpd[1696]: sdpd v1.5 started +sshd(pam_unix)[10035]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=65.166.159.14 +kernel:\\n Linux version 2.6.5-1.358 (bhcompile@bugs.build.redhat.com) (gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7)) #1 Sat May 8 09:04:50 EDT 2004 +su(pam_unix)[10583]: session closed for user news +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +kernel:\\n BIOS-e820: 00000000ffb00000 - 0000000100000000 (reserved) +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +kernel:\\n DMA zone: 4096 pages + LIFO batch:1 +sshd(pam_unix)[8113]: session opened for user test by (uid=509) +kernel:\\n Inode-cache hash table entries: 8192 (order: 3 + 32768 bytes) +kernel:\\n audit(1122475266.4294965305:0): initialized +kernel:\\n Checking 'hlt' instruction... OK. +kernel:\\n Enabling fast FPU save and restore... done. +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +kernel:\\n Checking 'hlt' instruction... OK. 
+syslog: klogd startup succeeded +kernel:\\n BIOS-e820: 0000000007eae000 - 0000000008000000 (reserved) +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +kernel:\\n Memory: 125312k/129720k available (1540k kernel code + 3860k reserved + 599k data + 144k init + 0k highmem) +named[2306]: notify question section contains no SOA +kernel:\\n usbcore: registered new driver usbfs +kernel:\\n Inode-cache hash table entries: 8192 (order: 3 + 32768 bytes) +kernel:\\n PID hash table entries: 512 (order 9: 4096 bytes) +kernel:\\n audit(1122475266.4294965305:0): initialized +kernel:\\n Mount-cache hash table entries: 512 (order: 0 + 4096 bytes) +kernel:\\n ACPI: Subsystem revision 20040326 +nfslock: rpc.statd startup succeeded +kernel:\\n ACPI: Subsystem revision 20040326 +logrotate: ALERT exited abnormally with [1] +kernel:\\n BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved) +kernel:\\n pci_hotplug: PCI Hot Plug PCI Core version: 0.5 +sshd(pam_unix)[8113]: session closed for user test +gdm(pam_unix)[2803]: check pass; user unknown +ftpd[24091]: connection from 206.196.21.129 (host129.206.196.21.maximumasp.com) at Sat Jul 9 22:53:19 2005 +sshd(pam_unix)[14281]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=217.60.212.66 user=guest +ftpd[26463]: getpeername (ftpd): Transport endpoint is not connected +kernel:\\n PCI: Invalid ACPI-PCI IRQ routing table +kernel:\\n Kernel command line: ro root=LABEL=/ rhgb quiet +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +kernel:\\n Normal zone: 28334 pages + LIFO batch:6 +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +kernel:\\n Capability LSM initialized +kernel:\\n pci_hotplug: PCI Hot Plug PCI Core version: 0.5 +kernel:\\n BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved) +su(pam_unix)[10583]: session closed for user news +kernel:\\n Built 1 zonelists +ftpd[17689]: connection from 212.65.68.82 () at Sat Jul 16 08:14:07 2005 +kernel:\\n CPU: L2 cache: 256K +syslogd 1.4.1: restart. +ftpd[13162]: connection from 67.95.49.172 () at Fri Jul 22 19:29:10 2005 +kernel:\\n Enabling unmasked SIMD FPU exception support... done. 
+kernel:\\n Capability LSM initialized +bluetooth: sdpd startup succeeded +kernel:\\n Real Time Clock Driver v1.12 +sshd(pam_unix)[8113]: session opened for user test by (uid=509) +kernel:\\n BIOS-e820: 0000000000000000 - 00000000000a0000 (usable) +kernel:\\n pci_hotplug: PCI Hot Plug PCI Core version: 0.5 +rpc.statd[1618]: Version 1.0.6 Starting +su(pam_unix)[2605]: session opened for user cyrus by (uid=0) +ftpd[26463]: getpeername (ftpd): Transport endpoint is not connected +su(pam_unix)[10583]: session opened for user news by (uid=0) +kernel:\\n Initializing CPU#0 +kernel:\\n PCI: Using configuration type 1 +kernel:\\n isapnp: No Plug & Play device found +sshd(pam_unix)[8113]: session opened for user test by (uid=509) +irqbalance: irqbalance startup succeeded +kernel:\\n SELinux: Registering netfilter hooks +kernel:\\n Security Scaffold v1.0.0 initialized +ftpd[25239]: connection from 82.68.222.195 (82-68-222-195.dsl.in-addr.zen.co.uk) at Sun Jul 17 23:21:54 2005 +kernel:\\n PCI: Using configuration type 1 +kernel:\\n audit: initializing netlink socket (disabled) +ftpd[24378]: connection from 207.30.238.8 (host8.topspot.net) at Sun Jul 17 14:03:05 2005 +sshd(pam_unix)[8113]: session closed for user test +kernel:\\n PCI: Invalid ACPI-PCI IRQ routing table +kernel:\\n SELinux: Starting in permissive mode +kernel:\\n Dentry cache hash table entries: 16384 (order: 4 + 65536 bytes) +ftpd[15342]: connection from 211.107.232.1 () at Fri Jul 15 23:42:44 2005 +kernel:\\n zapping low mappings. +ftpd[25239]: connection from 82.68.222.195 (82-68-222-195.dsl.in-addr.zen.co.uk) at Sun Jul 17 23:21:54 2005 +kernel:\\n CPU: L1 I cache: 16K + L1 D cache: 16K +kernel:\\n CPU: L1 I cache: 16K + L1 D cache: 16K +gdm(pam_unix)[2803]: authentication failure; logname= uid=0 euid=0 tty=:0 ruser= rhost= +network: Bringing up loopback interface: succeeded +kernel:\\n Enabling unmasked SIMD FPU exception support... done. +kernel:\\n Linux Plug and Play Support v0.97 (c) Adam Belay +sshd(pam_unix)[14281]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=217.60.212.66 user=guest +kernel:\\n CPU: L2 cache: 256K +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +kernel:\\n ACPI disabled because your bios is from 2000 and too old +kernel:\\n Console: colour VGA+ 80x25 +kernel:\\n HighMem zone: 0 pages + LIFO batch:1 +kernel:\\n Intel machine check reporting enabled on CPU#0. +rpcidmapd: rpc.idmapd startup succeeded +su(pam_unix)[1595]: session closed for user news +kernel:\\n Intel machine check reporting enabled on CPU#0. +kernel:\\n BIOS-e820: 0000000007eae000 - 0000000008000000 (reserved) +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +kernel:\\n Mount-cache hash table entries: 512 (order: 0 + 4096 bytes) +kernel:\\n ACPI: Interpreter disabled. +kernel:\\n BIOS-e820: 0000000000100000 - 0000000007eae000 (usable) +kernel:\\n ACPI: ACPI tables contain no PCI IRQ routing entries +kernel:\\n There is already a security framework initialized + register_security failed. +kernel:\\n Normal zone: 28334 pages + LIFO batch:6 +bluetooth: hcid startup succeeded +su(pam_unix)[16058]: session opened for user cyrus by (uid=0) +kernel:\\n Checking 'hlt' instruction... OK. +kernel:\\n Enabling fast FPU save and restore... done. 
+sshd(pam_unix)[28975]: check pass; user unknown +sshd(pam_unix)[14281]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=217.60.212.66 user=guest +kernel:\\n usbcore: registered new driver usbfs +kernel:\\n On node 0 totalpages: 32430 +kernel:\\n BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved) +kernel:\\n PCI: Probing PCI hardware (bus 00) +kernel:\\n You can enable it with acpi=force +kernel:\\n CPU: L2 cache: 256K +kernel:\\n Linux agpgart interface v0.100 (c) Dave Jones +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +logrotate: ALERT exited abnormally with [1] +ftpd[26466]: getpeername (ftpd): Transport endpoint is not connected +ftpd[24487]: connection from 203.101.45.59 (dsl-Chn-static-059.45.101.203.touchtelindia.net) at Sun Jul 17 15:09:17 2005 +ftpd[24978]: connection from 206.47.209.10 () at Mon Jul 25 06:39:18 2005 +kernel:\\n PID hash table entries: 512 (order 9: 4096 bytes) +kernel:\\n Detected 731.219 MHz processor. +sshd(pam_unix)[23798]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=p15105218.pureserver.info user=root +kernel:\\n Intel machine check reporting enabled on CPU#0. +ftpd[17689]: connection from 212.65.68.82 () at Sat Jul 16 08:14:07 2005 +sshd(pam_unix)[5586]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=193.110.106.11 user=root +ftpd[13162]: connection from 67.95.49.172 () at Fri Jul 22 19:29:10 2005 +kernel:\\n apm: BIOS version 1.2 Flags 0x03 (Driver version 1.16ac) +kernel:\\n Intel machine check reporting enabled on CPU#0. +kernel:\\n Linux Plug and Play Support v0.97 (c) Adam Belay +su(pam_unix)[10583]: session opened for user news by (uid=0) +kernel:\\n Calibrating delay loop... 1445.88 BogoMIPS +su(pam_unix)[10583]: session opened for user news by (uid=0) +kernel:\\n Intel machine check architecture supported. +sshd(pam_unix)[30631]: session closed for user test +kernel:\\n PID hash table entries: 512 (order 9: 4096 bytes) +portmap: portmap startup succeeded +kernel:\\n Using tsc for high-res timesource +ftpd[26466]: getpeername (ftpd): Transport endpoint is not connected +kernel:\\n usbcore: registered new driver hub +ftpd[25239]: connection from 82.68.222.195 (82-68-222-195.dsl.in-addr.zen.co.uk) at Sun Jul 17 23:21:54 2005 +kernel:\\n Memory: 125312k/129720k available (1540k kernel code + 3860k reserved + 599k data + 144k init + 0k highmem) +ftpd[16781]: ANONYMOUS FTP LOGIN FROM 84.102.20.2 + (anonymous) +kernel:\\n Failure registering capabilities with the kernel +kernel:\\n NET: Registered protocol family 16 +ftpd[25648]: connection from 211.72.151.162 () at Mon Jul 18 03:26:49 2005 +bluetooth: sdpd startup succeeded +sshd(pam_unix)[14281]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=217.60.212.66 user=guest +kernel:\\n Intel machine check architecture supported. +hcid[1690]: HCI daemon ver 2.4 started +kernel:\\n BIOS-e820: 0000000000100000 - 0000000007eae000 (usable) +sshd(pam_unix)[31848]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=82.77.200.128 user=root +kernel:\\n ACPI: Interpreter disabled. +kernel:\\n 126MB LOWMEM available. 
+kernel:\\n ACPI: ACPI tables contain no PCI IRQ routing entries +sshd(pam_unix)[30632]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=150.183.249.110 user=root +sshd(pam_unix)[30632]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=150.183.249.110 user=root +named[2306]: notify question section contains no SOA +kernel:\\n DMI 2.3 present. +rpc.statd[1618]: Version 1.0.6 Starting +ftpd[16781]: connection from 84.102.20.2 () at Sun Jul 24 02:38:22 2005 +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +kernel:\\n Real Time Clock Driver v1.12 +sshd(pam_unix)[31848]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=82.77.200.128 user=root +sshd(pam_unix)[31848]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=82.77.200.128 user=root +syslog: syslogd startup succeeded +kernel:\\n PCI: Probing PCI hardware (bus 00) +su(pam_unix)[2605]: session closed for user cyrus +rc: Starting pcmcia: succeeded +logrotate: ALERT exited abnormally with [1] +kernel:\\n Initializing CPU#0 +kernel:\\n isapnp: Scanning for PnP cards... +sshd(pam_unix)[8113]: session opened for user test by (uid=509) +kernel:\\n BIOS-e820: 00000000ffb00000 - 0000000100000000 (reserved) +kernel:\\n 0MB HIGHMEM available. +sysctl: kernel.core_uses_pid = 1 +kernel:\\n PCI: Using IRQ router PIIX/ICH [8086/2410] at 0000:00:1f.0 +sshd(pam_unix)[30631]: session closed for user test +kernel:\\n Linux Plug and Play Support v0.97 (c) Adam Belay +sshd(pam_unix)[24030]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=61-220-159-99.hinet-ip.hinet.net user=root +kernel:\\n CPU: L2 cache: 256K +kernel:\\n zapping low mappings. +kernel:\\n Calibrating delay loop... 1445.88 BogoMIPS +kernel:\\n There is already a security framework initialized +register_security failed. 
+su(pam_unix)[10583]: session closed for user news +kernel:\\n Memory: 125312k/129720k available (1540k kernel code +3860k reserved +599k data +144k init +0k highmem) +ftpd[15342]: connection from 211.107.232.1 () at Fri Jul 15 23:42:44 2005 +random: Initializing random number generator: succeeded] diff --git a/python/examples/ingestion_with_threading/simulator.py b/python/examples/ingestion_with_threading/simulator.py index 92f73404e..52cf3168d 100644 --- a/python/examples/ingestion_with_threading/simulator.py +++ b/python/examples/ingestion_with_threading/simulator.py @@ -39,7 +39,8 @@ def __init__(self, data_queue: Queue, asset_name: str, run_id: Optional[str]): sample_bit_field_values = ["00001001", "00100011", "00001101", "11000001"] self.sample_bit_field_values = [bytes([int(byte, 2)]) for byte in sample_bit_field_values] - sample_logs = Path().joinpath("sample_data").joinpath("sample_logs.txt") + dir_path = Path(__file__).parent + sample_logs = dir_path.joinpath("sample_data").joinpath("sample_logs.txt") with open(sample_logs, "r") as file: self.sample_logs = file.readlines() diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py index 383eb93e0..8a4546795 100644 --- a/python/lib/sift_py/grpc/transport.py +++ b/python/lib/sift_py/grpc/transport.py @@ -91,6 +91,7 @@ def use_sift_channel( if not use_ssl: channel = _use_insecure_sift_channel(config, metadata) + print("insecure channel") return SiftChannelWithConfig(config, channel) credentials = get_ssl_credentials(cert_via_openssl) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index f42bf92f8..34bed6f8b 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import atexit +from queue import Queue import threading import time from collections.abc import Callable @@ -26,6 +28,7 @@ ) from sift_py.ingestion._internal.run import create_run, get_run_id_by_name from sift_py.ingestion._internal.stream import ( + IngestionThread, get_builder, get_run_form, stream_requests, @@ -43,7 +46,7 @@ from sift_py.rule.service import RuleService logger = logging.getLogger(__name__) - +logger.setLevel(logging.DEBUG) class _IngestionServiceImpl: transport_channel: SiftChannel @@ -58,8 +61,8 @@ class _IngestionServiceImpl: ingest_service_stub: IngestServiceStub rule_service: RuleService - _async_threads: List[threading.Thread] - _threads_lock: threading.Lock + _request_queue: Queue + _ingestion_thread: IngestionThread def __init__( self, @@ -101,8 +104,10 @@ def __init__( self.config = config # Thread tracking for async ingestion - self._async_threads = [] - self._threads_lock = threading.Lock() + self._request_queue = Queue() + self._ingestion_thread = IngestionThread(self.builder, self._request_queue) + self._ingestion_thread.start() + atexit.register(self.wait_for_async_ingestion, timeout=0.1) def ingest(self, *requests: IngestWithConfigDataStreamRequest): """ @@ -115,27 +120,8 @@ def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): Perform data ingestion asynchronously in a background thread. This allows multiple ingest calls to run in parallel. 
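+ Requests are converted to the Rust stream-binding types and enqueued; a long-lived ingestion thread drains the queue and streams them to Sift.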
""" - - def _ingest_and_cleanup(): - try: - stream_requests(self.builder, *requests, self.run_id) - finally: - # Remove this thread from tracking when it completes - with self._threads_lock: - if threading.current_thread() in self._async_threads: - self._async_threads.remove(threading.current_thread()) - - thread = threading.Thread( - target=_ingest_and_cleanup, - daemon=True, - ) - - # Track the thread - with self._threads_lock: - self._async_threads.append(thread) - - thread.start() - return thread + # TODO: Create a thread pool and add to whichever queue is smallest + stream_requests(self._request_queue, *requests, self.run_id) def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: """ @@ -147,28 +133,17 @@ def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: Returns: bool: True if all threads completed within timeout, False otherwise. """ - with self._threads_lock: - threads_to_wait = self._async_threads.copy() - - if not threads_to_wait: - return True - - # Wait for each thread with the remaining timeout - start_time = time.time() if timeout is not None else None - - for thread in threads_to_wait: - if timeout is not None: - remaining_timeout = timeout - (time.time() - start_time) - if remaining_timeout <= 0: - return False - thread.join(timeout=remaining_timeout) - if thread.is_alive(): - return False - else: - thread.join() - + self._request_queue.put(None) + self._ingestion_thread.join(timeout=timeout) + if self._ingestion_thread.is_alive(): + logger.error( + f"Ingestion thread did not finish after {timeout} seconds. Forcing stop." + ) + self._ingestion_thread.stop() + return False return True + def get_async_thread_count(self) -> int: """ Get the number of currently running async ingestion threads. diff --git a/python/lib/sift_py/ingestion/_internal/ingestion_config.py b/python/lib/sift_py/ingestion/_internal/ingestion_config.py index a0e93c95e..c5a66205e 100644 --- a/python/lib/sift_py/ingestion/_internal/ingestion_config.py +++ b/python/lib/sift_py/ingestion/_internal/ingestion_config.py @@ -31,6 +31,7 @@ def get_ingestion_config_by_client_key( Returns `None` if no ingestion config can be matched with the provided `client_key` """ + print(f"channel.config: {channel.config}") svc = IngestionConfigServiceStub(channel) req = ListIngestionConfigsRequest( filter=f'client_key=="{client_key}"', diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index e6ab4d6cb..6b47fbccc 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,5 +1,9 @@ import asyncio +import logging import random +import time +import threading +from datetime import datetime, timedelta from queue import Queue from typing import List, Optional @@ -13,6 +17,8 @@ IngestionConfigFormPy, IngestWithConfigDataChannelValuePy, IngestWithConfigDataStreamRequestPy, + RecoveryStrategyPy, + RetryPolicyPy, RunFormPy, SiftStreamBuilderPy, TimeValuePy, @@ -21,6 +27,95 @@ from sift_py.grpc.transport import SiftChannel from sift_py.ingestion.config.telemetry import TelemetryConfig +logger = logging.getLogger(__name__) + +class IngestionThread(threading.Thread): + """ + Manages ingestion for a single ingestion config. + """ + + OUTER_LOOP_PERIOD = 0.1 # Time of intervals loop will sleep while waiting for data. + SIFT_STREAM_FINISH_TIMEOUT = 0.06 # Measured ~0.05s to finish stream. 
+ CLEANUP_TIMEOUT = OUTER_LOOP_PERIOD + SIFT_STREAM_FINISH_TIMEOUT
+
+ def __init__(
+ self,
+ sift_stream_builder: SiftStreamBuilderPy,
+ data_queue: Queue,
+ metric_interval: float = 0.5,
+ ):
+ """
+ Initialize the IngestionThread.
+
+ Args:
+ sift_stream_builder: The sift stream builder to build a new stream.
+ data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion.
+ metric_interval: Time (seconds) to wait between logging metrics.
+ """
+ super().__init__(daemon=True)
+ self.data_queue = data_queue
+ self._stop = threading.Event()
+ self.sift_stream_builder = sift_stream_builder
+ self.metric_interval = timedelta(seconds=metric_interval)
+
+ def stop(self):
+ self._stop.set()
+ # Give a brief chance to finish the stream (should take < 50ms).
+ time.sleep(self.CLEANUP_TIMEOUT)
+ self.task.cancel()
+
+ async def main(self):
+ logger.debug("Ingestion thread started")
+ sift_stream = await self.sift_stream_builder.build()
+ time_since_last_metric = datetime.now() - timedelta(seconds=1)
+ count = 0
+ try:
+ while True:
+ while not self.data_queue.empty():
+ if self._stop.is_set():
+ # Being forced to stop. Try to finish the stream.
+ logger.info(
+ f"Ingestion thread received stop signal. Exiting. Sent {count} requests. {self.data_queue.qsize()} requests remaining."
+ )
+ await sift_stream.finish()
+ return
+ item = self.data_queue.get()
+ if item is None:
+ self._stop.set()
+ continue
+ sift_stream = await sift_stream.send_requests(item)
+ count += 1
+ if datetime.now() - time_since_last_metric > self.metric_interval:
+ logger.debug(
+ f"Ingestion thread sent {count} requests, remaining: {self.data_queue.qsize()}"
+ )
+ time_since_last_metric = datetime.now()
+
+ if self._stop.is_set():
+ logger.debug(
+ f"No more requests. Stopping. Sent {count} requests. {self.data_queue.qsize()} requests remaining."
+ )
+ await sift_stream.finish()
+ return
+ else:
+ time.sleep(self.OUTER_LOOP_PERIOD)
+
+ except asyncio.CancelledError:
+ # It's possible the thread was joined while sleeping waiting for data. Only note error if we have data left.
+ if self.data_queue.qsize() > 0:
+ logger.error(
+ f"Ingestion thread cancelled without finishing stream. {self.data_queue.qsize()} requests were not sent."
+ )
+
+ async def _run(self):
+ self.task = asyncio.create_task(self.main())
+ await self.task
+
+ def run(self):
+ """This thread will handle sending data to Sift."""
+ # Even though this is a thread, we need to run this async task to await send_requests; otherwise we get sift_stream consumed errors.
+ asyncio.run(self._run())

 def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> SiftStreamBuilderPy:
 """
 Get a builder for a stream.
@@ -47,66 +142,39 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift
 builder = SiftStreamBuilderPy(uri, apikey)
 builder.ingestion_config = telemetry_config_to_ingestion_config_py(ingestion_config)
+ print(f"builder.ingestion_config: {builder.ingestion_config.client_key}, {builder.ingestion_config.asset_name}")
 builder.enable_tls = channel.config.get("use_ssl", True)
+ # FD-177: Expose configuration for recovery strategy.
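+ # Until then, default to retry-only recovery with the bindings' default retry policy.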
+ builder.recovery_strategy = ( + RecoveryStrategyPy.retry_only( + RetryPolicyPy.default() + ) + ) + return builder async def stream_requests_async( - builder: SiftStreamBuilderPy, run_id: str, *requests: IngestWithConfigDataStreamRequest + data_queue: Queue, run_id: str, *requests: IngestWithConfigDataStreamRequest ): - async def ingestion_thread(): - # Create stream and send requests - sift_stream = await builder.build() - try: - while not data_queue.empty(): - item = data_queue.get() - sift_stream = await sift_stream.send_requests(item) - await sift_stream.finish() - except Exception as e: - # Ensure stream is finished even if there's an error - try: - await sift_stream.finish() - except: - pass - raise e - - # Create a dedicated queue for this batch of requests - data_queue = Queue() + """ + Non-blocking: Convert requests for rust bindings and put them into a queue. + + Args: + data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion. + run_id: Optional run ID to associate with the requests + requests: List of IngestWithConfigDataStreamRequest protobuf objects + """ # Put each request individually into the queue, filtering out None values processed_requests = [] for request in requests: if not isinstance(request, IngestWithConfigDataStreamRequest): - print(f"Skipping request: {request} of type {type(request)}") + if isinstance(request, str): + print(f"Skipping request: {request} of type {type(request)}") + else: + raise ValueError(f"Received unexpected request: {request} of type {type(request)}") continue processed_request = ingest_request_to_ingest_request_py(request, run_id) if processed_request is not None: processed_requests.append(processed_request) data_queue.put(processed_requests) - print(f"Processing {len(requests)} requests in queue") - - # Process this batch - await ingestion_thread() - def stream_requests( - builder: SiftStreamBuilderPy, + data_queue: Queue, *requests: IngestWithConfigDataStreamRequest, run_id: str = "", ) -> None: """ - Stream requests using the stream bindings synchronously. - Each call to this function creates its own queue and stream, allowing multiple - batches to be processed concurrently when called from different threads. + Blocking: Convert requests for rust bindings and put them into a queue. Args: - builder: The SiftStreamBuilderPy to use for streaming + data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion. requests: List of IngestWithConfigDataStreamRequest protobuf objects run_id: Optional run ID to associate with the requests """ - print(f"Starting stream requests for {len(requests)} requests") - asyncio.run(stream_requests_async(builder, run_id, *requests)) + asyncio.run(stream_requests_async(data_queue, run_id, *requests)) def telemetry_config_to_ingestion_config_py( diff --git a/python/pyproject.toml b/python/pyproject.toml index 90e14ef67..b1adac5b0 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "pydantic_core~=2.3", "requests~=2.25", "requests-toolbelt~=1.0", - "sift-stream-bindings>=0.1", + "sift-stream-bindings>=0.1.2", "alive-progress~=3.0", # May move these to optional dependencies in the future. "pandas-stubs~=2.0", From 563a3efbb63565579cef0df24d38c9b46bf79678 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 7 Aug 2025 12:43:51 -0700 Subject: [PATCH 15/34] Build builder after attaching run so data goes to run and don't create an additional RunForm. 
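
Callers must attach a run before the first ingest: the ingestion thread is only started on the
first ingest call, and `attach_run` now sets `builder.run_id` directly instead of building a
separate RunForm. A minimal sketch of the intended call order (an existing `telemetry_config`
and pending `requests` are assumed; those names are illustrative, not part of this patch):

    import os

    from sift_py.grpc.transport import SiftChannelConfig, use_sift_channel
    from sift_py.ingestion.service import IngestionService

    channel_config = SiftChannelConfig(uri=os.environ["BASE_URI"], apikey=os.environ["SIFT_API_KEY"])
    with use_sift_channel(channel_config) as channel:
        ingestion_service = IngestionService(channel, telemetry_config)
        ingestion_service.attach_run(channel, "example-run")  # must precede the first ingest
        ingestion_service.ingest_async(*requests)  # first ingest starts the stream thread
        ingestion_service.wait_for_async_ingestion(timeout=5.0)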
--- .../ingestion_with_python_config/main.py | 5 --- .../examples/ingestion_with_threading/main.py | 3 -- python/lib/sift_py/grpc/transport.py | 3 +- .../lib/sift_py/ingestion/_internal/ingest.py | 44 ++++++------------- .../ingestion/_internal/ingestion_config.py | 1 - python/lib/sift_py/ingestion/_internal/run.py | 2 - .../lib/sift_py/ingestion/_internal/stream.py | 29 ++++-------- 7 files changed, 22 insertions(+), 65 deletions(-) diff --git a/python/examples/ingestion_with_python_config/main.py b/python/examples/ingestion_with_python_config/main.py index fc7b62139..bf6960dee 100644 --- a/python/examples/ingestion_with_python_config/main.py +++ b/python/examples/ingestion_with_python_config/main.py @@ -22,8 +22,6 @@ raise Exception("Missing 'SIFT_API_KEY' environment variable.") base_uri = os.getenv("BASE_URI") - if not base_uri.startswith("http"): - base_uri = f"http://localhost:50051" if base_uri is None: raise Exception("Missing 'BASE_URI' environment variable.") @@ -33,9 +31,6 @@ # Create a gRPC transport channel configured specifically for the Sift API sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey) - sift_channel_config["use_ssl"] = False - # sift_channel_config["cert_via_openssl"] = False - print(f"sift_channel_config: {sift_channel_config}") with use_sift_channel(sift_channel_config) as channel: # Create ingestion service using the telemetry config we loaded in diff --git a/python/examples/ingestion_with_threading/main.py b/python/examples/ingestion_with_threading/main.py index 135eacd10..3fc808dbf 100644 --- a/python/examples/ingestion_with_threading/main.py +++ b/python/examples/ingestion_with_threading/main.py @@ -43,8 +43,6 @@ def ingestion_thread(data_queue: Queue): raise Exception("Missing 'SIFT_API_KEY' environment variable.") base_uri = os.getenv("BASE_URI") - if not base_uri.startswith("http"): - base_uri = f"http://{base_uri}" if base_uri is None: raise Exception("Missing 'BASE_URI' environment variable.") @@ -54,7 +52,6 @@ def ingestion_thread(data_queue: Queue): # Create a gRPC transport channel configured specifically for the Sift API sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey) - sift_channel_config["use_ssl"] = False with use_sift_channel(sift_channel_config) as channel: # Create ingestion service using the telemetry config we loaded in diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py index 8a4546795..02d244a6f 100644 --- a/python/lib/sift_py/grpc/transport.py +++ b/python/lib/sift_py/grpc/transport.py @@ -91,7 +91,6 @@ def use_sift_channel( if not use_ssl: channel = _use_insecure_sift_channel(config, metadata) - print("insecure channel") return SiftChannelWithConfig(config, channel) credentials = get_ssl_credentials(cert_via_openssl) @@ -126,7 +125,7 @@ def use_sift_async_channel( def _use_insecure_sift_channel( config: SiftChannelConfig, metadata: Optional[Dict[str, Any]] = None -) -> SiftChannel: +) -> grpc.Channel: """ FOR DEVELOPMENT PURPOSES ONLY """ diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 34bed6f8b..6129d9a4e 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -1,12 +1,10 @@ from __future__ import annotations -import logging import atexit -from queue import Queue -import threading -import time +import logging from collections.abc import Callable from datetime import datetime +from queue import Queue from typing import Any, Dict, 
List, Optional, Union, cast from google.protobuf.timestamp_pb2 import Timestamp @@ -30,7 +28,6 @@ from sift_py.ingestion._internal.stream import ( IngestionThread, get_builder, - get_run_form, stream_requests, ) from sift_py.ingestion.channel import ( @@ -46,7 +43,7 @@ from sift_py.rule.service import RuleService logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) + class _IngestionServiceImpl: transport_channel: SiftChannel @@ -105,8 +102,8 @@ def __init__( # Thread tracking for async ingestion self._request_queue = Queue() + # Don't start thread here since user may attach a run after creating the ingestion service self._ingestion_thread = IngestionThread(self.builder, self._request_queue) - self._ingestion_thread.start() atexit.register(self.wait_for_async_ingestion, timeout=0.1) def ingest(self, *requests: IngestWithConfigDataStreamRequest): @@ -121,7 +118,10 @@ def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): This allows multiple ingest calls to run in parallel. """ # TODO: Create a thread pool and add to whichever queue is smallest - stream_requests(self._request_queue, *requests, self.run_id) + # Start thread on first ingest on the assumption all modifications to the ingestion config have concluded. + if not self._ingestion_thread.is_alive(): + self._ingestion_thread.start() + stream_requests(self._request_queue, *requests, run_id=str(self.run_id)) def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: """ @@ -136,26 +136,11 @@ def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: self._request_queue.put(None) self._ingestion_thread.join(timeout=timeout) if self._ingestion_thread.is_alive(): - logger.error( - f"Ingestion thread did not finish after {timeout} seconds. Forcing stop." - ) + logger.error(f"Ingestion thread did not finish after {timeout} seconds. Forcing stop.") self._ingestion_thread.stop() return False return True - - def get_async_thread_count(self) -> int: - """ - Get the number of currently running async ingestion threads. - - Returns: - int: Number of active async threads. - """ - with self._threads_lock: - # Clean up any completed threads - self._async_threads = [t for t in self._async_threads if t.is_alive()] - return len(self._async_threads) - def ingest_flows(self, *flows: FlowOrderedChannelValues): """ Combines the requests creation step and ingestion into a single call. @@ -204,6 +189,9 @@ def attach_run( Include `force_new=True` to force the creation of a new run, which will allow creation of a new run using an existing name. """ + if self._ingestion_thread.is_alive(): + raise IngestionValidationError("Cannot attach run while ingestion thread is running. 
Invoke before ingesting.") + if not force_new: run_id = get_run_id_by_name(channel, run_name) @@ -220,13 +208,7 @@ def attach_run( tags=tags or [], metadata=metadata, ) - - self.builder.run = get_run_form( - run_name=run_name, - run_description=description or "", - client_key=client_key, - run_tags=tags, - ) + self.builder.run_id = self.run_id def detach_run(self): """ diff --git a/python/lib/sift_py/ingestion/_internal/ingestion_config.py b/python/lib/sift_py/ingestion/_internal/ingestion_config.py index c5a66205e..a0e93c95e 100644 --- a/python/lib/sift_py/ingestion/_internal/ingestion_config.py +++ b/python/lib/sift_py/ingestion/_internal/ingestion_config.py @@ -31,7 +31,6 @@ def get_ingestion_config_by_client_key( Returns `None` if no ingestion config can be matched with the provided `client_key` """ - print(f"channel.config: {channel.config}") svc = IngestionConfigServiceStub(channel) req = ListIngestionConfigsRequest( filter=f'client_key=="{client_key}"', diff --git a/python/lib/sift_py/ingestion/_internal/run.py b/python/lib/sift_py/ingestion/_internal/run.py index bf313ccd2..df30bad39 100644 --- a/python/lib/sift_py/ingestion/_internal/run.py +++ b/python/lib/sift_py/ingestion/_internal/run.py @@ -52,8 +52,6 @@ def create_run( if run_client_key: kwargs["client_key"] = run_client_key - print(f"Creating run with kwargs: {kwargs}") - req = CreateRunRequest(**kwargs) res = cast(CreateRunResponse, svc.CreateRun(req)) return res.run.run_id diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 6b47fbccc..f92ea3b19 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -1,8 +1,8 @@ import asyncio import logging import random -import time import threading +import time from datetime import datetime, timedelta from queue import Queue from typing import List, Optional @@ -29,6 +29,7 @@ logger = logging.getLogger(__name__) + class IngestionThread(threading.Thread): """ Manages ingestion for a single ingestion config. @@ -117,6 +118,7 @@ def run(self): # Even thought this is a thread, we need to run this async task to await send_requests otherwise we get sift_stream consumed errors. asyncio.run(self._run()) + def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> SiftStreamBuilderPy: """ Get a builder for a stream. @@ -142,39 +144,30 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift builder = SiftStreamBuilderPy(uri, apikey) builder.ingestion_config = telemetry_config_to_ingestion_config_py(ingestion_config) - print(f"builder.ingestion_config: {builder.ingestion_config.client_key}, {builder.ingestion_config.asset_name}") builder.enable_tls = channel.config.get("use_ssl", True) # FD-177: Expose configuration for recovery strategy. - builder.recovery_strategy = ( - RecoveryStrategyPy.retry_only( - RetryPolicyPy.default() - ) - ) + builder.recovery_strategy = RecoveryStrategyPy.retry_only(RetryPolicyPy.default()) return builder async def stream_requests_async( - data_queue: Queue, run_id: str, *requests: IngestWithConfigDataStreamRequest + data_queue: Queue, *requests: IngestWithConfigDataStreamRequest, run_id: str = "" ): """ Non-blocking: Convert requests for rust bindings and put them into a queue. Args: data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion. 
- run_id: Optional run ID to associate with the requests requests: List of IngestWithConfigDataStreamRequest protobuf objects + run_id: Optional run ID to associate with the requests """ # Put each request individually into the queue, filtering out None values processed_requests = [] for request in requests: if not isinstance(request, IngestWithConfigDataStreamRequest): - if isinstance(request, str): - print(f"Skipping request: {request} of type {type(request)}") - else: - raise ValueError(f"Received unexpected request: {request} of type {type(request)}") - continue + raise ValueError(f"Received unexpected request: {request} of type {type(request)}") processed_request = ingest_request_to_ingest_request_py(request, run_id) if processed_request is not None: processed_requests.append(processed_request) @@ -194,7 +187,7 @@ def stream_requests( requests: List of IngestWithConfigDataStreamRequest protobuf objects run_id: Optional run ID to associate with the requests """ - asyncio.run(stream_requests_async(data_queue, run_id, *requests)) + asyncio.run(stream_requests_async(data_queue, *requests, run_id=run_id)) def telemetry_config_to_ingestion_config_py( @@ -260,7 +253,6 @@ def telemetry_config_to_ingestion_config_py( ) flow_configs_py.append(flow_config_py) - # Create ingestion config ingestion_config_py = IngestionConfigFormPy( asset_name=telemetry_config.asset_name, @@ -347,11 +339,6 @@ def ingest_request_to_ingest_request_py( Returns: IngestWithConfigDataStreamRequestPy: The converted request """ - if isinstance(request, str): - print(f"Converting request: {request} of type {type(request)}") - if request is None: - return None - timestamp_py = None if request.HasField("timestamp"): timestamp_py = TimeValuePy.from_timestamp( From e8fdaba605c2cb25b9b4d4790776e2c3f9bf1633 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 7 Aug 2025 14:48:52 -0700 Subject: [PATCH 16/34] TODO -> ticket --- python/lib/sift_py/ingestion/_internal/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index 6129d9a4e..d0d15800c 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -117,7 +117,7 @@ def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): Perform data ingestion asynchronously in a background thread. This allows multiple ingest calls to run in parallel. """ - # TODO: Create a thread pool and add to whichever queue is smallest + # FD-179: Create a thread pool and add to whichever queue is smallest # Start thread on first ingest on the assumption all modifications to the ingestion config have concluded. if not self._ingestion_thread.is_alive(): self._ingestion_thread.start() From 1c72294f3e8c9a9ca3159483f9d3d0b68d224df7 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 7 Aug 2025 14:51:48 -0700 Subject: [PATCH 17/34] lint --- python/lib/sift_py/ingestion/_internal/ingest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index d0d15800c..d9d24d0d2 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -190,8 +190,10 @@ def attach_run( Include `force_new=True` to force the creation of a new run, which will allow creation of a new run using an existing name. 
""" if self._ingestion_thread.is_alive(): - raise IngestionValidationError("Cannot attach run while ingestion thread is running. Invoke before ingesting.") - + raise IngestionValidationError( + "Cannot attach run while ingestion thread is running. Invoke before ingesting." + ) + if not force_new: run_id = get_run_id_by_name(channel, run_name) From 5683a74ba02ff01098430ae75ff62fbfcfde979b Mon Sep 17 00:00:00 2001 From: Ian Later Date: Tue, 12 Aug 2025 19:28:40 -0700 Subject: [PATCH 18/34] PR fb. --- .../ingestion_with_python_config/main.py | 1 - .../ingestion_with_python_config/simulator.py | 2 +- .../lib/sift_py/ingestion/_internal/ingest.py | 2 +- .../lib/sift_py/ingestion/_internal/stream.py | 33 ++++++++----------- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/python/examples/ingestion_with_python_config/main.py b/python/examples/ingestion_with_python_config/main.py index bf6960dee..a27d288f9 100644 --- a/python/examples/ingestion_with_python_config/main.py +++ b/python/examples/ingestion_with_python_config/main.py @@ -16,7 +16,6 @@ load_dotenv() apikey = os.getenv("SIFT_API_KEY") - apikey = "aA5ZGxfVIhc1DjNKs47HOhrlptC8QLBp3ms20NPp" if apikey is None: raise Exception("Missing 'SIFT_API_KEY' environment variable.") diff --git a/python/examples/ingestion_with_python_config/simulator.py b/python/examples/ingestion_with_python_config/simulator.py index 5671f7695..46fd0a8ae 100644 --- a/python/examples/ingestion_with_python_config/simulator.py +++ b/python/examples/ingestion_with_python_config/simulator.py @@ -68,7 +68,7 @@ def run(self): logs_interval_s = 1 / LOGS_FREQUENCY_HZ partial_readings_with_log_interval_s = 1 / PARTIAL_READINGS_WITH_LOG_FREQUENCY_HZ - with self.ingestion_service.buffered_ingestion(buffer_size=10) as buffered_ingestion: + with self.ingestion_service.buffered_ingestion() as buffered_ingestion: while time.time() < end_time: current_time = time.time() diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py index d9d24d0d2..44ec2f4c5 100644 --- a/python/lib/sift_py/ingestion/_internal/ingest.py +++ b/python/lib/sift_py/ingestion/_internal/ingest.py @@ -121,7 +121,7 @@ def ingest_async(self, *requests: IngestWithConfigDataStreamRequest): # Start thread on first ingest on the assumption all modifications to the ingestion config have concluded. if not self._ingestion_thread.is_alive(): self._ingestion_thread.start() - stream_requests(self._request_queue, *requests, run_id=str(self.run_id)) + stream_requests(self._request_queue, *requests) def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool: """ diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index f92ea3b19..9140bc543 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -3,7 +3,6 @@ import random import threading import time -from datetime import datetime, timedelta from queue import Queue from typing import List, Optional @@ -35,9 +34,9 @@ class IngestionThread(threading.Thread): Manages ingestion for a single ingestion config. """ - OUTER_LOOP_PERIOD = 0.1 # Time of intervals loop will sleep while waiting for data. + IDLE_LOOP_PERIOD = 0.1 # Time of intervals loop will sleep while waiting for data. SIFT_STREAM_FINISH_TIMEOUT = 0.06 # Measured ~0.05s to finish stream. 
-    CLEANUP_TIMEOUT = OUTER_LOOP_PERIOD + SIFT_STREAM_FINISH_TIMEOUT
+    CLEANUP_TIMEOUT = IDLE_LOOP_PERIOD + SIFT_STREAM_FINISH_TIMEOUT
 
     def __init__(
         self,
@@ -58,7 +57,7 @@ def __init__(
         self.data_queue = data_queue
         self._stop = threading.Event()
         self.sift_stream_builder = sift_stream_builder
-        self.metric_interval = timedelta(seconds=metric_interval)
+        self.metric_interval = metric_interval
 
     def stop(self):
         self._stop.set()
@@ -69,7 +68,7 @@ def stop(self):
     async def main(self):
         logger.debug("Ingestion thread started")
         sift_stream = await self.sift_stream_builder.build()
-        time_since_last_metric = datetime.now() - timedelta(seconds=1)
+        time_since_last_metric = time.time() - 1
         count = 0
         try:
             while True:
@@ -87,11 +86,11 @@ async def main(self):
                     continue
                 sift_stream = await sift_stream.send_requests(item)
                 count += 1
-                if datetime.now() - time_since_last_metric > self.metric_interval:
+                if time.time() - time_since_last_metric > self.metric_interval:
                     logger.debug(
                         f"Ingestion thread sent {count} requests, remaining: {self.data_queue.qsize()}"
                     )
-                    time_since_last_metric = datetime.now()
+                    time_since_last_metric = time.time()
 
                 if self._stop.is_set():
                     logger.debug(
@@ -100,7 +99,7 @@ async def main(self):
                     await sift_stream.finish()
                     return
                 else:
-                    time.sleep(self.OUTER_LOOP_PERIOD)
+                    time.sleep(self.IDLE_LOOP_PERIOD)
 
         except asyncio.CancelledError:
             # It's possible the thread was joined while sleeping waiting for data. Only note error if we have data left.
@@ -136,6 +135,7 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift
     if not uri or not apikey:
         raise ValueError(f"Channel config is missing uri or apikey: {channel.config}")
 
+    # SiftStreamBuilder needs URI to start with http or https
    if not uri.startswith("http"):
         if "localhost" in uri:
             uri = f"http://{uri}"
@@ -152,7 +152,7 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift
 
 
 async def stream_requests_async(
-    data_queue: Queue, *requests: IngestWithConfigDataStreamRequest, run_id: str = ""
+    data_queue: Queue, *requests: IngestWithConfigDataStreamRequest
 ):
     """
     Non-blocking: Convert requests for rust bindings and put them into a queue.
 
     Args:
         data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion.
         requests: List of IngestWithConfigDataStreamRequest protobuf objects
-        run_id: Optional run ID to associate with the requests
     """
     # Put each request individually into the queue, filtering out None values
     processed_requests = []
     for request in requests:
         if not isinstance(request, IngestWithConfigDataStreamRequest):
-            raise ValueError(f"Received unexpected request: {request} of type {type(request)}")
-        processed_request = ingest_request_to_ingest_request_py(request, run_id)
-        if processed_request is not None:
-            processed_requests.append(processed_request)
+            raise ValueError(f"Received unexpected request: {request} of type {type(request)}")
+        processed_requests.append(ingest_request_to_ingest_request_py(request))
 
     data_queue.put(processed_requests)
 
 
 def stream_requests(
     data_queue: Queue,
     *requests: IngestWithConfigDataStreamRequest,
-    run_id: str = "",
 ) -> None:
     """
     Blocking: Convert requests for rust bindings and put them into a queue.
@@ -185,9 +181,8 @@ def stream_requests(
         data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion.
         requests: List of IngestWithConfigDataStreamRequest protobuf objects
-        run_id: Optional run ID to associate with the requests
     """
-    asyncio.run(stream_requests_async(data_queue, *requests, run_id=run_id))
+    asyncio.run(stream_requests_async(data_queue, *requests))
 
 
 def telemetry_config_to_ingestion_config_py(

From eade3c37e96733b961a4fca8095ccc7423b36ea7 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Tue, 12 Aug 2025 19:29:27 -0700
Subject: [PATCH 19/34] lint

---
 python/lib/sift_py/ingestion/_internal/stream.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py
index 9140bc543..66e332c27 100644
--- a/python/lib/sift_py/ingestion/_internal/stream.py
+++ b/python/lib/sift_py/ingestion/_internal/stream.py
@@ -151,9 +151,7 @@ def get_builder(channel: SiftChannel, ingestion_config: TelemetryConfig) -> Sift
     return builder
 
 
-async def stream_requests_async(
-    data_queue: Queue, *requests: IngestWithConfigDataStreamRequest
-):
+async def stream_requests_async(data_queue: Queue, *requests: IngestWithConfigDataStreamRequest):
     """
     Non-blocking: Convert requests for rust bindings and put them into a queue.
 
@@ -166,7 +164,7 @@ async def stream_requests_async(
     processed_requests = []
     for request in requests:
         if not isinstance(request, IngestWithConfigDataStreamRequest):
-            raise ValueError(f"Received unexpected request: {request} of type {type(request)}")
+            raise ValueError(f"Received unexpected request: {request} of type {type(request)}")
         processed_requests.append(ingest_request_to_ingest_request_py(request))
 

From 324185d078c2b07d0086575f31f418dc23c15787 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Wed, 13 Aug 2025 17:05:03 -0700
Subject: [PATCH 20/34] Rename SiftChannelWithConfig instead of type alias.

---
 python/lib/sift_py/grpc/transport.py             |  7 +++----
 python/lib/sift_py/ingestion/_internal/stream.py | 14 +++++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/python/lib/sift_py/grpc/transport.py b/python/lib/sift_py/grpc/transport.py
index 02d244a6f..f7139caa6 100644
--- a/python/lib/sift_py/grpc/transport.py
+++ b/python/lib/sift_py/grpc/transport.py
@@ -22,7 +22,7 @@
 from sift_py.grpc.keepalive import DEFAULT_KEEPALIVE_CONFIG, KeepaliveConfig
 
 
-class SiftChannelWithConfig:
+class SiftChannel:
     """
     A wrapper around grpc.Channel that includes the configuration used to create it.
     This allows access to the original config for debugging or other purposes.
@@ -44,7 +44,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self._channel.close()
 
 
-SiftChannel: TypeAlias = SiftChannelWithConfig
 SiftAsyncChannel: TypeAlias = grpc_aio.Channel
 
 
@@ -91,7 +90,7 @@ def use_sift_channel(
 
     if not use_ssl:
         channel = _use_insecure_sift_channel(config, metadata)
-        return SiftChannelWithConfig(config, channel)
+        return SiftChannel(config, channel)
 
     credentials = get_ssl_credentials(cert_via_openssl)
     options = _compute_channel_options(config)
@@ -99,7 +98,7 @@ def use_sift_channel(
     channel = grpc.secure_channel(api_uri, credentials, options)
     interceptors = _compute_sift_interceptors(config, metadata)
     intercepted_channel = grpc.intercept_channel(channel, *interceptors)
-    return SiftChannelWithConfig(config, intercepted_channel)
+    return SiftChannel(config, intercepted_channel)
 
 
 def use_sift_async_channel(
diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py
index 66e332c27..1d5870fd3 100644
--- a/python/lib/sift_py/ingestion/_internal/stream.py
+++ b/python/lib/sift_py/ingestion/_internal/stream.py
@@ -6,7 +6,10 @@
 from queue import Queue
 from typing import List, Optional
 
-from sift.ingest.v1.ingest_pb2 import IngestWithConfigDataStreamRequest
+from sift.ingest.v1.ingest_pb2 import (
+    IngestWithConfigDataChannelValue,
+    IngestWithConfigDataStreamRequest,
+)
 from sift_stream_bindings import (
     ChannelBitFieldElementPy,
     ChannelConfigPy,
@@ -319,7 +322,7 @@ def get_run_form(
 
 
 def ingest_request_to_ingest_request_py(
-    request,
+    request: IngestWithConfigDataStreamRequest,
 ) -> IngestWithConfigDataStreamRequestPy:
     """
     Convert an IngestWithConfigDataStreamRequest to IngestWithConfigDataStreamRequestPy.
@@ -353,7 +356,9 @@ def ingest_request_to_ingest_request_py(
     )
 
 
-def convert_channel_value_to_channel_value_py(channel_value) -> IngestWithConfigDataChannelValuePy:
+def convert_channel_value_to_channel_value_py(
+    channel_value: IngestWithConfigDataChannelValue,
+) -> IngestWithConfigDataChannelValuePy:
     """
     Convert an IngestWithConfigDataChannelValue to IngestWithConfigDataChannelValuePy.
 
@@ -390,5 +395,4 @@ def convert_channel_value_to_channel_value_py(
         # For empty values, we'll return a default value
         return IngestWithConfigDataChannelValuePy.empty()
     else:
-        # No field set, return empty value
-        return IngestWithConfigDataChannelValuePy.empty()
+        raise ValueError(f"{channel_value} missing type field.")

From db38aa47a5221f75670d1a4e78ae5ba8e3d7b6de Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Wed, 13 Aug 2025 18:02:37 -0700
Subject: [PATCH 21/34] Ensure thread is alive before joining

---
 python/lib/sift_py/ingestion/_internal/ingest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py
index 44ec2f4c5..d5d4b7894 100644
--- a/python/lib/sift_py/ingestion/_internal/ingest.py
+++ b/python/lib/sift_py/ingestion/_internal/ingest.py
@@ -134,7 +134,8 @@ def wait_for_async_ingestion(self, timeout: Optional[float] = None) -> bool:
             bool: True if all threads completed within timeout, False otherwise.
         """
         self._request_queue.put(None)
-        self._ingestion_thread.join(timeout=timeout)
+        if self._ingestion_thread.is_alive():
+            self._ingestion_thread.join(timeout=timeout)
         if self._ingestion_thread.is_alive():
             logger.error(f"Ingestion thread did not finish after {timeout} seconds. Forcing stop.")
             self._ingestion_thread.stop()

From 63d53e2d516c2b9eb60c310f94e4adab6f1015f6 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 14 Aug 2025 10:57:09 -0700
Subject: [PATCH 22/34] add bytes type.

---
 python/lib/sift_py/ingestion/channel.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/lib/sift_py/ingestion/channel.py b/python/lib/sift_py/ingestion/channel.py
index c1ab71dc1..5a4366b17 100644
--- a/python/lib/sift_py/ingestion/channel.py
+++ b/python/lib/sift_py/ingestion/channel.py
@@ -209,6 +209,7 @@ class ChannelDataTypeStrRep(Enum):
     INT_64 = "int64"
     UINT_32 = "uint32"
     UINT_64 = "uint64"
+    BYTES = "bytes"
 
     @staticmethod
     def from_api_format(val: str) -> Optional["ChannelDataTypeStrRep"]:
@@ -224,6 +225,7 @@ def from_api_format(val: str) -> Optional["ChannelDataTypeStrRep"]:
                 "CHANNEL_DATA_TYPE_INT_64": ChannelDataTypeStrRep.INT_64,
                 "CHANNEL_DATA_TYPE_UINT_32": ChannelDataTypeStrRep.UINT_32,
                 "CHANNEL_DATA_TYPE_UINT_64": ChannelDataTypeStrRep.UINT_64,
+                "CHANNEL_DATA_TYPE_BYTES": ChannelDataTypeStrRep.BYTES,
             }[val]
         except KeyError:
             return None
@@ -244,6 +246,7 @@ class ChannelDataType(Enum):
     INT_64 = channel_pb.CHANNEL_DATA_TYPE_INT_64
     UINT_32 = channel_pb.CHANNEL_DATA_TYPE_UINT_32
     UINT_64 = channel_pb.CHANNEL_DATA_TYPE_UINT_64
+    BYTES = channel_pb.CHANNEL_DATA_TYPE_BYTES
 
     @classmethod
     def from_pb(cls, val: channel_pb.ChannelDataType.ValueType) -> "ChannelDataType":
@@ -267,6 +270,8 @@ def from_pb(cls, val: channel_pb.ChannelDataType.ValueType) -> "ChannelDataType"
             return cls.UINT_32
         elif val == cls.UINT_64.value:
             return cls.UINT_64
+        elif val == cls.BYTES.value:
+            return cls.BYTES
         else:
             raise ValueError(f"Unknown channel data type '{val}'.")
 
@@ -302,6 +307,8 @@ def from_str(cls, raw: str) -> Optional["ChannelDataType"]:
             return cls.UINT_32
         elif val == ChannelDataTypeStrRep.UINT_64:
             return cls.UINT_64
+        elif val == ChannelDataTypeStrRep.BYTES:
+            return cls.BYTES
         else:
             raise Exception("Unreachable")
 
@@ -334,6 +341,8 @@ def as_human_str(self, api_format: bool = False) -> str:
             return (
                 "CHANNEL_DATA_TYPE_UINT_64" if api_format else ChannelDataTypeStrRep.UINT_64.value
             )
+        elif self == ChannelDataType.BYTES:
+            return "CHANNEL_DATA_TYPE_BYTES" if api_format else ChannelDataTypeStrRep.BYTES.value
         else:
             raise Exception("Unreachable.")
 

From 27d4f9ce09189829c823298b7047748f6738342c Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 14 Aug 2025 11:50:02 -0700
Subject: [PATCH 23/34] mypy

---
 python/lib/sift_py/ingestion/_internal/run.py | 2 +-
 python/lib/sift_py/ingestion/channel.py       | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/lib/sift_py/ingestion/_internal/run.py b/python/lib/sift_py/ingestion/_internal/run.py
index df30bad39..46e20fbd1 100644
--- a/python/lib/sift_py/ingestion/_internal/run.py
+++ b/python/lib/sift_py/ingestion/_internal/run.py
@@ -52,6 +52,6 @@ def create_run(
     if run_client_key:
         kwargs["client_key"] = run_client_key
 
-    req = CreateRunRequest(**kwargs)
+    req = CreateRunRequest(**kwargs)  # type: ignore
     res = cast(CreateRunResponse, svc.CreateRun(req))
     return res.run.run_id
diff --git a/python/lib/sift_py/ingestion/channel.py b/python/lib/sift_py/ingestion/channel.py
index 5a4366b17..dc7cef89e 100644
--- a/python/lib/sift_py/ingestion/channel.py
+++ b/python/lib/sift_py/ingestion/channel.py
@@ -451,3 +451,6 @@ def is_data_type(val: IngestWithConfigDataChannelValue, target_type: ChannelData
         return val.HasField("uint32")
     elif target_type == ChannelDataType.UINT_64:
         return val.HasField("uint64")
+    elif target_type == ChannelDataType.BYTES:
+        return val.HasField("bytes")
+    raise ValueError(f"Unknown channel data type '{target_type}'.")

From e4437effeb26321c53b841ad10a3117afb38d90d Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 14 Aug 2025 13:56:03 -0700
Subject: [PATCH 24/34] Fix tests.

---
 python/lib/sift_py/_internal/test_util/channel.py | 4 ++++
 python/lib/sift_py/asset/_service_test.py         | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/lib/sift_py/_internal/test_util/channel.py b/python/lib/sift_py/_internal/test_util/channel.py
index a549c28c7..67935514f 100644
--- a/python/lib/sift_py/_internal/test_util/channel.py
+++ b/python/lib/sift_py/_internal/test_util/channel.py
@@ -6,6 +6,8 @@
 from grpc.aio import Channel as AsyncChannel
 from grpc_testing import Channel
 
+from sift_py.grpc.transport import SiftChannelConfig
+
 SerializingFunction = Callable[[Any], bytes]
 DeserializingFunction = Callable[[bytes], Any]
 DoneCallbackType = Callable[[Any], None]
@@ -18,6 +20,8 @@ class MockChannel(Channel):
     Used as a mock gRPC channel
     """
 
+    config = SiftChannelConfig(uri="localhost:50051", apikey="fake-api-key", use_ssl=False)
+
     def take_unary_unary(self, method_descriptor):
         pass
 
diff --git a/python/lib/sift_py/asset/_service_test.py b/python/lib/sift_py/asset/_service_test.py
index ecf03970e..c113db25a 100644
--- a/python/lib/sift_py/asset/_service_test.py
+++ b/python/lib/sift_py/asset/_service_test.py
@@ -2,6 +2,7 @@
 from unittest import TestCase
 from unittest.mock import MagicMock
 
+import grpc
 from sift.assets.v1.assets_pb2 import (
     Asset,
     GetAssetResponse,
@@ -12,7 +13,6 @@
 from sift_py._internal.metadata import metadata_dict_to_pb
 from sift_py.asset.config import AssetConfig
 from sift_py.asset.service import AssetService
-from sift_py.grpc.transport import SiftChannel
 
 
 class TestAssetService(TestCase):
@@ -23,7 +23,7 @@ class TestAssetService(TestCase):
     """
 
     def setUp(self):
-        self.channel = MagicMock(spec=SiftChannel)
+        self.channel = MagicMock(spec=grpc.Channel)
         self.service = AssetService(self.channel)
         self.asset_service_stub = self.service._asset_service_stub

From ba8c06cd5199aab370fd13987578992bab504551 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 14 Aug 2025 15:24:03 -0700
Subject: [PATCH 25/34] Deconflict _stop event w/ Threading internal _stop function.
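
Background for this rename: on CPython, threading.Thread defines an internal
_stop() method that join() invokes via _wait_for_tstate_lock(), so assigning
self._stop = threading.Event() shadows that method and join() can fail once
the thread finishes. A rough reproduction sketch (hypothetical standalone
script, not part of this diff; exact error text may vary by Python version):

    import threading

    class Worker(threading.Thread):
        def __init__(self):
            super().__init__(daemon=True)
            self._stop = threading.Event()  # shadows Thread._stop() on CPython

        def run(self):
            pass  # finish immediately

    w = Worker()
    w.start()
    w.join()  # can raise TypeError: 'Event' object is not callable

Renaming the attribute to _stop_event sidesteps the collision.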
--- python/examples/ingestion_with_python_config/main.py | 2 +- .../examples/ingestion_with_python_config/simulator.py | 2 +- python/lib/sift_py/ingestion/_internal/stream.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/examples/ingestion_with_python_config/main.py b/python/examples/ingestion_with_python_config/main.py index a27d288f9..568e84868 100644 --- a/python/examples/ingestion_with_python_config/main.py +++ b/python/examples/ingestion_with_python_config/main.py @@ -29,7 +29,7 @@ telemetry_config = nostromos_lv_426() # Create a gRPC transport channel configured specifically for the Sift API - sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey) + sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey, use_ssl=False) with use_sift_channel(sift_channel_config) as channel: # Create ingestion service using the telemetry config we loaded in diff --git a/python/examples/ingestion_with_python_config/simulator.py b/python/examples/ingestion_with_python_config/simulator.py index 46fd0a8ae..5671f7695 100644 --- a/python/examples/ingestion_with_python_config/simulator.py +++ b/python/examples/ingestion_with_python_config/simulator.py @@ -68,7 +68,7 @@ def run(self): logs_interval_s = 1 / LOGS_FREQUENCY_HZ partial_readings_with_log_interval_s = 1 / PARTIAL_READINGS_WITH_LOG_FREQUENCY_HZ - with self.ingestion_service.buffered_ingestion() as buffered_ingestion: + with self.ingestion_service.buffered_ingestion(buffer_size=10) as buffered_ingestion: while time.time() < end_time: current_time = time.time() diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 1d5870fd3..fb2e7ad20 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -58,12 +58,12 @@ def __init__( """ super().__init__(daemon=True) self.data_queue = data_queue - self._stop = threading.Event() + self._stop_event = threading.Event() self.sift_stream_builder = sift_stream_builder self.metric_interval = metric_interval def stop(self): - self._stop.set() + self._stop_event.set() # Give a brief chance to finish the stream (should take < 50ms). time.sleep(self.CLEANUP_TIMEOUT) self.task.cancel() @@ -76,7 +76,7 @@ async def main(self): try: while True: while not self.data_queue.empty(): - if self._stop.is_set(): + if self._stop_event.is_set(): # Being forced to stop. Try to finish the stream. logger.info( f"Ingestion thread received stop signal. Exiting. Sent {count} requests. {self.data_queue.qsize()} requests remaining." @@ -85,7 +85,7 @@ async def main(self): return item = self.data_queue.get() if item is None: - self._stop.set() + self._stop_event.set() continue sift_stream = await sift_stream.send_requests(item) count += 1 @@ -95,7 +95,7 @@ async def main(self): ) time_since_last_metric = time.time() - if self._stop.is_set(): + if self._stop_event.is_set(): logger.debug( f"No more requests. Stopping. Sent {count} requests. {self.data_queue.qsize()} requests remaining." ) From 63263fa32ec38ae56983598b938f8ce604109b4d Mon Sep 17 00:00:00 2001 From: Ian Later Date: Mon, 18 Aug 2025 18:16:51 -0700 Subject: [PATCH 26/34] re-checkout examples from main. use uuid for client key generation. 
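
Note on the client-key fallback: f"random_key_{random.randint(1000, 9999)}"
draws from only 9000 possible values, so by a rough birthday estimate a
collision is expected after about sqrt(pi / 2 * 9000) ≈ 120 generated runs,
whereas uuid4 carries 122 bits of randomness, making accidental reuse
practically impossible. The replacement fallback, as in the diff below:

    import uuid

    client_key = client_key or str(uuid.uuid4())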
---
 python/examples/ingestion_with_python_config/main.py      | 2 +-
 python/examples/ingestion_with_python_config/simulator.py | 2 +-
 python/lib/sift_py/ingestion/_internal/stream.py          | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/examples/ingestion_with_python_config/main.py b/python/examples/ingestion_with_python_config/main.py
index 568e84868..a27d288f9 100644
--- a/python/examples/ingestion_with_python_config/main.py
+++ b/python/examples/ingestion_with_python_config/main.py
@@ -29,7 +29,7 @@
     telemetry_config = nostromos_lv_426()
 
     # Create a gRPC transport channel configured specifically for the Sift API
-    sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey, use_ssl=False)
+    sift_channel_config = SiftChannelConfig(uri=base_uri, apikey=apikey)
     with use_sift_channel(sift_channel_config) as channel:
         # Create ingestion service using the telemetry config we loaded in
diff --git a/python/examples/ingestion_with_python_config/simulator.py b/python/examples/ingestion_with_python_config/simulator.py
index 5671f7695..46fd0a8ae 100644
--- a/python/examples/ingestion_with_python_config/simulator.py
+++ b/python/examples/ingestion_with_python_config/simulator.py
@@ -68,7 +68,7 @@ def run(self):
         logs_interval_s = 1 / LOGS_FREQUENCY_HZ
         partial_readings_with_log_interval_s = 1 / PARTIAL_READINGS_WITH_LOG_FREQUENCY_HZ
 
-        with self.ingestion_service.buffered_ingestion(buffer_size=10) as buffered_ingestion:
+        with self.ingestion_service.buffered_ingestion() as buffered_ingestion:
             while time.time() < end_time:
                 current_time = time.time()
 
diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py
index fb2e7ad20..9b97ed8a8 100644
--- a/python/lib/sift_py/ingestion/_internal/stream.py
+++ b/python/lib/sift_py/ingestion/_internal/stream.py
@@ -1,8 +1,8 @@
 import asyncio
 import logging
-import random
 import threading
 import time
+import uuid
 from queue import Queue
 from typing import List, Optional
 
@@ -316,7 +316,7 @@ def get_run_form(
     return RunFormPy(
         name=run_name,
         description=run_description,
-        client_key=client_key or f"random_key_{str(random.randint(1000, 9999))}",
+        client_key=client_key or str(uuid.uuid4()),
         tags=run_tags,
     )

From e835275296b55a9780f51177299a7d8a5cd64fbb Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Tue, 19 Aug 2025 13:56:31 -0700
Subject: [PATCH 27/34] Release candidate version

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index b1adac5b0..6860f897f 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sift_stack_py"
-version = "0.8.3"
+version = "0.8.4rc"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From 37bbe0947f1801b09efbfd32e1db85ce3225fa3c Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Tue, 19 Aug 2025 13:58:58 -0700
Subject: [PATCH 28/34] Release candidate version correction

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6860f897f..f1abbaf3a 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sift_stack_py"
-version = "0.8.4rc"
+version = "0.8.4-rc.1"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From 6227fbe833a2e58c89298878963caa7d9768af60 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Tue, 19 Aug 2025 14:01:37 -0700
Subject: [PATCH 29/34] Release candidate version correction

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index f1abbaf3a..4f6af8f26 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sift_stack_py"
-version = "0.8.4-rc.1"
+version = "0.8.5-rc.1"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From 0814bd67a3fcd1d5ad04a8456ab6f45d79d39afd Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 21 Aug 2025 15:01:39 -0700
Subject: [PATCH 30/34] Dead threads must be reassigned.

---
 python/lib/sift_py/ingestion/_internal/ingest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/lib/sift_py/ingestion/_internal/ingest.py b/python/lib/sift_py/ingestion/_internal/ingest.py
index d5d4b7894..562660576 100644
--- a/python/lib/sift_py/ingestion/_internal/ingest.py
+++ b/python/lib/sift_py/ingestion/_internal/ingest.py
@@ -120,6 +120,7 @@ def ingest_async(self, *requests: IngestWithConfigDataStreamRequest):
         # FD-179: Create a thread pool and add to whichever queue is smallest
         # Start thread on first ingest on the assumption all modifications to the ingestion config have concluded.
         if not self._ingestion_thread.is_alive():
+            self._ingestion_thread = IngestionThread(self.builder, self._request_queue)
             self._ingestion_thread.start()
         stream_requests(self._request_queue, *requests)

From 25d61272b607d3d810540be5bb53240ae5509aea Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Thu, 21 Aug 2025 15:48:23 -0700
Subject: [PATCH 31/34] toml rev

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 4f6af8f26..a5ff86d0d 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sift_stack_py"
-version = "0.8.5-rc.1"
+version = "0.8.5-rc.2"
 description = "Python client library for the Sift API"
 requires-python = ">=3.8"
 readme = { file = "README.md", content-type = "text/markdown" }

From 872df8e2fbbb8ae925de8cc05e0b6853155e3473 Mon Sep 17 00:00:00 2001
From: Ian Later
Date: Fri, 22 Aug 2025 18:05:44 -0700
Subject: [PATCH 32/34] Check for existing asyncio loop.

---
 python/examples/ingestion_with_threading/main.py | 2 +-
 python/lib/sift_py/ingestion/_internal/stream.py | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/examples/ingestion_with_threading/main.py b/python/examples/ingestion_with_threading/main.py
index 3fc808dbf..5af284916 100644
--- a/python/examples/ingestion_with_threading/main.py
+++ b/python/examples/ingestion_with_threading/main.py
@@ -16,7 +16,7 @@ def ingestion_thread(data_queue: Queue):
     it to Sift.
""" # Can tune ingestion performance with buffer_size and flush_interval_sec - with ingestion_service.buffered_ingestion() as buffered_ingestion: + with ingestion_service.buffered_ingestion(buffer_size=200, flush_interval_sec=1) as buffered_ingestion: while True: try: item = data_queue.get(timeout=1) diff --git a/python/lib/sift_py/ingestion/_internal/stream.py b/python/lib/sift_py/ingestion/_internal/stream.py index 9b97ed8a8..4b74fa541 100644 --- a/python/lib/sift_py/ingestion/_internal/stream.py +++ b/python/lib/sift_py/ingestion/_internal/stream.py @@ -102,7 +102,7 @@ async def main(self): await sift_stream.finish() return else: - time.sleep(self.IDLE_LOOP_PERIOD) + await asyncio.sleep(self.IDLE_LOOP_PERIOD) except asyncio.CancelledError: # It's possible the thread was joined while sleeping waiting for data. Only note error if we have data left. @@ -183,7 +183,12 @@ def stream_requests( data_queue: The queue to put IngestWithConfigDataStreamRequestPy requests into for ingestion. requests: List of IngestWithConfigDataStreamRequest protobuf objects """ - asyncio.run(stream_requests_async(data_queue, *requests)) + try: + loop = asyncio.get_running_loop() + loop.run_until_complete(stream_requests_async(data_queue, *requests)) + except RuntimeError: + # No running loop, start new loop + asyncio.run(stream_requests_async(data_queue, *requests)) def telemetry_config_to_ingestion_config_py( From 8dcb1f1c79a5630be159fb353597bad2b5160f63 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Fri, 22 Aug 2025 18:05:54 -0700 Subject: [PATCH 33/34] asdf --- python/examples/ingestion_with_threading/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/examples/ingestion_with_threading/main.py b/python/examples/ingestion_with_threading/main.py index 5af284916..87a39ab98 100644 --- a/python/examples/ingestion_with_threading/main.py +++ b/python/examples/ingestion_with_threading/main.py @@ -16,7 +16,9 @@ def ingestion_thread(data_queue: Queue): it to Sift. """ # Can tune ingestion performance with buffer_size and flush_interval_sec - with ingestion_service.buffered_ingestion(buffer_size=200, flush_interval_sec=1) as buffered_ingestion: + with ingestion_service.buffered_ingestion( + buffer_size=200, flush_interval_sec=1 + ) as buffered_ingestion: while True: try: item = data_queue.get(timeout=1) From 9a75eeaeeedecc9de014ca1b58cd95048092099b Mon Sep 17 00:00:00 2001 From: Ian Later Date: Tue, 9 Sep 2025 13:35:42 -0700 Subject: [PATCH 34/34] Rev version. --- python/CHANGELOG.md | 9 +++++++++ python/pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index f0dff305e..e0445e7dd 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [v0.8.6-rc.1] - September 9, 2025 +### What's New +Update ingestion to use compiled Rust binary under the hood for performance improvements. + +## [v0.8.5] - August 31, 2025 +### What's New +#### Bytes support +Add plumbing to allow specifying bytes type data for ingestion. 
+ ## [v0.8.3] - August 11, 2025 - [Fix windows utf-8 encoding bug with Hdf5UploadService](https://github.com/sift-stack/sift/pull/289) diff --git a/python/pyproject.toml b/python/pyproject.toml index a5ff86d0d..4f4a813e7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "sift_stack_py" -version = "0.8.5-rc.2" +version = "0.8.6-rc.1" description = "Python client library for the Sift API" requires-python = ">=3.8" readme = { file = "README.md", content-type = "text/markdown" }
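
Series addendum: a minimal sketch of the async ingestion path this series
wires up, for anyone trying the release candidate. The public import path for
IngestionService and the telemetry_config/requests values are assumptions for
illustration; the channel setup and the ingest_async/wait_for_async_ingestion
calls match the diffs above.

    import os

    from sift_py.grpc.transport import SiftChannelConfig, use_sift_channel
    from sift_py.ingestion.service import IngestionService  # assumed public path

    config = SiftChannelConfig(uri=os.environ["BASE_URI"], apikey=os.environ["SIFT_API_KEY"])
    with use_sift_channel(config) as channel:
        # channel is the SiftChannel wrapper, so channel.config can feed SiftStreamBuilderPy
        service = IngestionService(channel, telemetry_config)  # telemetry_config assumed defined
        service.ingest_async(*requests)  # first call spawns the IngestionThread
        service.wait_for_async_ingestion(timeout=30)  # enqueues the sentinel and joins the thread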