Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,23 @@ All notable changes to this project will be documented in this file.
- Support configuring JVM arguments ([#677]).
- Aggregate emitted Kubernetes events on the CustomResources ([#677]).

## Changed
### Changed

- Increased the default temporary secret lifetime for coordinators from 1 day to 15 days.
This is because Trino currently does not offer a HA setup for them, a restart kills all running queries ([#694]).
- Default to OCI for image metadata and product image selection ([#695]).

### Fixed

- Add a startupProbe, which checks via `/v1/info` that the coordinator/worker have finished starting.
Also migrate the other probes from `tcpSocket` to `httpGet` on `/v1/info` ([#715]).

[#676]: https://github.com/stackabletech/trino-operator/pull/676
[#677]: https://github.com/stackabletech/trino-operator/pull/677
[#687]: https://github.com/stackabletech/trino-operator/pull/687
[#694]: https://github.com/stackabletech/trino-operator/pull/694
[#695]: https://github.com/stackabletech/trino-operator/pull/695
[#715]: https://github.com/stackabletech/trino-operator/pull/715

## [24.11.1] - 2025-01-10

Expand Down
88 changes: 62 additions & 26 deletions rust/operator-binary/src/controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ use stackable_operator::{
api::{
apps::v1::{StatefulSet, StatefulSetSpec},
core::v1::{
ConfigMap, ConfigMapVolumeSource, ContainerPort, EnvVar, EnvVarSource, Probe,
Secret, SecretKeySelector, Service, ServicePort, ServiceSpec, TCPSocketAction,
ConfigMap, ConfigMapVolumeSource, ContainerPort, EnvVar, EnvVarSource, ExecAction,
HTTPGetAction, Probe, Secret, SecretKeySelector, Service, ServicePort, ServiceSpec,
Volume,
},
},
Expand Down Expand Up @@ -1061,6 +1061,8 @@ fn build_rolegroup_statefulset(
.context(AddVolumeMountSnafu)?
.add_container_ports(container_ports(trino))
.resources(merged_config.resources.clone().into())
// The probes are set on coordinators and workers
.startup_probe(startup_probe(trino))
.readiness_probe(readiness_probe(trino))
.liveness_probe(liveness_probe(trino))
.build();
Expand Down Expand Up @@ -1484,40 +1486,74 @@ fn container_ports(trino: &v1alpha1::TrinoCluster) -> Vec<ContainerPort> {
ports
}

fn readiness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
let port_name = if trino.expose_https_port() {
HTTPS_PORT_NAME
} else {
HTTP_PORT_NAME
};
fn startup_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
Probe {
exec: Some(finished_starting_probe(trino)),
period_seconds: Some(5),
// Give the coordinator or worker 10 minutes to start up
failure_threshold: Some(120),
timeout_seconds: Some(3),
..Default::default()
}
}

fn readiness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
Probe {
initial_delay_seconds: Some(10),
period_seconds: Some(10),
failure_threshold: Some(5),
tcp_socket: Some(TCPSocketAction {
port: IntOrString::String(port_name.to_string()),
..TCPSocketAction::default()
}),
http_get: Some(http_get_probe(trino)),
period_seconds: Some(5),
failure_threshold: Some(1),
timeout_seconds: Some(3),
..Probe::default()
}
}

fn liveness_probe(trino: &v1alpha1::TrinoCluster) -> Probe {
let port_name = if trino.expose_https_port() {
HTTPS_PORT_NAME
Probe {
http_get: Some(http_get_probe(trino)),
period_seconds: Some(5),
// Coordinators are currently not highly available, so you always have a singe instance.
// Restarting it causes all queries to fail, so let's not restart it directly after the first
// probe failure, but wait for 3 failures
// NOTE: This also applies to workers
failure_threshold: Some(3),
timeout_seconds: Some(3),
..Probe::default()
}
}

/// Check that `/v1/info` returns `200`.
///
/// This is the same probe as the [upstream helm-chart](https://github.com/trinodb/charts/blob/7cd0a7bff6c52e0ee6ca6d5394cd72c150ad4379/charts/trino/templates/deployment-coordinator.yaml#L214)
/// is using.
fn http_get_probe(trino: &v1alpha1::TrinoCluster) -> HTTPGetAction {
let (schema, port_name) = if trino.expose_https_port() {
("HTTPS", HTTPS_PORT_NAME)
} else {
HTTP_PORT_NAME
("HTTP", HTTP_PORT_NAME)
};

Probe {
initial_delay_seconds: Some(30),
period_seconds: Some(10),
tcp_socket: Some(TCPSocketAction {
port: IntOrString::String(port_name.to_string()),
..TCPSocketAction::default()
}),
..Probe::default()
HTTPGetAction {
port: IntOrString::String(port_name.to_string()),
scheme: Some(schema.to_string()),
path: Some("/v1/info".to_string()),
..Default::default()
}
}

/// Wait until `/v1/info` returns `"starting":false`.
///
/// This probe works on coordinators and workers.
fn finished_starting_probe(trino: &v1alpha1::TrinoCluster) -> ExecAction {
let port = trino.exposed_port();
ExecAction {
command: Some(vec![
"/bin/bash".to_string(),
"-x".to_string(),
"-euo".to_string(),
"pipefail".to_string(),
"-c".to_string(),
format!("curl --fail --insecure https://127.0.0.1:{port}/v1/info | grep --silent '\\\"starting\\\":false'"),
]),
}
}

Expand Down
2 changes: 1 addition & 1 deletion rust/operator-binary/src/crd/catalog/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ pub mod versioned {
pub struct TrinoCatalogSpec {
/// The `connector` defines which connector is used.
pub connector: TrinoCatalogConnector,
#[serde(default)]

#[serde(default)]
/// The `configOverrides` allow overriding arbitrary Trino settings.
/// For example, for Hive you could add `hive.metastore.username: trino`.
pub config_overrides: HashMap<String, String>,
Expand Down