Ingester: err="rpc error: code = Unavailable desc = Starting" msg=gRPC #5101
I'm deploying Cortex using the Helm chart with the following chart values:

alertmanager:
  enabled: false
clusterDomain: cortex.example.com
config:
  auth_enabled: true
  blocks_storage:
    backend: s3
    s3:
      access_key_id: <redacted>
      bucket_name: <redacted>
      endpoint: <redacted>
      region: <redacted>
      secret_access_key: <redacted>
nginx:
  enabled: false
ruler:
  enabled: false
tags:
  blocks-storage-memcached: true
compactor:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "1Gi"
      cpu: "1000m"
distributor:
  replicas: 3
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "4Gi"
      cpu: "2000m"
  extraArgs:
    "-log.level": "debug"
    "-distributor.ingestion-rate-limit": "500000000"
    # To support `-ingester.max-global-series-per-user` flag
    # "-distributor.shard-by-all-labels": "true"
ingester:
  replicas: 3
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "6Gi"
      cpu: "2000m"
  extraArgs:
    "-log.level": "debug"
    # "-ingester.max-series-per-user": "500000000" 
    # "-ingester.max-global-series-per-user": "500000000"
    "-ingester.ignore-series-limit-for-metric-names": "node_network_transmit_errs_total,apiserver_request_duration_seconds_bucket,container_blkio_device_usage_total,container_tasks_state,node_network_address_assign_type,node_network_carrier,node_network_carrier_changes_total,node_network_carrier_down_changes_total,node_network_carrier_up_changes_total,node_network_device_id,node_network_dormant,node_network_flags,node_network_iface_id,node_network_iface_link,node_network_iface_link_mode,node_network_info,node_network_mtu_bytes,node_network_name_assign_type,node_network_net_dev_group,node_network_protocol_type,node_network_receive_bytes_total,node_network_receive_compressed_total,node_network_receive_drop_total,node_network_receive_errs_total,node_network_receive_fifo_total,node_network_receive_frame_total,node_network_receive_multicast_total,node_network_receive_packets_total,node_network_transmit_bytes_total,node_network_transmit_carrier_total,node_network_transmit_colls_total,node_network_transmit_compressed_total,node_network_transmit_drop_total,node_network_transmit_fifo_total,node_network_transmit_packets_total,node_network_transmit_queue_length,node_network_up,root_ca_cert_publisher_sync_duration_seconds_bucket"
  # statefulSet:
  #   enabled: true
  # persistentVolume:
  #   enabled: true
  #   size: 100Gi
  #   storageClass: ms-xfs-2-replicas
memcached-blocks:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "128Mi"
      cpu: "500m"
memcached-blocks-index:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "128Mi"
      cpu: "500m"
memcached-blocks-metadata:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "128Mi"
      cpu: "500m"
querier:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "128Mi"
      cpu: "500m"
query_frontend:
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "128Mi"
      cpu: "500m"
store_gateway:
  replicas: 2
  resources:
    requests:
      memory: "64Mi"
      cpu: "250m"
    limits:
      memory: "1Gi"
      cpu: "500m"
runtimeconfigmap:
  runtime_config:
    overrides:
      abc-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000 
      def-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000 
      xyz-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000

This Cortex is responsible for storing the metrics of all my other clusters: every external cluster/tenant does a remote write and sends its metrics to this Cortex. The Ingress is up and running with no issues. The external clusters/tenants (…) ...which I think leads to the problem that I'm seeing right now. The Ingesters and Distributors are not happy, and I'm out of ideas about what else I can try to make them happy. I appreciate and welcome all tips/ideas from the community here.

Ingester logs:

Distributor logs:

Prometheus logs from one of the tenants (…):

Any ideas?
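For context, a tenant-side Prometheus remote write against a multi-tenant Cortex (auth_enabled: true) would look roughly like this sketch; the URL is a placeholder and the tenant ID matches one of the overrides above:

remote_write:
  - url: https://cortex.example.com/api/v1/push   # placeholder endpoint
    headers:
      # Cortex tenant ID; must match the tenant name used in the runtime overrides
      X-Scope-OrgID: abc-cluster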
Are the ingesters in a crash loop? They should stay in Starting only while replaying the WAL.
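With blocks storage the ingester keeps its WAL under the TSDB directory, so there is only a WAL to replay on startup if that path survives restarts. A minimal sketch, assuming /data/tsdb as the data path and reusing the statefulSet/persistentVolume keys from the values above:

config:
  blocks_storage:
    tsdb:
      dir: /data/tsdb        # assumed path; the WAL lives under this directory
ingester:
  statefulSet:
    enabled: true            # run ingesters as a StatefulSet
  persistentVolume:
    enabled: true            # keep the TSDB/WAL across restarts
    size: 100Gi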
Configure your ingester resources with requests equal to limits (see the sketch below): if you specify lower requests than limits, you can run into resource issues. This is standard for all pods in Kubernetes.
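Something along these lines, with illustrative values and requests matching limits:

ingester:
  resources:
    requests:
      memory: "35Gi"   # same value as the limit
      cpu: "4000m"
    limits:
      memory: "35Gi"
      cpu: "4000m"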
I'm changing the Ingester from a Deployment to a StatefulSet (as per @alanprot's suggestion) with a 100GB volume each. I also increased the requests and limits, making them the same values (as per @friedrichg's suggestion). My Helm values now look like this:

alertmanager:
  enabled: false
clusterDomain: cortex.example.com
config:
  auth_enabled: true
  blocks_storage:
    backend: s3
    s3:
      access_key_id: <redacted>
      bucket_name: <redacted>
      endpoint: <redacted>
      region: <redacted>
      secret_access_key: <redacted>
nginx:
  enabled: false
ruler:
  enabled: false
tags:
  blocks-storage-memcached: true
compactor:
  resources:
    requests:
      memory: "1Gi"
      cpu: "1000m"
    limits:
      memory: "1Gi"
      cpu: "1000m"
distributor:
  replicas: 3
  resources:
    requests:
      memory: "4Gi"
      cpu: "2000m"
    limits:
      memory: "4Gi"
      cpu: "2000m"
  extraArgs:
    "-log.level": "debug"
    "-distributor.ingestion-rate-limit": "500000000"
    # To support `-ingester.max-global-series-per-user` flag
    # "-distributor.shard-by-all-labels": "true"
ingester:
  replicas: 3
  resources:
    requests:
      memory: "35Gi"
      cpu: "4000m"
    limits:
      memory: "35Gi"
      cpu: "4000m"
  extraArgs:
    "-log.level": "debug"
    # "-ingester.max-series-per-user": "500000000" 
    # "-ingester.max-global-series-per-user": "500000000"
    "-ingester.ignore-series-limit-for-metric-names": "node_network_transmit_errs_total,apiserver_request_duration_seconds_bucket,container_blkio_device_usage_total,container_tasks_state,node_network_address_assign_type,node_network_carrier,node_network_carrier_changes_total,node_network_carrier_down_changes_total,node_network_carrier_up_changes_total,node_network_device_id,node_network_dormant,node_network_flags,node_network_iface_id,node_network_iface_link,node_network_iface_link_mode,node_network_info,node_network_mtu_bytes,node_network_name_assign_type,node_network_net_dev_group,node_network_protocol_type,node_network_receive_bytes_total,node_network_receive_compressed_total,node_network_receive_drop_total,node_network_receive_errs_total,node_network_receive_fifo_total,node_network_receive_frame_total,node_network_receive_multicast_total,node_network_receive_packets_total,node_network_transmit_bytes_total,node_network_transmit_carrier_total,node_network_transmit_colls_total,node_network_transmit_compressed_total,node_network_transmit_drop_total,node_network_transmit_fifo_total,node_network_transmit_packets_total,node_network_transmit_queue_length,node_network_up,root_ca_cert_publisher_sync_duration_seconds_bucket"
  statefulSet:
    enabled: true
  persistentVolume:
    enabled: true
    size: 100Gi
    storageClass: ms-xfs-2-replicas
memcached-blocks:
  resources:
    requests:
      memory: "128Mi"
      cpu: "500m"
    limits:
      memory: "128Mi"
      cpu: "500m"
memcached-blocks-index:
  resources:
    requests:
      memory: "128Mi"
      cpu: "500m"
    limits:
      memory: "128Mi"
      cpu: "500m"
memcached-blocks-metadata:
  resources:
    requests:
      memory: "128Mi"
      cpu: "500m"
    limits:
      memory: "128Mi"
      cpu: "500m"
querier:
  resources:
    requests:
      memory: "128Mi"
      cpu: "500m"
    limits:
      memory: "128Mi"
      cpu: "500m"
query_frontend:
  resources:
    requests:
      memory: "128Mi"
      cpu: "500m"
    limits:
      memory: "128Mi"
      cpu: "500m"
store_gateway:
  replicas: 2
  resources:
    requests:
      memory: "1Gi"
      cpu: "500m"
    limits:
      memory: "1Gi"
      cpu: "500m"
runtimeconfigmap:
  runtime_config:
    overrides:
      abc-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000 
      def-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000 
      xyz-cluster:
        max_series_per_metric: 500000000
        max_global_series_per_metric: 500000000
        max_series_per_user: 500000000 
        max_global_series_per_user: 500000000 
        max_label_names_per_series: 5000

While I don't see … Any idea?