diff --git a/go.mod b/go.mod index e25daf7d88..5452e1f3cc 100644 --- a/go.mod +++ b/go.mod @@ -171,6 +171,9 @@ require ( github.com/stretchr/objx v0.5.3 // indirect github.com/tonistiigi/dchapes-mode v0.0.0-20250318174251-73d941a28323 // indirect github.com/tonistiigi/go-csvvalue v0.0.0-20240814133006-030d3b2625d0 // indirect + github.com/twmb/franz-go v1.20.6 // indirect + github.com/twmb/franz-go/pkg/kadm v1.17.2 // indirect + github.com/twmb/franz-go/pkg/kmsg v1.12.0 // indirect github.com/wadey/gocovmerge v0.0.0-20160331181800-b5bfa59ec0ad // indirect github.com/wlynxg/anet v0.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect diff --git a/go.sum b/go.sum index e229facc8d..e32cf02207 100644 --- a/go.sum +++ b/go.sum @@ -1025,6 +1025,12 @@ github.com/tonistiigi/units v0.0.0-20180711220420-6950e57a87ea h1:SXhTLE6pb6eld/ github.com/tonistiigi/units v0.0.0-20180711220420-6950e57a87ea/go.mod h1:WPnis/6cRcDZSUvVmezrxJPkiO87ThFYsoUiMwWNDJk= github.com/tonistiigi/vt100 v0.0.0-20240514184818-90bafcd6abab h1:H6aJ0yKQ0gF49Qb2z5hI1UHxSQt4JMyxebFR15KnApw= github.com/tonistiigi/vt100 v0.0.0-20240514184818-90bafcd6abab/go.mod h1:ulncasL3N9uLrVann0m+CDlJKWsIAP34MPcOJF6VRvc= +github.com/twmb/franz-go v1.20.6 h1:TpQTt4QcixJ1cHEmQGPOERvTzo99s8jAutmS7rbSD6w= +github.com/twmb/franz-go v1.20.6/go.mod h1:u+FzH2sInp7b9HNVv2cZN8AxdXy6y/AQ1Bkptu4c0FM= +github.com/twmb/franz-go/pkg/kadm v1.17.2 h1:g5f1sAxnTkYC6G96pV5u715HWhxd66hWaDZUAQ8xHY8= +github.com/twmb/franz-go/pkg/kadm v1.17.2/go.mod h1:ST55zUB+sUS+0y+GcKY/Tf1XxgVilaFpB9I19UubLmU= +github.com/twmb/franz-go/pkg/kmsg v1.12.0 h1:CbatD7ers1KzDNgJqPbKOq0Bz/WLBdsTH75wgzeVaPc= +github.com/twmb/franz-go/pkg/kmsg v1.12.0/go.mod h1:+DPt4NC8RmI6hqb8G09+3giKObE6uD2Eya6CfqBpeJY= github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= diff --git a/services/blockvalidation/Server_test.go b/services/blockvalidation/Server_test.go index 671e3f6cb3..b3051d9275 100644 --- a/services/blockvalidation/Server_test.go +++ b/services/blockvalidation/Server_test.go @@ -25,7 +25,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/go-bt/v2" "github.com/bsv-blockchain/go-bt/v2/chainhash" "github.com/bsv-blockchain/go-chaincfg" @@ -1616,9 +1615,7 @@ func Test_consumerMessageHandler(t *testing.T) { require.NoError(t, err) msg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } handler := server.consumerMessageHandler(ctx) @@ -1660,9 +1657,7 @@ func Test_consumerMessageHandler(t *testing.T) { // Invalid message that will cause a parsing error msg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: []byte("invalid protobuf"), - }, + Value: []byte("invalid protobuf"), } handler := server.consumerMessageHandler(ctx) @@ -1707,9 +1702,7 @@ func Test_consumerMessageHandler(t *testing.T) { require.NoError(t, err) msg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } handler := server.consumerMessageHandler(ctx) diff --git a/services/blockvalidation/integration_retry_test.go b/services/blockvalidation/integration_retry_test.go index be5d72fc8b..b9355eddf1 100644 --- a/services/blockvalidation/integration_retry_test.go +++ b/services/blockvalidation/integration_retry_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/go-bt/v2/chainhash" "github.com/bsv-blockchain/teranode/errors" "github.com/bsv-blockchain/teranode/services/blockchain" @@ -151,9 +150,7 @@ func TestIntegrationRetryWithMultipleFailures(t *testing.T) { require.NoError(t, err) kafkaMessage := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } handler := server.consumerMessageHandler(ctx) diff --git a/services/blockvalidation/malicious_peer_handling_test.go b/services/blockvalidation/malicious_peer_handling_test.go index 04e8a5c4d5..9423fbb60c 100644 --- a/services/blockvalidation/malicious_peer_handling_test.go +++ b/services/blockvalidation/malicious_peer_handling_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/go-bt/v2/chainhash" "github.com/bsv-blockchain/teranode/errors" "github.com/bsv-blockchain/teranode/model" @@ -185,9 +184,7 @@ func TestKafkaConsumerMessageHandling(t *testing.T) { require.NoError(t, err) kafkaMessage := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } // Get consumer handler @@ -219,9 +216,7 @@ func TestKafkaConsumerMessageHandling(t *testing.T) { require.NoError(t, err) kafkaMessage := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } handler := server.consumerMessageHandler(ctx) @@ -259,9 +254,7 @@ func TestKafkaConsumerMessageHandling(t *testing.T) { require.NoError(t, err) kafkaMessage := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Value: msgBytes, - }, + Value: msgBytes, } handler := server.consumerMessageHandler(ctx) diff --git a/services/p2p/Server_test.go b/services/p2p/Server_test.go index 14952a0ff9..7808e7f66a 100644 --- a/services/p2p/Server_test.go +++ b/services/p2p/Server_test.go @@ -18,7 +18,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/go-bt/v2/chainhash" "github.com/bsv-blockchain/go-chaincfg" p2pMessageBus "github.com/bsv-blockchain/go-p2p-message-bus" @@ -2194,10 +2193,8 @@ func TestInvalidSubtreeHandlerHappyPath(t *testing.T) { require.NoError(t, err) msg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-subtrees", - Value: payload, - }, + Topic: "invalid-subtrees", + Value: payload, } h := s.invalidSubtreeHandler(context.Background()) @@ -2233,10 +2230,8 @@ func TestInvalidBlockHandler(t *testing.T) { mkKafkaMsg := func(payload []byte) *kafka.KafkaMessage { return &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-subtrees", - Value: payload, - }, + Topic: "invalid-subtrees", + Value: payload, } } @@ -2356,10 +2351,8 @@ func TestServerRejectedHandler(t *testing.T) { // helper: incapsulate in kafka.KafkaMessage mkKafkaMsg := func(b []byte) *kafka.KafkaMessage { return &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "rejected-tx", - Value: b, - }, + Topic: "rejected-tx", + Value: b, } } @@ -2476,10 +2469,8 @@ func TestServerRejectedHandler(t *testing.T) { mkKafkaMsg := func(b []byte) *kafka.KafkaMessage { return &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "rejected-tx", - Value: b, - }, + Topic: "rejected-tx", + Value: b, } } @@ -2830,10 +2821,8 @@ func TestProcessInvalidBlockMessageSuccess(t *testing.T) { require.NoError(t, err) kafkaMsg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-blocks", - Value: msgBytes, - }, + Topic: "invalid-blocks", + Value: msgBytes, } // Create a real ban manager for testing @@ -2871,10 +2860,8 @@ func TestProcessInvalidBlockMessageUnmarshalError(t *testing.T) { logger := ulogger.New("test") kafkaMsg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-blocks", - Value: invalidBytes, - }, + Topic: "invalid-blocks", + Value: invalidBytes, } server := &Server{ logger: logger, @@ -2894,10 +2881,8 @@ func TestProcessInvalidBlockMessageNoPeerInMap(t *testing.T) { logger := ulogger.New("test") kafkaMsg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-blocks", - Value: msgBytes, - }, + Topic: "invalid-blocks", + Value: msgBytes, } server := &Server{ @@ -2919,10 +2904,8 @@ func TestProcessInvalidBlockMessageWrongTypeInMap(t *testing.T) { logger := ulogger.New("test") kafkaMsg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-blocks", - Value: msgBytes, - }, + Topic: "invalid-blocks", + Value: msgBytes, } server := &Server{ @@ -2947,10 +2930,8 @@ func TestProcessInvalidBlockMessageAddBanScoreFails(t *testing.T) { mockPeerID := peer.ID("peer-fail") kafkaMsg := &kafka.KafkaMessage{ - ConsumerMessage: sarama.ConsumerMessage{ - Topic: "invalid-blocks", - Value: msgBytes, - }, + Topic: "invalid-blocks", + Value: msgBytes, } // Create a real ban manager for testing diff --git a/services/subtreevalidation/txmetaHandler.go b/services/subtreevalidation/txmetaHandler.go index 82af832d1c..5a67c2ab29 100644 --- a/services/subtreevalidation/txmetaHandler.go +++ b/services/subtreevalidation/txmetaHandler.go @@ -41,7 +41,7 @@ func (u *Server) txmetaMessageHandler(ctx context.Context) func(msg *kafka.Kafka // Processing errors are logged and the message is marked as completed // to prevent infinite retry loops on malformed data. func (u *Server) txmetaHandler(ctx context.Context, msg *kafka.KafkaMessage) error { - if msg == nil || len(msg.ConsumerMessage.Value) < 4 { + if msg == nil || len(msg.Value) < 4 { return nil } @@ -50,7 +50,7 @@ func (u *Server) txmetaHandler(ctx context.Context, msg *kafka.KafkaMessage) err go func() { startTime := time.Now() - data := msg.ConsumerMessage.Value + data := msg.Value offset := 0 // Read entry count diff --git a/services/subtreevalidation/txmetaHandler_test.go b/services/subtreevalidation/txmetaHandler_test.go index 7ffc8a310e..47808d00ce 100644 --- a/services/subtreevalidation/txmetaHandler_test.go +++ b/services/subtreevalidation/txmetaHandler_test.go @@ -7,7 +7,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/go-bt/v2" "github.com/bsv-blockchain/go-bt/v2/chainhash" "github.com/bsv-blockchain/teranode/errors" @@ -227,12 +226,8 @@ func createKafkaMessage(t *testing.T, delete bool, content []byte) *kafka.KafkaM copy(data[offset:], content) } - consumerMsg := sarama.ConsumerMessage{ - Value: data, - } - return &kafka.KafkaMessage{ - ConsumerMessage: consumerMsg, + Value: data, } } @@ -252,7 +247,7 @@ func TestServer_txmetaHandler(t *testing.T) { { name: "message too short for entry count", setupMocks: func(l *mockLogger, c *mockCache) {}, - input: &kafka.KafkaMessage{ConsumerMessage: sarama.ConsumerMessage{Value: make([]byte, 3)}}, + input: &kafka.KafkaMessage{Value: make([]byte, 3)}, }, { name: "successful delete operation", diff --git a/settings/kafka_settings.go b/settings/kafka_settings.go index 8188c2ee22..d69fc47d46 100644 --- a/settings/kafka_settings.go +++ b/settings/kafka_settings.go @@ -35,5 +35,5 @@ type KafkaSettings struct { TLSKeyFile string `key:"KAFKA_TLS_KEY_FILE" desc:"Path to Kafka TLS client key file" default:"" category:"Kafka" usage:"For client certificate authentication" type:"string" longdesc:"### Purpose\nPath to client private key file for mutual TLS (mTLS) authentication.\n\n### Format\nPEM-encoded private key, RSA or ECDSA (e.g., /etc/kafka/ssl/client-key.pem)\n\n### How It Works\n- Loaded via tls.LoadX509KeyPair with TLSCertFile\n- Private key must correspond to public key in client certificate\n- Validated on startup to ensure key matches certificate\n\n### Recommendations\n- Required when broker requires client certificate authentication\n- **Must be provided together with TLSCertFile** - both or neither\n- File permissions should be restricted (0600) to protect private key"` // Debug logging EnableDebugLogging bool `key:"kafka_enable_debug_logging" desc:"Enable debug logging for Kafka client" default:"false" category:"Kafka" usage:"Useful for troubleshooting connection issues" type:"bool" longdesc:"### Purpose\nEnable verbose debug logging for Kafka client operations.\n\n### How It Works\nWhen enabled, logs detailed information about:\n- Connection attempts and broker communication\n- Metadata requests\n- Produce/consume operations\n- Partition assignments and rebalancing\n- Consumer group coordination\n- Errors\n\n### Trade-offs\n| Setting | Benefit | Drawback |\n|---------|---------|----------|\n| Enabled | Detailed troubleshooting info | High logging overhead, large log volume |\n| Disabled | Normal performance | Less visibility into client behavior |\n\n### Recommendations\n- Use for troubleshooting connection issues\n- Use for debugging message delivery problems\n- **Disable in production** due to performance impact"` - Scheme string `key:"KAFKA_SCHEMA" desc:"Kafka connection scheme" default:"http" category:"Kafka" usage:"Use 'kafka' for standard, 'memory' for testing" type:"string" longdesc:"### Purpose\nConnection scheme for Kafka client.\n\n### Values\n| Scheme | Description |\n|--------|-------------|\n| kafka | Standard Kafka broker connections |\n| memory | In-memory mock implementation (testing only) |\n\n### How It Works\n- **kafka** scheme connects to real brokers specified in KAFKA_HOSTS\n- **memory** scheme uses in-memory broker (imk.GetSharedBroker()) providing sarama.SyncProducer and sarama.Consumer interfaces without requiring a Kafka cluster\n\n### Recommendations\n- **Production**: Must use kafka\n- **CI/CD pipelines**: memory scheme useful for testing\n- **Local development**: memory scheme useful for unit tests"` + Scheme string `key:"KAFKA_SCHEMA" desc:"Kafka connection scheme" default:"http" category:"Kafka" usage:"Use 'kafka' for standard, 'memory' for testing" type:"string" longdesc:"### Purpose\nConnection scheme for Kafka client.\n\n### Values\n| Scheme | Description |\n|--------|-------------|\n| kafka | Standard Kafka broker connections |\n| memory | In-memory mock implementation (testing only) |\n\n### How It Works\n- **kafka** scheme connects to real brokers specified in KAFKA_HOSTS\n- **memory** scheme uses in-memory broker (imk.GetSharedBroker()) providing sync/async producer and consumer behavior without requiring a Kafka cluster\n\n### Recommendations\n- **Production**: Must use kafka\n- **CI/CD pipelines**: memory scheme useful for testing\n- **Local development**: memory scheme useful for unit tests"` } diff --git a/test/longtest/util/kafka/kafka_test.go b/test/longtest/util/kafka/kafka_test.go index 7ec44d2749..eaaafe1091 100644 --- a/test/longtest/util/kafka/kafka_test.go +++ b/test/longtest/util/kafka/kafka_test.go @@ -270,7 +270,7 @@ func TestKafkaProducerAndConsumer(t *testing.T) { } } - err = client.ConsumerGroup.Close() + err = client.Close() require.NoError(t, err) // In total we should have 4 messages (1 + 3 retries) for each of the 2 original messages @@ -608,7 +608,7 @@ func consumeMessages(t *testing.T, ctx context.Context, logger ulogger.Logger, k require.NoError(t, err) defer func() { - _ = consumer.ConsumerGroup.Close() + _ = consumer.Close() }() consumer.Start(ctx, func(msg *ukafka.KafkaMessage) error { diff --git a/util/kafka/in_memory_kafka/in_memory_kafka.go b/util/kafka/in_memory_kafka/in_memory_kafka.go index 6be9cf5ee5..a334b3484f 100644 --- a/util/kafka/in_memory_kafka/in_memory_kafka.go +++ b/util/kafka/in_memory_kafka/in_memory_kafka.go @@ -1,24 +1,24 @@ +// Package inmemorykafka provides an in-memory Kafka implementation for testing. package inmemorykafka import ( "context" - "errors" // nolint:depguard - "fmt" - "log" + "errors" "sync" "time" - - "github.com/IBM/sarama" ) var errSessionClosed = errors.New("in-memory session closed") +var errAlreadyRunning = errors.New("consumer group already running") -// Message represents a simplified message structure. +// Message represents a Kafka message. type Message struct { - Topic string - Key []byte // Added to store message key - Value []byte - Offset int64 + Topic string + Key []byte + Value []byte + Offset int64 + Partition int32 + Timestamp time.Time } // InMemoryBroker is the core in-memory message broker. @@ -55,7 +55,6 @@ func (b *InMemoryBroker) Produce(ctx context.Context, topic string, key []byte, consumers: make([]chan *Message, 0), } } - t = b.topics[topic] b.mu.Unlock() } @@ -63,12 +62,13 @@ func (b *InMemoryBroker) Produce(ctx context.Context, topic string, key []byte, t.mu.Lock() defer t.mu.Unlock() - // Create message with next offset msg := &Message{ - Topic: topic, - Key: key, // Store the key - Value: value, - Offset: int64(len(t.messages)), + Topic: topic, + Key: key, + Value: value, + Offset: int64(len(t.messages)), + Partition: 0, + Timestamp: time.Now(), } t.messages = append(t.messages, msg) @@ -88,331 +88,84 @@ func (b *InMemoryBroker) Topics() []string { b.mu.Lock() defer b.mu.Unlock() topics := make([]string, 0, len(b.topics)) - for topic := range b.topics { topics = append(topics, topic) } - return topics } -// InMemorySyncProducer implements sarama.SyncProducer. -type InMemorySyncProducer struct { - broker *InMemoryBroker -} - -// NewInMemorySyncProducer creates a new in-memory sync producer. -func NewInMemorySyncProducer(broker *InMemoryBroker) sarama.SyncProducer { - return &InMemorySyncProducer{broker: broker} -} - -// SendMessage sends a message to the broker, returning dummy partition and offset. -func (p *InMemorySyncProducer) SendMessage(msg *sarama.ProducerMessage) (int32, int64, error) { - var key []byte - - if msg.Key != nil { - var errKey error - - key, errKey = msg.Key.Encode() - if errKey != nil { - return 0, 0, fmt.Errorf("failed to encode key: %w", errKey) // nolint:forbidigo - } - } - - value, err := msg.Value.Encode() - if err != nil { - return 0, 0, err - } - - err = p.broker.Produce(context.Background(), msg.Topic, key, value) // Pass key - if err != nil { - return 0, 0, err - } - - // Return dummy partition (0) and offset (0) since we don’t manage these - return 0, 0, nil -} - -// SendMessages is required by SyncProducer but not implemented for simplicity. -func (p *InMemorySyncProducer) SendMessages(msgs []*sarama.ProducerMessage) error { - return errors.New("SendMessages not implemented") -} - -// Close does nothing as there’s no resource to clean up. -func (p *InMemorySyncProducer) Close() error { - return nil -} - -// BeginTxn is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) BeginTxn() error { - return errors.New("BeginTxn not implemented") -} - -// AddMessageToTxn is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) AddMessageToTxn(msg *sarama.ConsumerMessage, groupID string, metadata *string) error { - return errors.New("AddMessageToTxn not implemented") -} - -// AddOffsetsToTxn is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) AddOffsetsToTxn(offsets map[string][]*sarama.PartitionOffsetMetadata, groupID string) error { - return errors.New("AddOffsetsToTxn not implemented") -} - -// CommitTxn is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) CommitTxn() error { - return errors.New("CommitTxn not implemented") -} - -// AbortTxn is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) AbortTxn() error { - return errors.New("AbortTxn not implemented") -} - -// TxnStatus is required by SyncProducer but not implemented. -func (p *InMemorySyncProducer) TxnStatus() sarama.ProducerTxnStatusFlag { - // Return a default status; assuming no transaction is ever active in this mock. - return sarama.ProducerTxnFlagReady -} - -// IsTransactional is required by SyncProducer. Returns false as this mock isn't transactional. -func (p *InMemorySyncProducer) IsTransactional() bool { - return false -} - -// InMemoryConsumer implements sarama.Consumer. -type InMemoryConsumer struct { - broker *InMemoryBroker - topic string - ch chan *Message - closeOnce sync.Once // Added for safe closing -} - -// NewInMemoryConsumer creates a new in-memory consumer for a topic. -func (b *InMemoryBroker) NewInMemoryConsumer(topic string) (sarama.Consumer, error) { - b.mu.RLock() - t, ok := b.topics[topic] - b.mu.RUnlock() - - if !ok { - b.mu.Lock() - if _, exists := b.topics[topic]; !exists { - b.topics[topic] = &Topic{ - messages: make([]*Message, 0), - consumers: make([]chan *Message, 0), - } - } - - t = b.topics[topic] - b.mu.Unlock() - } - - ch := make(chan *Message, 100) // Buffered channel - - t.mu.Lock() - t.consumers = append(t.consumers, ch) - t.mu.Unlock() - - return &InMemoryConsumer{ - broker: b, - topic: topic, - ch: ch, - }, nil -} - -// ConsumePartition returns a PartitionConsumer for the topic. -func (c *InMemoryConsumer) ConsumePartition(topic string, partition int32, offset int64) (sarama.PartitionConsumer, error) { - // Ignore partition and offset for simplicity; only support one partition (0) - if partition != 0 { - return nil, errors.New("only partition 0 is supported") - } - - return &InMemoryPartitionConsumer{ - consumer: c, - }, nil -} - -// Topics is not implemented as it’s not needed for basic tests. -func (c *InMemoryConsumer) Topics() ([]string, error) { - return nil, errors.New("Topics not implemented") -} - -// Partitions is not implemented as we assume one partition. -func (c *InMemoryConsumer) Partitions(topic string) ([]int32, error) { - return nil, errors.New("Partitions not implemented") -} - -// Close removes the consumer’s channel from the broker and safely closes the channel once. -func (c *InMemoryConsumer) Close() error { - c.closeOnce.Do(func() { // Ensure close logic runs only once - c.broker.mu.RLock() - t, ok := c.broker.topics[c.topic] - c.broker.mu.RUnlock() - - if ok { - t.mu.Lock() - newConsumers := make([]chan *Message, 0, len(t.consumers)) // Safer removal +// --- Shared Broker Singleton --- - for _, ch := range t.consumers { - if ch != c.ch { - newConsumers = append(newConsumers, ch) - } - } +var sharedBroker *InMemoryBroker +var brokerOnce sync.Once - t.consumers = newConsumers - t.mu.Unlock() - } - // Close the channel safely within the sync.Once block - close(c.ch) +// GetSharedBroker returns the singleton InMemoryBroker instance. +func GetSharedBroker() *InMemoryBroker { + brokerOnce.Do(func() { + sharedBroker = NewInMemoryBroker() }) - - return nil -} - -// HighWaterMarks is required by the sarama.Consumer interface but not implemented. -func (c *InMemoryConsumer) HighWaterMarks() map[string]map[int32]int64 { - // Return nil as this mock doesn't track high water marks. - return nil -} - -// Pause is required by the sarama.Consumer interface but not implemented. -func (c *InMemoryConsumer) Pause(partitions map[string][]int32) { - // This mock does not support pausing partitions. -} - -// PauseAll is required by the sarama.Consumer interface but not implemented. -func (c *InMemoryConsumer) PauseAll() { - // This mock does not support pausing. + return sharedBroker } -// Resume is required by the sarama.Consumer interface but not implemented. -func (c *InMemoryConsumer) Resume(partitions map[string][]int32) { - // This mock does not support resuming partitions. -} +// --- Sync Producer --- -// ResumeAll is required by the sarama.Consumer interface but not implemented. -func (c *InMemoryConsumer) ResumeAll() { - // This mock does not support resuming. -} - -// InMemoryPartitionConsumer implements sarama.PartitionConsumer. -type InMemoryPartitionConsumer struct { - consumer *InMemoryConsumer - messages chan *sarama.ConsumerMessage - once sync.Once - isPausedFunc func() bool // Function to check if consumption is paused +// InMemorySyncProducer implements a synchronous producer for testing. +type InMemorySyncProducer struct { + broker *InMemoryBroker } -// Messages returns a channel of ConsumerMessages. -func (pc *InMemoryPartitionConsumer) Messages() <-chan *sarama.ConsumerMessage { - pc.once.Do(func() { - pc.messages = make(chan *sarama.ConsumerMessage, 100) - go func() { - defer close(pc.messages) - - for msg := range pc.consumer.ch { - // Wait while paused - keep checking until unpaused or context cancelled - for pc.isPausedFunc != nil && pc.isPausedFunc() { - // Sleep briefly to avoid busy waiting - time.Sleep(10 * time.Millisecond) - } - - consumerSaramaMsg := &sarama.ConsumerMessage{ - Topic: msg.Topic, - Key: msg.Key, // Populate Key from internal message - Value: msg.Value, - Offset: msg.Offset, - Timestamp: time.Now(), // Mock timestamp - } - select { - case pc.messages <- consumerSaramaMsg: - default: - } - } - }() - }) - - return pc.messages +// NewInMemorySyncProducer creates a new in-memory sync producer. +func NewInMemorySyncProducer(broker *InMemoryBroker) *InMemorySyncProducer { + return &InMemorySyncProducer{broker: broker} } -// Errors returns nil as error handling is simplified. -func (pc *InMemoryPartitionConsumer) Errors() <-chan *sarama.ConsumerError { - return nil +// Send sends a message to the broker. +func (p *InMemorySyncProducer) Send(topic string, key []byte, value []byte) error { + return p.broker.Produce(context.Background(), topic, key, value) } -// Close stops the partition consumer (in this mock, it does nothing specific). -func (pc *InMemoryPartitionConsumer) Close() error { - // Previously called pc.consumer.Close(), causing double close. - // PartitionConsumer closing should be independent. +// Close does nothing as there's no resource to clean up. +func (p *InMemorySyncProducer) Close() error { return nil } -// AsyncClose performs a non-blocking close operation required by sarama.PartitionConsumer. -func (pc *InMemoryPartitionConsumer) AsyncClose() { - // Trigger the close operation asynchronously. - go pc.Close() -} - -// HighWaterMarkOffset returns the high water mark offset. Required by sarama.PartitionConsumer. -func (pc *InMemoryPartitionConsumer) HighWaterMarkOffset() int64 { - // This mock does not track offsets precisely. Return a dummy value or best guess. - // Let's return the current count of messages in the topic as a proxy. - pc.consumer.broker.mu.RLock() - t, ok := pc.consumer.broker.topics[pc.consumer.topic] - pc.consumer.broker.mu.RUnlock() - - if ok { - t.mu.RLock() - defer t.mu.RUnlock() - - return int64(len(t.messages)) - } +// --- Async Producer --- - return 0 // Default if topic not found or not tracking -} - -// IsPaused returns whether this partition consumer is paused. Required by sarama.PartitionConsumer. -func (pc *InMemoryPartitionConsumer) IsPaused() bool { - if pc.isPausedFunc != nil { - return pc.isPausedFunc() - } - return false +// ProducerMessage represents a message to be produced. +type ProducerMessage struct { + Topic string + Key []byte + Value []byte } -// Pause is required by the sarama.PartitionConsumer interface but not implemented. -func (pc *InMemoryPartitionConsumer) Pause() { - // This mock does not support pausing partitions. +// ProducerError represents a producer error. +type ProducerError struct { + Msg *ProducerMessage + Err error } -// Resume is required by the sarama.PartitionConsumer interface but not implemented. -func (pc *InMemoryPartitionConsumer) Resume() { - // This mock does not support resuming partitions. -} - -// InMemoryAsyncProducer implements sarama.AsyncProducer. +// InMemoryAsyncProducer implements an asynchronous producer for testing. type InMemoryAsyncProducer struct { - broker *InMemoryBroker - input chan *sarama.ProducerMessage - successes chan *sarama.ProducerMessage - errors chan *sarama.ProducerError - close chan struct{} - wg sync.WaitGroup + broker *InMemoryBroker + input chan *ProducerMessage + successes chan *ProducerMessage + errors chan *ProducerError + close chan struct{} + wg sync.WaitGroup + closeOnce sync.Once } -// Ensure InMemoryAsyncProducer implements the interface -var _ sarama.AsyncProducer = (*InMemoryAsyncProducer)(nil) - // NewInMemoryAsyncProducer creates a new in-memory async producer. -// Buffer sizes can be adjusted as needed. func NewInMemoryAsyncProducer(broker *InMemoryBroker, bufferSize int) *InMemoryAsyncProducer { if bufferSize <= 0 { - bufferSize = 100 // Default buffer size + bufferSize = 100 } p := &InMemoryAsyncProducer{ broker: broker, - input: make(chan *sarama.ProducerMessage, bufferSize), - successes: make(chan *sarama.ProducerMessage, bufferSize), - errors: make(chan *sarama.ProducerError, bufferSize), + input: make(chan *ProducerMessage, bufferSize), + successes: make(chan *ProducerMessage, bufferSize), + errors: make(chan *ProducerError, bufferSize), close: make(chan struct{}), } @@ -430,155 +183,109 @@ func (p *InMemoryAsyncProducer) messageHandler() { select { case msg, ok := <-p.input: if !ok { - // Input channel closed, likely during Close() return } - var key []byte - - if msg.Key != nil { - var errKey error - - key, errKey = msg.Key.Encode() - if errKey != nil { - p.errors <- &sarama.ProducerError{Msg: msg, Err: fmt.Errorf("failed to encode key: %w", errKey)} // nolint:forbidigo - continue - } - } - - value, err := msg.Value.Encode() + err := p.broker.Produce(context.Background(), msg.Topic, msg.Key, msg.Value) if err != nil { - p.errors <- &sarama.ProducerError{Msg: msg, Err: err} - continue // Skip to next message - } - - // Use context.Background() for the mock Produce call - err = p.broker.Produce(context.Background(), msg.Topic, key, value) // Pass key - if err != nil { - p.errors <- &sarama.ProducerError{Msg: msg, Err: err} + p.errors <- &ProducerError{Msg: msg, Err: err} } else { - // Simulate success: Assign dummy partition/offset if needed, although - // broker.Produce doesn't return them. The original message is sufficient - // for the Successes channel according to Sarama docs. p.successes <- msg } case <-p.close: - // Close signal received return } } } -// AsyncClose triggers a shutdown of the producer and waits for it to complete. -func (p *InMemoryAsyncProducer) AsyncClose() { - close(p.close) // Signal handler goroutine to stop -} - -// Close shuts down the producer and waits for currently processed messages to finish. -func (p *InMemoryAsyncProducer) Close() error { - p.AsyncClose() // Signal handler to stop processing - p.wg.Wait() // Wait for handler goroutine to finish - // Close output channels after the handler has finished to prevent sends on closed channels - close(p.successes) - close(p.errors) - close(p.input) // Close input to signal no more messages - - return nil +// Produce sends a message asynchronously. +func (p *InMemoryAsyncProducer) Produce(topic string, key []byte, value []byte) { + p.input <- &ProducerMessage{ + Topic: topic, + Key: key, + Value: value, + } } // Input returns the input channel for sending messages. -func (p *InMemoryAsyncProducer) Input() chan<- *sarama.ProducerMessage { +func (p *InMemoryAsyncProducer) Input() chan<- *ProducerMessage { return p.input } // Successes returns the success channel. -func (p *InMemoryAsyncProducer) Successes() <-chan *sarama.ProducerMessage { +func (p *InMemoryAsyncProducer) Successes() <-chan *ProducerMessage { return p.successes } // Errors returns the error channel. -func (p *InMemoryAsyncProducer) Errors() <-chan *sarama.ProducerError { +func (p *InMemoryAsyncProducer) Errors() <-chan *ProducerError { return p.errors } -// IsTransactional required by AsyncProducer interface. Always false for this mock. -func (p *InMemoryAsyncProducer) IsTransactional() bool { - return false -} - -// TxnStatus required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) TxnStatus() sarama.ProducerTxnStatusFlag { - return sarama.ProducerTxnFlagReady -} - -// BeginTxn required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) BeginTxn() error { - return errors.New("transactions not supported by InMemoryAsyncProducer") -} - -// CommitTxn required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) CommitTxn() error { - return errors.New("transactions not supported by InMemoryAsyncProducer") +// Close shuts down the producer. Safe to call multiple times. +func (p *InMemoryAsyncProducer) Close() error { + p.closeOnce.Do(func() { + close(p.close) + p.wg.Wait() + close(p.successes) + close(p.errors) + close(p.input) + }) + return nil } -// AbortTxn required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) AbortTxn() error { - return errors.New("transactions not supported by InMemoryAsyncProducer") -} +// --- Consumer Group --- -// AddMessageToTxn required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) AddMessageToTxn(msg *sarama.ConsumerMessage, groupID string, metadata *string) error { - return errors.New("transactions not supported by InMemoryAsyncProducer") +// ConsumerGroupSession represents a consumer group session. +type ConsumerGroupSession interface { + Claims() map[string][]int32 + MemberID() string + GenerationID() int32 + MarkOffset(topic string, partition int32, offset int64, metadata string) + ResetOffset(topic string, partition int32, offset int64, metadata string) + MarkMessage(msg *Message, metadata string) + Context() context.Context + Commit() } -// AddOffsetsToTxn required by AsyncProducer interface. Mock implementation. -func (p *InMemoryAsyncProducer) AddOffsetsToTxn(offsets map[string][]*sarama.PartitionOffsetMetadata, groupID string) error { - return errors.New("transactions not supported by InMemoryAsyncProducer") +// ConsumerGroupClaim represents a consumer group claim. +type ConsumerGroupClaim interface { + Topic() string + Partition() int32 + InitialOffset() int64 + HighWaterMarkOffset() int64 + Messages() <-chan *Message } -// --- Shared Broker Singleton --- - -var sharedBroker *InMemoryBroker -var brokerOnce sync.Once - -// GetSharedBroker returns the singleton InMemoryBroker instance. -func GetSharedBroker() *InMemoryBroker { - brokerOnce.Do(func() { - sharedBroker = NewInMemoryBroker() - }) - - return sharedBroker +// ConsumerGroupHandler is the interface for handling consumer group events. +type ConsumerGroupHandler interface { + Setup(ConsumerGroupSession) error + Cleanup(ConsumerGroupSession) error + ConsumeClaim(ConsumerGroupSession, ConsumerGroupClaim) error } -// --- InMemoryConsumerGroup Implementation --- - -// InMemoryConsumerGroup implements sarama.ConsumerGroup +// InMemoryConsumerGroup implements a consumer group for testing. type InMemoryConsumerGroup struct { broker *InMemoryBroker - topic string // Store topic from config - groupID string // Store groupID from config - handler sarama.ConsumerGroupHandler // Set during Consume call + topic string + groupID string errors chan error closeOnce sync.Once cancelConsume context.CancelFunc wg sync.WaitGroup closed chan struct{} isRunning bool - isPaused bool // Track pause state + isPaused bool mu sync.Mutex } -// Ensure InMemoryConsumerGroup implements the interface -var _ sarama.ConsumerGroup = (*InMemoryConsumerGroup)(nil) - // NewInMemoryConsumerGroup creates a mock consumer group. -// It needs the topic and groupID for potential internal logic or logging. func NewInMemoryConsumerGroup(broker *InMemoryBroker, topic, groupID string) *InMemoryConsumerGroup { return &InMemoryConsumerGroup{ broker: broker, topic: topic, groupID: groupID, - errors: make(chan error, 100), // Buffered error channel + errors: make(chan error, 100), closed: make(chan struct{}), } } @@ -593,213 +300,151 @@ func (mcg *InMemoryConsumerGroup) Close() error { mcg.mu.Lock() if !mcg.isRunning { mcg.mu.Unlock() - // Avoid double close or closing non-running group - // Although closeOnce handles double close, this prevents unnecessary waits return nil } mcg.mu.Unlock() mcg.closeOnce.Do(func() { mcg.mu.Lock() - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Close called\n", mcg.groupID) if mcg.cancelConsume != nil { - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Calling cancelConsume\n", mcg.groupID) - mcg.cancelConsume() // Signal Consume loop to stop + mcg.cancelConsume() } - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Waiting on WaitGroup\n", mcg.groupID) - mcg.wg.Wait() // Wait for Consume loop to finish - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] WaitGroup finished\n", mcg.groupID) - close(mcg.errors) // Close errors channel last - close(mcg.closed) // Signal that Close is complete + mcg.wg.Wait() + close(mcg.errors) + close(mcg.closed) mcg.isRunning = false mcg.mu.Unlock() }) - <-mcg.closed // Wait until close is fully done - + <-mcg.closed return nil } -// Consume joins a cluster of consumers for a given list of topics and -// starts a *blocking* ConsumerGroupSession through the ConsumerGroupHandler. -// This implementation attempts to mimic the blocking behavior of the real Sarama ConsumerGroup. -// It runs Setup, ConsumeClaim, and Cleanup synchronously. -// ConsumeClaim is expected to block until the provided context is cancelled or the handler exits. -func (mcg *InMemoryConsumerGroup) Consume(ctx context.Context, topics []string, handler sarama.ConsumerGroupHandler) error { - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Consume called\n", mcg.groupID) - // Ensure only one Consume call runs at a time for this group instance +// Consume joins a cluster of consumers for a given list of topics. +func (mcg *InMemoryConsumerGroup) Consume(ctx context.Context, topics []string, handler ConsumerGroupHandler) error { mcg.mu.Lock() if mcg.isRunning { mcg.mu.Unlock() - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Consume called but already running\n", mcg.groupID) - return errAlreadyRunning // Or sarama.ErrConcurrentConsumerGroupSessions if defined/preferred + return errAlreadyRunning } - mcg.isRunning = true - // Store the cancel function derived from the input context immediately under lock - // This internal context is what gets passed down and cancelled by Close() internalCtx, cancel := context.WithCancel(ctx) mcg.cancelConsume = cancel mcg.mu.Unlock() - mcg.handler = handler - - // Variables to hold resources that need cleanup - var ( - consumer sarama.Consumer - partitionConsumer sarama.PartitionConsumer - consumeErr error // Stores the primary error to return - ) - - // Defer cleanup actions defer func() { - // Call the stored cancel function ensures the context IS cancelled on exit - // It's safe to call cancel multiple times. if mcg.cancelConsume != nil { mcg.cancelConsume() } - - // Close consumers if they were successfully created - if partitionConsumer != nil { - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Closing partition consumer\n", mcg.groupID) - // In a real scenario, closing partitionConsumer might need coordination - // if its internal processing (Messages loop) runs in a goroutine. - // For this mock, assuming direct close is sufficient. - partitionConsumer.Close() - } - - if consumer != nil { - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Closing main consumer\n", mcg.groupID) - consumer.Close() - } - - // Mark as not running and clear cancel func *after* cleanup mcg.mu.Lock() mcg.isRunning = false - mcg.cancelConsume = nil // Clear the stored cancel func + mcg.cancelConsume = nil mcg.mu.Unlock() }() - // This mock currently only supports consuming a single specified topic if len(topics) != 1 { - consumeErr = errors.New("in-memory consumer group mock only supports exactly one topic") - return consumeErr // Cleanup runs via defer + return errors.New("in-memory consumer group mock only supports exactly one topic") } topicToConsume := topics[0] - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Creating new consumer for topic %s\n", mcg.groupID, topicToConsume) - var err error + // Create consumer channel + mcg.broker.mu.RLock() + t, ok := mcg.broker.topics[topicToConsume] + mcg.broker.mu.RUnlock() - consumer, err = mcg.broker.NewInMemoryConsumer(topicToConsume) - if err != nil { - consumeErr = fmt.Errorf("failed to create underlying consumer: %w", err) // nolint:forbidigo - return consumeErr // Cleanup runs via defer + if !ok { + mcg.broker.mu.Lock() + if _, exists := mcg.broker.topics[topicToConsume]; !exists { + mcg.broker.topics[topicToConsume] = &Topic{ + messages: make([]*Message, 0), + consumers: make([]chan *Message, 0), + } + } + t = mcg.broker.topics[topicToConsume] + mcg.broker.mu.Unlock() } - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Creating partition consumer for topic %s partition 0\n", mcg.groupID, topicToConsume) - // Assume partition 0 and oldest offset for simplicity in the mock - partitionConsumer, err = consumer.ConsumePartition(topicToConsume, 0, sarama.OffsetOldest) - if err != nil { - consumeErr = fmt.Errorf("failed to create partition consumer: %w", err) // nolint:forbidigo - return consumeErr // Cleanup runs via defer + ch := make(chan *Message, 100) + t.mu.Lock() + t.consumers = append(t.consumers, ch) + t.mu.Unlock() + + defer func() { + t.mu.Lock() + newConsumers := make([]chan *Message, 0, len(t.consumers)) + for _, c := range t.consumers { + if c != ch { + newConsumers = append(newConsumers, c) + } + } + t.consumers = newConsumers + t.mu.Unlock() + close(ch) + }() + + session := &InMemoryConsumerGroupSession{ + ctx: internalCtx, + broker: mcg.broker, + groupID: mcg.groupID, } - // If this is our InMemoryPartitionConsumer, set the pause check function - if inMemPC, ok := partitionConsumer.(*InMemoryPartitionConsumer); ok { - inMemPC.isPausedFunc = func() bool { + claim := &InMemoryConsumerGroupClaim{ + topic: topicToConsume, + partition: 0, + messages: ch, + isPausedFunc: func() bool { mcg.mu.Lock() defer mcg.mu.Unlock() return mcg.isPaused - } + }, } - // Prepare session and claim objects using the internalCtx - session := &InMemoryConsumerGroupSession{ctx: internalCtx, broker: mcg.broker, groupID: mcg.groupID} - claim := &InMemoryConsumerGroupClaim{ - topic: topicToConsume, - partition: 0, - partConsumer: partitionConsumer, + // Setup + if err := handler.Setup(session); err != nil { + return err } - // --- Start of Synchronous Lifecycle --- - - // 1. Call Setup - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Calling handler.Setup\n", mcg.groupID) - err = handler.Setup(session) - // Check context cancellation *after* Setup, as Setup might block briefly + // Check context cancellation after Setup if internalCtx.Err() != nil { - consumeErr = internalCtx.Err() // Prioritize context cancellation - if err != nil { - log.Printf("WARN: InMemoryConsumerGroup %s: handler.Setup failed (%v) but context was already cancelled (%v)", mcg.groupID, err, consumeErr) - } - - return consumeErr + return internalCtx.Err() } - if err != nil && !errors.Is(err, errSessionClosed) { - // fmt.Printf("[ERROR InMemoryConsumerGroup %s] Handler Setup failed: %v\n", mcg.groupID, err) - consumeErr = fmt.Errorf("handler setup failed: %w", err) // nolint:forbidigo - // Cleanup (including cancelling internalCtx) happens in defer - return consumeErr - } - - // 2. Call ConsumeClaim - This is the core blocking call. - // It should block until the session's context (internalCtx) is cancelled - // or the handler implementation decides to exit. - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Calling handler.ConsumeClaim (blocking)\n", mcg.groupID) - consumeErr = handler.ConsumeClaim(session, claim) // Assign directly to consumeErr - // fmt.Printf("[DEBUG InMemoryConsumerGroup %s] Handler ConsumeClaim returned: %v\n", mcg.groupID, consumeErr) + // ConsumeClaim + consumeErr := handler.ConsumeClaim(session, claim) - // 3. Call Cleanup - This runs *after* ConsumeClaim returns, even if it errored or context was cancelled. + // Cleanup cleanupErr := handler.Cleanup(session) - if cleanupErr != nil { - // fmt.Printf("[ERROR InMemoryConsumerGroup %s] Handler Cleanup failed: %v\n", mcg.groupID, cleanupErr) - // If ConsumeClaim didn't error, the cleanup error becomes the primary error. - // If ConsumeClaim did error, we prioritize that and just log the cleanup error. - if consumeErr == nil { - consumeErr = fmt.Errorf("handler cleanup failed: %w", cleanupErr) // nolint:forbidigo - } else { - log.Printf("WARN: InMemoryConsumerGroup %s: handler.Cleanup failed (%v) after ConsumeClaim error (%v)", mcg.groupID, cleanupErr, consumeErr) - } + if cleanupErr != nil && consumeErr == nil { + return cleanupErr } - // --- End of Synchronous Lifecycle --- - - // Final cleanup happens in the defer block. - // Return the primary error encountered (ConsumeClaim, context cancellation if propagated, or Cleanup error). return consumeErr } -// PauseAll pauses consumption for all claimed partitions. -// For the in-memory implementation, this sets a flag that prevents messages from being consumed. +// PauseAll pauses consumption. func (mcg *InMemoryConsumerGroup) PauseAll() { mcg.mu.Lock() defer mcg.mu.Unlock() mcg.isPaused = true } -// ResumeAll resumes consumption for all paused partitions. -// For the in-memory implementation, this clears the pause flag allowing messages to be consumed again. +// ResumeAll resumes consumption. func (mcg *InMemoryConsumerGroup) ResumeAll() { mcg.mu.Lock() defer mcg.mu.Unlock() mcg.isPaused = false } -// Pause pauses consumption for the given partitions. No-op for mock. -func (mcg *InMemoryConsumerGroup) Pause(partitions map[string][]int32) { - // No-op: Pausing/Resuming is not actively simulated in this mock -} +// Pause is a no-op for partition map; use PauseAll to pause. +func (mcg *InMemoryConsumerGroup) Pause(_ map[string][]int32) {} -// Resume resumes consumption for the given partitions. No-op for mock. -func (mcg *InMemoryConsumerGroup) Resume(partitions map[string][]int32) { - // No-op: Pausing/Resuming is not actively simulated in this mock -} +// Resume is a no-op for partition map; use ResumeAll to resume. +func (mcg *InMemoryConsumerGroup) Resume(_ map[string][]int32) {} -// --- Mock Session and Claim (Needed for Consume) --- +// --- Session and Claim implementations --- -// InMemoryConsumerGroupSession implements sarama.ConsumerGroupSession +// InMemoryConsumerGroupSession implements ConsumerGroupSession. type InMemoryConsumerGroupSession struct { ctx context.Context broker *InMemoryBroker @@ -808,56 +453,49 @@ type InMemoryConsumerGroupSession struct { func (s *InMemoryConsumerGroupSession) Claims() map[string][]int32 { claims := make(map[string][]int32) - topics := s.broker.Topics() // Call the new Topics() method (Fixes 82efbd65, c78a7c59) - + topics := s.broker.Topics() for _, topic := range topics { - // Mock: Assume claim on partition 0 for all known topics claims[topic] = []int32{0} } - return claims } + func (s *InMemoryConsumerGroupSession) MemberID() string { return "mock-member-" + s.groupID } -func (s *InMemoryConsumerGroupSession) GenerationID() int32 { return 1 } // Mocked generation ID +func (s *InMemoryConsumerGroupSession) GenerationID() int32 { return 1 } func (s *InMemoryConsumerGroupSession) MarkOffset(topic string, partition int32, offset int64, metadata string) { -} // No-op mock -func (s *InMemoryConsumerGroupSession) Commit() {} // No-op mock +} +func (s *InMemoryConsumerGroupSession) Commit() {} func (s *InMemoryConsumerGroupSession) ResetOffset(topic string, partition int32, offset int64, metadata string) { -} // No-op mock -func (s *InMemoryConsumerGroupSession) MarkMessage(msg *sarama.ConsumerMessage, metadata string) {} // No-op mock -func (s *InMemoryConsumerGroupSession) Context() context.Context { return s.ctx } +} +func (s *InMemoryConsumerGroupSession) MarkMessage(msg *Message, metadata string) {} +func (s *InMemoryConsumerGroupSession) Context() context.Context { return s.ctx } -// InMemoryConsumerGroupClaim implements sarama.ConsumerGroupClaim +// InMemoryConsumerGroupClaim implements ConsumerGroupClaim. type InMemoryConsumerGroupClaim struct { topic string partition int32 - partConsumer sarama.PartitionConsumer // The underlying partition consumer + messages <-chan *Message + isPausedFunc func() bool } func (c *InMemoryConsumerGroupClaim) Topic() string { return c.topic } func (c *InMemoryConsumerGroupClaim) Partition() int32 { return c.partition } -func (c *InMemoryConsumerGroupClaim) InitialOffset() int64 { return sarama.OffsetOldest } // Mocked +func (c *InMemoryConsumerGroupClaim) InitialOffset() int64 { return 0 } func (c *InMemoryConsumerGroupClaim) HighWaterMarkOffset() int64 { - if c.partConsumer != nil { - return c.partConsumer.HighWaterMarkOffset() - } - return 0 } -func (c *InMemoryConsumerGroupClaim) Messages() <-chan *sarama.ConsumerMessage { - if c.partConsumer != nil { - return c.partConsumer.Messages() - } - // Return a closed channel if partition consumer is nil to prevent blocking forever - closedChan := make(chan *sarama.ConsumerMessage) - close(closedChan) - - return closedChan +func (c *InMemoryConsumerGroupClaim) Messages() <-chan *Message { + // Create a filtered channel that respects pause state + filtered := make(chan *Message, 100) + go func() { + defer close(filtered) + for msg := range c.messages { + // Wait while paused + for c.isPausedFunc != nil && c.isPausedFunc() { + time.Sleep(10 * time.Millisecond) + } + filtered <- msg + } + }() + return filtered } - -// --- Consumer Group Mock --- - -var errAlreadyRunning = errors.New("consumer group already running") - -// ConsumerGroupHandler represents the interface that needs to be implemented by users of a ConsumerGroup. -// This is copied directly from sarama for compatibility. diff --git a/util/kafka/in_memory_kafka/in_memory_kafka_test.go b/util/kafka/in_memory_kafka/in_memory_kafka_test.go index ce6e9f1636..a14a106666 100644 --- a/util/kafka/in_memory_kafka/in_memory_kafka_test.go +++ b/util/kafka/in_memory_kafka/in_memory_kafka_test.go @@ -7,7 +7,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/teranode/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -18,797 +17,280 @@ func TestInMemoryBrokerProduceConsume(t *testing.T) { topic := "test-topic" messageValue := []byte("hello world") - // Create Producer producer := NewInMemorySyncProducer(broker) - defer func() { - _ = producer.Close() - }() - - // Create Consumer - consumer, err := broker.NewInMemoryConsumer(topic) - if err != nil { - t.Fatalf("Failed to create consumer: %v", err) - } - defer func() { - _ = consumer.Close() - }() - - // Consume Partition (only partition 0 is supported in this mock) - partitionConsumer, err := consumer.ConsumePartition(topic, 0, sarama.OffsetNewest) // Start from newest for this test - if err != nil { - t.Fatalf("Failed to consume partition: %v", err) - } - defer func() { - _ = partitionConsumer.Close() - }() + defer func() { _ = producer.Close() }() - // Use a channel to signal message reception - messageReceived := make(chan *sarama.ConsumerMessage, 1) - errCh := make(chan error, 1) // Renamed errs to errCh - - // Start consumer goroutine - go func() { - select { - case msg := <-partitionConsumer.Messages(): - messageReceived <- msg - case consumerErr := <-partitionConsumer.Errors(): - // This channel is nil in the mock, so this case shouldn't be hit - if consumerErr != nil { // Check if error is actually non-nil - errCh <- consumerErr - } - case <-time.After(5 * time.Second): // Timeout - errCh <- errors.NewServiceError("timeout waiting for message") - } - }() + received := make(chan *Message, 1) + handler := &captureHandler{received: received} + cg := NewInMemoryConsumerGroup(broker, topic, "test-group") + defer cg.Close() - // Produce Message - msg := &sarama.ProducerMessage{ - Topic: topic, - Value: sarama.ByteEncoder(messageValue), - } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { _ = cg.Consume(ctx, []string{topic}, handler) }() - _, _, err = producer.SendMessage(msg) - if err != nil { - t.Fatalf("Failed to send message: %v", err) - } + time.Sleep(50 * time.Millisecond) + require.NoError(t, producer.Send(topic, nil, messageValue)) - // Wait for message or error select { - case receivedMsg := <-messageReceived: - if receivedMsg.Topic != topic { - t.Errorf("Received message with wrong topic: got %s, want %s", receivedMsg.Topic, topic) - } + case msg := <-received: + assert.Equal(t, topic, msg.Topic) + assert.True(t, bytes.Equal(msg.Value, messageValue)) + assert.Equal(t, int64(0), msg.Offset) + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for message") + } +} - if !bytes.Equal(receivedMsg.Value, messageValue) { - t.Errorf("Received message with wrong value: got %s, want %s", string(receivedMsg.Value), string(messageValue)) - } +type captureHandler struct{ received chan *Message } - // Check offset - should be 0 for the first message - if receivedMsg.Offset != 0 { - t.Errorf("Received message with wrong offset: got %d, want %d", receivedMsg.Offset, 0) - } - case consumerErr := <-errCh: - t.Fatalf("Consumer error: %v", consumerErr) +func (h *captureHandler) Setup(ConsumerGroupSession) error { return nil } +func (h *captureHandler) Cleanup(ConsumerGroupSession) error { return nil } +func (h *captureHandler) ConsumeClaim(_ ConsumerGroupSession, claim ConsumerGroupClaim) error { + for msg := range claim.Messages() { + h.received <- msg } + return nil } func TestInMemoryBrokerProduceToNewTopic(t *testing.T) { broker := NewInMemoryBroker() - topic := "new-topic" - messageValue := []byte("message for new topic") - producer := NewInMemorySyncProducer(broker) - defer func() { - _ = producer.Close() - }() - - msg := &sarama.ProducerMessage{ - Topic: topic, - Value: sarama.ByteEncoder(messageValue), - } - - _, _, err := producer.SendMessage(msg) - if err != nil { - t.Fatalf("Failed to send message to new topic: %v", err) - } + defer producer.Close() + require.NoError(t, producer.Send("new-topic", nil, []byte("message for new topic"))) - // Verify topic was created broker.mu.RLock() - _, ok := broker.topics[topic] + _, ok := broker.topics["new-topic"] broker.mu.RUnlock() - - if !ok { - t.Errorf("Topic '%s' was not created automatically after producing", topic) - } + assert.True(t, ok) } func TestInMemoryBrokerConsumeFromNewTopic(t *testing.T) { broker := NewInMemoryBroker() - topic := "another-new-topic" - - consumer, err := broker.NewInMemoryConsumer(topic) - if err != nil { - t.Fatalf("Failed to create consumer for new topic: %v", err) - } - defer func() { - _ = consumer.Close() - }() + cg := NewInMemoryConsumerGroup(broker, "another-new-topic", "test-group") + defer cg.Close() + ctx, cancel := context.WithCancel(context.Background()) + cancel() + err := cg.Consume(ctx, []string{"another-new-topic"}, &captureHandler{received: make(chan *Message)}) + require.Error(t, err) + assert.Equal(t, context.Canceled, err) - // Verify topic was created broker.mu.RLock() - _, ok := broker.topics[topic] + _, ok := broker.topics["another-new-topic"] broker.mu.RUnlock() - - if !ok { - t.Errorf("Topic '%s' was not created automatically after consuming", topic) - } -} - -func TestInMemoryConsumerConsumePartitionError(t *testing.T) { - broker := NewInMemoryBroker() - topic := "test-topic-partition" - - // Produce a fake message to ensure a topic exists - producer := NewInMemorySyncProducer(broker) - msg := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("dummy")} - - _, _, err := producer.SendMessage(msg) // Assign error from SendMessage - if err != nil { - t.Fatalf("Failed to produce setup message: %v", err) - } - - _ = producer.Close() - - consumer, err := broker.NewInMemoryConsumer(topic) - if err != nil { - t.Fatalf("Failed to create consumer: %v", err) - } - defer func() { - _ = consumer.Close() - }() - - // Try consuming unsupported partition 1 - _, err = consumer.ConsumePartition(topic, 1, sarama.OffsetOldest) - if err == nil { - t.Errorf("Expected error when consuming unsupported partition 1, but got nil") - } else { - t.Logf("Got expected error for unsupported partition: %v", err) // Log success - } + assert.True(t, ok) } -func TestInMemorySyncProducerUnimplementedMethods(t *testing.T) { +func TestInMemorySyncProducerSend(t *testing.T) { broker := NewInMemoryBroker() producer := NewInMemorySyncProducer(broker) - - defer func() { - _ = producer.Close() - }() - - if err := producer.SendMessages(nil); err == nil { - t.Error("Expected error for unimplemented SendMessages, got nil") - } - - if err := producer.BeginTxn(); err == nil { - t.Error("Expected error for unimplemented BeginTxn, got nil") - } - - if err := producer.CommitTxn(); err == nil { - t.Error("Expected error for unimplemented CommitTxn, got nil") - } + defer producer.Close() + require.NoError(t, producer.Send("test-topic", []byte("key"), []byte("value"))) } func TestInMemoryAsyncProducerProduceSuccess(t *testing.T) { broker := NewInMemoryBroker() topic := "async-test-success" - messageValue := "hello async world" - msgToSend := &sarama.ProducerMessage{ - Topic: topic, - Value: sarama.StringEncoder(messageValue), - } - - // 1. Create Consumer FIRST to register its channel with the broker - consumer, err := broker.NewInMemoryConsumer(topic) - if err != nil { - t.Fatalf("Failed to create consumer: %v", err) - } - - defer func() { - _ = consumer.Close() - }() - - pConsumer, err := consumer.ConsumePartition(topic, 0, sarama.OffsetOldest) // Read from start - require.NoError(t, err) - - defer func() { - _ = pConsumer.Close() - }() - - // 2. Create Producer - producer := NewInMemoryAsyncProducer(broker, 1) // Buffer size 1 - defer func() { - _ = producer.Close() - }() + messageValue := []byte("hello async world") + received := make(chan *Message, 1) + handler := &captureHandler{received: received} + cg := NewInMemoryConsumerGroup(broker, topic, "test-group") + defer cg.Close() - // Use WaitGroups to coordinate producer success and consumer reception - var ( - wgProducer sync.WaitGroup - wgConsumer sync.WaitGroup - ) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { _ = cg.Consume(ctx, []string{topic}, handler) }() + time.Sleep(50 * time.Millisecond) - wgProducer.Add(1) - wgConsumer.Add(1) + producer := NewInMemoryAsyncProducer(broker, 1) + defer producer.Close() - // Goroutine to wait for producer success + var wg sync.WaitGroup + wg.Add(2) go func() { - defer wgProducer.Done() - + defer wg.Done() select { - case successMsg := <-producer.Successes(): - assert.Equal(t, msgToSend, successMsg, "Success message should match sent message") - // Add other assertions if needed + case <-producer.Successes(): case err := <-producer.Errors(): - t.Errorf("Expected success, but got error: %v", err) + t.Errorf("expected success, got error: %v", err) case <-time.After(2 * time.Second): - t.Errorf("Timeout waiting for producer success message") // Use t.Errorf for goroutines - } - }() - - // Goroutine to wait for a consumer message - go func() { - defer wgConsumer.Done() - - select { - case msg := <-pConsumer.Messages(): - assert.Equal(t, messageValue, string(msg.Value), "Consumed message value mismatch") - case <-time.After(2 * time.Second): // Increased timeout slightly - t.Errorf("Timeout waiting for consumer message") // Use t.Errorf for goroutines + t.Error("timeout waiting for success") } }() - - // 3. Send the message AFTER consumer is ready - producer.Input() <- msgToSend - - // 4. Wait for both producer and consumer operations to complete - wgProducer.Wait() - wgConsumer.Wait() -} - -func TestInMemoryAsyncProducerProduceError(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 1) - - defer func() { - _ = producer.Close() - }() - - topic := "async-test-error" - msgToSend := &sarama.ProducerMessage{ - Topic: topic, - Value: FaultyEncoder{}, // Use the faulty encoder - } - - var wg sync.WaitGroup - - wg.Add(1) - go func() { defer wg.Done() - select { - case <-producer.Successes(): - t.Error("Expected error, but got success") - case prodErr := <-producer.Errors(): - assert.Error(t, prodErr.Err, "Expected an encoding error") - assert.Equal(t, msgToSend, prodErr.Msg, "Error message should match sent message") - assert.Contains(t, prodErr.Err.Error(), "failed to encode") + case msg := <-received: + assert.Equal(t, messageValue, msg.Value) case <-time.After(2 * time.Second): - t.Error("Timeout waiting for error message") + t.Error("timeout waiting for message") } }() - - producer.Input() <- msgToSend + producer.Produce(topic, nil, messageValue) wg.Wait() } func TestInMemoryAsyncProducerClose(t *testing.T) { broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 10) // Larger buffer - - topic := "async-test-close" - msg1 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("msg1")} - msg2 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("msg2")} - - producer.Input() <- msg1 - - producer.Input() <- msg2 - - // Give some time for messages to potentially be processed, although Close should handle it + producer := NewInMemoryAsyncProducer(broker, 10) + defer producer.Close() + producer.Produce("async-test-close", nil, []byte("msg1")) + producer.Produce("async-test-close", nil, []byte("msg2")) time.Sleep(50 * time.Millisecond) - - err := producer.Close() - require.NoError(t, err) - - // Drain channels after close before checking closure state - for range producer.Successes() { - // Drain successes - } - - for range producer.Errors() { - // Drain errors - } - - // Now check if channels are closed + require.NoError(t, producer.Close()) + for range producer.Successes() {} + for range producer.Errors() {} _, okSuccess := <-producer.Successes() - assert.False(t, okSuccess, "Successes channel should be closed") - + assert.False(t, okSuccess) _, okError := <-producer.Errors() - assert.False(t, okError, "Errors channel should be closed") + assert.False(t, okError) } -type FaultyEncoder struct{} - -// Encode simulates a faulty encoding process that always returns an error. -func (fe FaultyEncoder) Encode() ([]byte, error) { - return nil, errors.New(errors.ERR_UNKNOWN, "failed to encode") -} - -// Length returns 0, simulating a faulty encoder that does not produce valid output. -func (fe FaultyEncoder) Length() int { - return 0 -} - -// TestBrokerTopics tests the Topics() method func TestBrokerTopics(t *testing.T) { broker := NewInMemoryBroker() - - // Initially no topics - topics := broker.Topics() - assert.Empty(t, topics, "Initially broker should have no topics") - - // Add some topics by producing messages + assert.Empty(t, broker.Topics()) producer := NewInMemorySyncProducer(broker) defer producer.Close() - - msg1 := &sarama.ProducerMessage{Topic: "topic1", Value: sarama.StringEncoder("msg1")} - msg2 := &sarama.ProducerMessage{Topic: "topic2", Value: sarama.StringEncoder("msg2")} - msg3 := &sarama.ProducerMessage{Topic: "topic1", Value: sarama.StringEncoder("msg3")} // Same topic - - _, _, err := producer.SendMessage(msg1) - require.NoError(t, err) - _, _, err = producer.SendMessage(msg2) - require.NoError(t, err) - _, _, err = producer.SendMessage(msg3) - require.NoError(t, err) - - topics = broker.Topics() - assert.Len(t, topics, 2, "Should have 2 unique topics") + require.NoError(t, producer.Send("topic1", nil, []byte("msg1"))) + require.NoError(t, producer.Send("topic2", nil, []byte("msg2"))) + require.NoError(t, producer.Send("topic1", nil, []byte("msg3"))) + topics := broker.Topics() + assert.Len(t, topics, 2) assert.Contains(t, topics, "topic1") assert.Contains(t, topics, "topic2") } -// TestSyncProducerSendMessageKeyEncodingError tests key encoding error path -func TestSyncProducerSendMessageKeyEncodingError(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemorySyncProducer(broker) - defer producer.Close() - - msg := &sarama.ProducerMessage{ - Topic: "test-topic", - Key: FaultyEncoder{}, // Faulty key encoder - Value: sarama.StringEncoder("value"), - } - - _, _, err := producer.SendMessage(msg) - assert.Error(t, err) - assert.Contains(t, err.Error(), "failed to encode key") -} - -// TestSyncProducerUnimplementedMethodsCoverage tests all unimplemented methods -func TestSyncProducerUnimplementedMethodsCoverage(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemorySyncProducer(broker) - defer producer.Close() - - // Test all unimplemented transaction methods - assert.Error(t, producer.AddMessageToTxn(nil, "group", nil)) - assert.Error(t, producer.AddOffsetsToTxn(nil, "group")) - assert.Error(t, producer.AbortTxn()) - - // Test status methods - assert.Equal(t, sarama.ProducerTxnFlagReady, producer.TxnStatus()) - assert.False(t, producer.IsTransactional()) -} - -// TestConsumerUnimplementedMethods tests unimplemented consumer methods -func TestConsumerUnimplementedMethods(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - defer consumer.Close() - - // Test unimplemented methods - _, err = consumer.Topics() - assert.Error(t, err) - assert.Contains(t, err.Error(), "Topics not implemented") - - _, err = consumer.Partitions("topic") - assert.Error(t, err) - assert.Contains(t, err.Error(), "Partitions not implemented") - - // Test methods that don't return errors (no-op methods) - assert.Nil(t, consumer.HighWaterMarks()) - consumer.Pause(nil) // Should not panic - consumer.PauseAll() // Should not panic - consumer.Resume(nil) // Should not panic - consumer.ResumeAll() // Should not panic -} - -// TestPartitionConsumerMethods tests partition consumer methods -func TestPartitionConsumerMethods(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - defer consumer.Close() - - partConsumer, err := consumer.ConsumePartition("test-topic", 0, sarama.OffsetOldest) - require.NoError(t, err) - defer partConsumer.Close() - - // Test Errors() method - assert.Nil(t, partConsumer.Errors()) - - // Test AsyncClose method - partConsumer.AsyncClose() // Should not panic - - // Test HighWaterMarkOffset with no messages - assert.Equal(t, int64(0), partConsumer.HighWaterMarkOffset()) - - // Add a message and test HighWaterMarkOffset - producer := NewInMemorySyncProducer(broker) - msg := &sarama.ProducerMessage{Topic: "test-topic", Value: sarama.StringEncoder("test")} - _, _, err = producer.SendMessage(msg) - require.NoError(t, err) - producer.Close() - - assert.Equal(t, int64(1), partConsumer.HighWaterMarkOffset()) - - // Test pause/resume methods - assert.False(t, partConsumer.IsPaused()) - partConsumer.Pause() // Should not panic - partConsumer.Resume() // Should not panic -} - -// TestAsyncProducerNewWithZeroBuffer tests buffer size validation -func TestAsyncProducerNewWithZeroBuffer(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 0) // Zero buffer should default to 100 - defer producer.Close() - - // Test that it doesn't panic and has proper channels - assert.NotNil(t, producer.Input()) - assert.NotNil(t, producer.Successes()) - assert.NotNil(t, producer.Errors()) -} - -// TestAsyncProducerMessageHandlerErrorPaths tests error paths in messageHandler -func TestAsyncProducerMessageHandlerErrorPaths(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 1) - defer producer.Close() - - // Test with faulty key encoder - msgWithFaultyKey := &sarama.ProducerMessage{ - Topic: "test-topic", - Key: FaultyEncoder{}, - Value: sarama.StringEncoder("value"), - } - - var wg sync.WaitGroup - wg.Add(1) - - go func() { - defer wg.Done() - select { - case err := <-producer.Errors(): - assert.Error(t, err.Err) - assert.Contains(t, err.Err.Error(), "failed to encode key") - assert.Equal(t, msgWithFaultyKey, err.Msg) - case <-time.After(2 * time.Second): - t.Error("Timeout waiting for error") - } - }() - - producer.Input() <- msgWithFaultyKey - wg.Wait() -} - -// TestAsyncProducerUnimplementedMethods tests unimplemented async producer methods -func TestAsyncProducerUnimplementedMethods(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 1) - defer producer.Close() - - // Test transaction methods - assert.False(t, producer.IsTransactional()) - assert.Equal(t, sarama.ProducerTxnFlagReady, producer.TxnStatus()) - assert.Error(t, producer.BeginTxn()) - assert.Error(t, producer.CommitTxn()) - assert.Error(t, producer.AbortTxn()) - assert.Error(t, producer.AddMessageToTxn(nil, "group", nil)) - assert.Error(t, producer.AddOffsetsToTxn(nil, "group")) -} - -// TestGetSharedBroker tests the singleton broker func TestGetSharedBroker(t *testing.T) { - broker1 := GetSharedBroker() - broker2 := GetSharedBroker() - - // Should return the same instance - assert.Equal(t, broker1, broker2) - assert.NotNil(t, broker1) + assert.Equal(t, GetSharedBroker(), GetSharedBroker()) + assert.NotNil(t, GetSharedBroker()) } -// TestConsumerGroupBasicFunctionality tests consumer group creation and basic methods func TestConsumerGroupBasicFunctionality(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - // Test basic getters assert.NotNil(t, cg.Errors()) - - // Test pause/resume methods (no-ops) cg.PauseAll() cg.ResumeAll() cg.Pause(nil) cg.Resume(nil) } -// TestConsumerGroupCloseWithoutRunning tests closing a non-running consumer group func TestConsumerGroupCloseWithoutRunning(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") - - // Close without running Consume - err := cg.Close() - assert.NoError(t, err) - - // Second close should also not error - err = cg.Close() - assert.NoError(t, err) -} - -// MockConsumerGroupHandler for testing -type MockConsumerGroupHandler struct { - setupCalled bool - cleanupCalled bool - consumeError error - setupError error - cleanupError error - messageCount int - maxMessages int -} - -func (h *MockConsumerGroupHandler) Setup(sarama.ConsumerGroupSession) error { - h.setupCalled = true - return h.setupError -} - -func (h *MockConsumerGroupHandler) Cleanup(sarama.ConsumerGroupSession) error { - h.cleanupCalled = true - return h.cleanupError + assert.NoError(t, cg.Close()) + assert.NoError(t, cg.Close()) } -func (h *MockConsumerGroupHandler) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error { - if h.consumeError != nil { - return h.consumeError - } - - for { - select { - case msg := <-claim.Messages(): - if msg == nil { - return nil // Channel closed - } - h.messageCount++ - if h.maxMessages > 0 && h.messageCount >= h.maxMessages { - return nil // Stop after max messages - } - case <-session.Context().Done(): - return session.Context().Err() - } - } -} - -// TestConsumerGroupConsumeMultipleTopics tests error for multiple topics func TestConsumerGroupConsumeMultipleTopics(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - handler := &MockConsumerGroupHandler{} - + handler := &captureHandler{received: make(chan *Message)} err := cg.Consume(context.Background(), []string{"topic1", "topic2"}, handler) assert.Error(t, err) assert.Contains(t, err.Error(), "only supports exactly one topic") } -// TestConsumerGroupConsumeSetupError tests handler setup error func TestConsumerGroupConsumeSetupError(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - handler := &MockConsumerGroupHandler{ - setupError: errors.New(errors.ERR_UNKNOWN, "setup failed"), - } - + handler := &errorSetupHandler{setupErr: errors.New(errors.ERR_UNKNOWN, "setup failed")} err := cg.Consume(context.Background(), []string{"test-topic"}, handler) assert.Error(t, err) - assert.Contains(t, err.Error(), "handler setup failed") + assert.Contains(t, err.Error(), "setup failed") assert.True(t, handler.setupCalled) } -// TestConsumerGroupConsumeCleanupError tests handler cleanup error +type errorSetupHandler struct { + setupCalled bool + setupErr error +} + +func (h *errorSetupHandler) Setup(ConsumerGroupSession) error { + h.setupCalled = true + return h.setupErr +} +func (h *errorSetupHandler) Cleanup(ConsumerGroupSession) error { return nil } +func (h *errorSetupHandler) ConsumeClaim(ConsumerGroupSession, ConsumerGroupClaim) error { return nil } + func TestConsumerGroupConsumeCleanupError(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - handler := &MockConsumerGroupHandler{ - cleanupError: errors.New(errors.ERR_UNKNOWN, "cleanup failed"), - consumeError: errors.New(errors.ERR_UNKNOWN, "consume done"), // Return immediately + handler := &errorCleanupHandler{ + consumeErr: errors.New(errors.ERR_UNKNOWN, "consume done"), + cleanupErr: errors.New(errors.ERR_UNKNOWN, "cleanup failed"), } - err := cg.Consume(context.Background(), []string{"test-topic"}, handler) - // The primary error should be from ConsumeClaim, but cleanup error should be logged assert.Error(t, err) - assert.Contains(t, err.Error(), "consume done") // Primary error from ConsumeClaim - assert.True(t, handler.setupCalled) + assert.Contains(t, err.Error(), "consume done") assert.True(t, handler.cleanupCalled) } -// TestConsumerGroupConsumeContextCancel tests context cancellation +type errorCleanupHandler struct { + consumeErr error + cleanupErr error + cleanupCalled bool +} + +func (h *errorCleanupHandler) Setup(ConsumerGroupSession) error { return nil } +func (h *errorCleanupHandler) Cleanup(ConsumerGroupSession) error { + h.cleanupCalled = true + return h.cleanupErr +} +func (h *errorCleanupHandler) ConsumeClaim(ConsumerGroupSession, ConsumerGroupClaim) error { return h.consumeErr } + func TestConsumerGroupConsumeContextCancel(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - handler := &MockConsumerGroupHandler{} - + handler := &captureHandler{received: make(chan *Message)} ctx, cancel := context.WithCancel(context.Background()) - cancel() // Cancel immediately - + cancel() err := cg.Consume(ctx, []string{"test-topic"}, handler) assert.Error(t, err) assert.Equal(t, context.Canceled, err) } -// TestConsumerGroupSessionMethods tests session interface methods func TestConsumerGroupSessionMethods(t *testing.T) { broker := NewInMemoryBroker() - - // Add some topics producer := NewInMemorySyncProducer(broker) - msg1 := &sarama.ProducerMessage{Topic: "topic1", Value: sarama.StringEncoder("test1")} - msg2 := &sarama.ProducerMessage{Topic: "topic2", Value: sarama.StringEncoder("test2")} - _, _, err := producer.SendMessage(msg1) - require.NoError(t, err) - _, _, err = producer.SendMessage(msg2) - require.NoError(t, err) + require.NoError(t, producer.Send("topic1", nil, []byte("test1"))) + require.NoError(t, producer.Send("topic2", nil, []byte("test2"))) producer.Close() - session := &InMemoryConsumerGroupSession{ - ctx: context.Background(), - broker: broker, - groupID: "test-group", - } - - // Test Claims() + session := &InMemoryConsumerGroupSession{ctx: context.Background(), broker: broker, groupID: "test-group"} claims := session.Claims() assert.Len(t, claims, 2) assert.Contains(t, claims, "topic1") assert.Contains(t, claims, "topic2") assert.Equal(t, []int32{0}, claims["topic1"]) - - // Test other methods (no-ops) assert.Equal(t, "mock-member-test-group", session.MemberID()) assert.Equal(t, int32(1), session.GenerationID()) assert.Equal(t, context.Background(), session.Context()) - - // Test no-op methods session.MarkOffset("topic", 0, 0, "") session.Commit() session.ResetOffset("topic", 0, 0, "") session.MarkMessage(nil, "") } -// TestConsumerGroupClaimMethods tests claim interface methods func TestConsumerGroupClaimMethods(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - defer consumer.Close() - - partConsumer, err := consumer.ConsumePartition("test-topic", 0, sarama.OffsetOldest) - require.NoError(t, err) - defer partConsumer.Close() - - claim := &InMemoryConsumerGroupClaim{ - topic: "test-topic", - partition: 0, - partConsumer: partConsumer, - } - + ch := make(chan *Message, 1) + close(ch) + claim := &InMemoryConsumerGroupClaim{topic: "test-topic", partition: 0, messages: ch} assert.Equal(t, "test-topic", claim.Topic()) assert.Equal(t, int32(0), claim.Partition()) - assert.Equal(t, sarama.OffsetOldest, claim.InitialOffset()) + assert.Equal(t, int64(0), claim.InitialOffset()) assert.Equal(t, int64(0), claim.HighWaterMarkOffset()) assert.NotNil(t, claim.Messages()) } -// TestConsumerGroupClaimWithNilPartConsumer tests claim with nil partition consumer -func TestConsumerGroupClaimWithNilPartConsumer(t *testing.T) { - claim := &InMemoryConsumerGroupClaim{ - topic: "test-topic", - partition: 0, - partConsumer: nil, - } - - assert.Equal(t, int64(0), claim.HighWaterMarkOffset()) - - // Messages should return a closed channel - msgChan := claim.Messages() - assert.NotNil(t, msgChan) - - // Channel should be closed (reading should return nil, false) - msg, ok := <-msgChan - assert.Nil(t, msg) - assert.False(t, ok) -} - -// TestConsumerCloseBehavior tests consumer close behavior with multiple calls -func TestConsumerCloseBehavior(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - - // First close should succeed - err = consumer.Close() - assert.NoError(t, err) - - // Second close should also succeed (sync.Once behavior) - err = consumer.Close() - assert.NoError(t, err) -} - -// TestSyncProducerSendMessageErrorPaths tests error paths in SendMessage -func TestSyncProducerSendMessageErrorPaths(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemorySyncProducer(broker) - defer producer.Close() - - // Test value encoding error - msg := &sarama.ProducerMessage{ - Topic: "test-topic", - Value: FaultyEncoder{}, // Faulty value encoder - } - - _, _, err := producer.SendMessage(msg) - assert.Error(t, err) - assert.Contains(t, err.Error(), "failed to encode") -} - -// TestConsumerGroupSessionNoOpMethods tests no-op session methods for coverage func TestConsumerGroupSessionNoOpMethods(t *testing.T) { broker := NewInMemoryBroker() - session := &InMemoryConsumerGroupSession{ - ctx: context.Background(), - broker: broker, - groupID: "test-group", - } - - // Test no-op methods for coverage + session := &InMemoryConsumerGroupSession{ctx: context.Background(), broker: broker, groupID: "test-group"} assert.NotPanics(t, func() { session.MarkOffset("topic", 0, 100, "metadata") session.Commit() @@ -817,13 +299,10 @@ func TestConsumerGroupSessionNoOpMethods(t *testing.T) { }) } -// TestConsumerGroupPauseResumeMethods tests pause/resume methods for coverage func TestConsumerGroupPauseResumeMethods(t *testing.T) { broker := NewInMemoryBroker() cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") defer cg.Close() - - // Test pause/resume methods (no-ops) partitions := map[string][]int32{"topic1": {0, 1}, "topic2": {0}} assert.NotPanics(t, func() { cg.PauseAll() @@ -833,203 +312,61 @@ func TestConsumerGroupPauseResumeMethods(t *testing.T) { }) } -// TestPartitionConsumerPauseResumeMethods tests partition consumer pause/resume -func TestPartitionConsumerPauseResumeMethods(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - defer consumer.Close() - - // Test consumer pause/resume methods (no-ops) - partitions := map[string][]int32{"topic1": {0, 1}} - assert.NotPanics(t, func() { - consumer.Pause(partitions) - consumer.PauseAll() - consumer.Resume(partitions) - consumer.ResumeAll() - }) - - // Test partition consumer methods - partConsumer, err := consumer.ConsumePartition("test-topic", 0, sarama.OffsetOldest) - require.NoError(t, err) - defer partConsumer.Close() - - assert.NotPanics(t, func() { - partConsumer.Pause() - partConsumer.Resume() - }) -} - -// TestHighWaterMarkOffsetWithNonExistentTopic tests HighWaterMarkOffset edge case -func TestHighWaterMarkOffsetWithNonExistentTopic(t *testing.T) { - broker := NewInMemoryBroker() - consumer, err := broker.NewInMemoryConsumer("test-topic") - require.NoError(t, err) - defer consumer.Close() - - partConsumer, err := consumer.ConsumePartition("test-topic", 0, sarama.OffsetOldest) - require.NoError(t, err) - defer partConsumer.Close() - - // Test with non-existent topic in broker's internal state - // This tests the "topic not found" case in HighWaterMarkOffset - pc := partConsumer.(*InMemoryPartitionConsumer) - pc.consumer.topic = "non-existent-topic" // Change to non-existent topic - - offset := partConsumer.HighWaterMarkOffset() - assert.Equal(t, int64(0), offset) // Should return 0 for non-existent topic -} - -// TestAsyncProducerCloseChannelPath tests the close channel in messageHandler -func TestAsyncProducerCloseChannelPath(t *testing.T) { - broker := NewInMemoryBroker() - producer := NewInMemoryAsyncProducer(broker, 1) - - // Just test normal close to cover the close paths - err := producer.Close() - assert.NoError(t, err) -} - -// TestConsumerGroupCloseNotRunning tests Close() when not running -func TestConsumerGroupCloseNotRunning(t *testing.T) { +func TestAsyncProducerNewWithZeroBuffer(t *testing.T) { broker := NewInMemoryBroker() - cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") - - // Close without ever running Consume - err := cg.Close() - assert.NoError(t, err) - - // Second close should also work - err = cg.Close() - assert.NoError(t, err) -} - -// PauseTestHandler is a handler specifically for testing pause/resume -type PauseTestHandler struct { - receivedMessages *[]string - mu *sync.Mutex -} - -func (h *PauseTestHandler) Setup(sarama.ConsumerGroupSession) error { - return nil -} - -func (h *PauseTestHandler) Cleanup(sarama.ConsumerGroupSession) error { - return nil -} - -func (h *PauseTestHandler) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error { - for { - select { - case msg := <-claim.Messages(): - if msg == nil { - return nil // Channel closed - } - h.mu.Lock() - *h.receivedMessages = append(*h.receivedMessages, string(msg.Value)) - h.mu.Unlock() - case <-session.Context().Done(): - return session.Context().Err() - } - } + producer := NewInMemoryAsyncProducer(broker, 0) + defer producer.Close() + assert.NotNil(t, producer.Input()) + assert.NotNil(t, producer.Successes()) + assert.NotNil(t, producer.Errors()) } -// TestConsumerGroupPauseResumeBehavior tests that pause/resume actually stops message delivery func TestConsumerGroupPauseResumeBehavior(t *testing.T) { broker := NewInMemoryBroker() topic := "pause-test-topic" cg := NewInMemoryConsumerGroup(broker, topic, "test-group") - - // Create a handler that tracks messages received var receivedMessages []string var mu sync.Mutex - handler := &PauseTestHandler{ - receivedMessages: &receivedMessages, - mu: &mu, - } + handler := &PauseTestHandler{received: &receivedMessages, mu: &mu} - // Start consuming in a goroutine ctx, cancel := context.WithCancel(context.Background()) defer cancel() - consumeDone := make(chan error, 1) - go func() { - consumeDone <- cg.Consume(ctx, []string{topic}, handler) - }() - - // Give consumer time to start + go func() { consumeDone <- cg.Consume(ctx, []string{topic}, handler) }() time.Sleep(100 * time.Millisecond) - // Produce some messages while not paused producer := NewInMemorySyncProducer(broker) defer producer.Close() - - msg1 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("message1")} - _, _, err := producer.SendMessage(msg1) - require.NoError(t, err) - - msg2 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("message2")} - _, _, err = producer.SendMessage(msg2) - require.NoError(t, err) - - // Wait for messages to be received + require.NoError(t, producer.Send(topic, nil, []byte("message1"))) + require.NoError(t, producer.Send(topic, nil, []byte("message2"))) time.Sleep(100 * time.Millisecond) - mu.Lock() beforePauseCount := len(receivedMessages) mu.Unlock() + assert.Equal(t, 2, beforePauseCount) - // Should have received 2 messages - assert.Equal(t, 2, beforePauseCount, "Should receive messages when not paused") - - // Now pause consumption cg.PauseAll() - - // Give pause time to take effect time.Sleep(50 * time.Millisecond) - - // Produce messages while paused - msg3 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("message3")} - _, _, err = producer.SendMessage(msg3) - require.NoError(t, err) - - msg4 := &sarama.ProducerMessage{Topic: topic, Value: sarama.StringEncoder("message4")} - _, _, err = producer.SendMessage(msg4) - require.NoError(t, err) - - // Wait to ensure messages aren't delivered while paused + require.NoError(t, producer.Send(topic, nil, []byte("message3"))) + require.NoError(t, producer.Send(topic, nil, []byte("message4"))) time.Sleep(200 * time.Millisecond) - mu.Lock() pausedCount := len(receivedMessages) mu.Unlock() + assert.Equal(t, 2, pausedCount) - // Should still be at 2 messages (no new messages received while paused) - assert.Equal(t, 2, pausedCount, "Should not receive messages while paused") - - // Now resume consumption cg.ResumeAll() - - // Wait for resumed messages to be delivered time.Sleep(200 * time.Millisecond) - mu.Lock() afterResumeCount := len(receivedMessages) allMessages := make([]string, len(receivedMessages)) - copy(allMessages, receivedMessages) + for i, m := range receivedMessages { + allMessages[i] = m + } mu.Unlock() - - // Should now have received all 4 messages - assert.Equal(t, 4, afterResumeCount, "Should receive paused messages after resume") + assert.Equal(t, 4, afterResumeCount) assert.Equal(t, []string{"message1", "message2", "message3", "message4"}, allMessages) - - // Test IsPaused on partition consumer - assert.False(t, cg.isPaused, "Consumer group should not be paused after resume") - - // Cancel context to stop consume cancel() - - // Wait for consume to finish select { case err := <-consumeDone: assert.Error(t, err) @@ -1038,3 +375,43 @@ func TestConsumerGroupPauseResumeBehavior(t *testing.T) { t.Fatal("Consume did not exit after context cancel") } } + +type PauseTestHandler struct { + received *[]string + mu *sync.Mutex +} + +func (h *PauseTestHandler) Setup(ConsumerGroupSession) error { return nil } +func (h *PauseTestHandler) Cleanup(ConsumerGroupSession) error { return nil } +func (h *PauseTestHandler) ConsumeClaim(session ConsumerGroupSession, claim ConsumerGroupClaim) error { + msgs := claim.Messages() + for { + select { + case <-session.Context().Done(): + return session.Context().Err() + case msg, ok := <-msgs: + if !ok { + return nil + } + if msg == nil { + return nil + } + h.mu.Lock() + *h.received = append(*h.received, string(msg.Value)) + h.mu.Unlock() + } + } +} + +func TestConsumerGroupCloseNotRunning(t *testing.T) { + broker := NewInMemoryBroker() + cg := NewInMemoryConsumerGroup(broker, "test-topic", "test-group") + assert.NoError(t, cg.Close()) + assert.NoError(t, cg.Close()) +} + +func TestAsyncProducerCloseChannelPath(t *testing.T) { + broker := NewInMemoryBroker() + producer := NewInMemoryAsyncProducer(broker, 1) + assert.NoError(t, producer.Close()) +} diff --git a/util/kafka/kafka_consumer.go b/util/kafka/kafka_consumer.go index cffa44df15..1237185997 100644 --- a/util/kafka/kafka_consumer.go +++ b/util/kafka/kafka_consumer.go @@ -12,37 +12,25 @@ import ( "syscall" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/teranode/errors" "github.com/bsv-blockchain/teranode/settings" "github.com/bsv-blockchain/teranode/ulogger" "github.com/bsv-blockchain/teranode/util" inmemorykafka "github.com/bsv-blockchain/teranode/util/kafka/in_memory_kafka" - "github.com/bsv-blockchain/teranode/util/retry" + "github.com/twmb/franz-go/pkg/kgo" + "github.com/twmb/franz-go/pkg/kmsg" ) const memoryScheme = "memory" -// saramaLoggerAdapter adapts ulogger.Logger to sarama.StdLogger interface -type saramaLoggerAdapter struct { - logger ulogger.Logger -} - -func (s *saramaLoggerAdapter) Print(v ...interface{}) { - s.logger.Infof("[SARAMA] %v", v...) -} - -func (s *saramaLoggerAdapter) Printf(format string, v ...interface{}) { - s.logger.Infof("[SARAMA] "+format, v...) -} - -func (s *saramaLoggerAdapter) Println(v ...interface{}) { - s.logger.Infof("[SARAMA] %v", v...) -} - -// KafkaMessage wraps sarama.ConsumerMessage to provide additional functionality. +// KafkaMessage represents a Kafka message with all necessary fields. type KafkaMessage struct { - sarama.ConsumerMessage + Key []byte + Value []byte + Topic string + Partition int32 + Offset int64 + Timestamp time.Time } // KafkaConsumerGroupI defines the interface for Kafka consumer group operations. @@ -77,11 +65,11 @@ type KafkaConsumerConfig struct { Replay bool // Whether to replay messages from the beginning // Timeout configuration (query params: maxProcessingTime, sessionTimeout, heartbeatInterval, rebalanceTimeout, channelBufferSize, consumerTimeout) - MaxProcessingTime time.Duration // Max time to process a message before Sarama stops fetching (Sarama default: 100ms) - SessionTimeout time.Duration // Time broker waits for heartbeat before considering consumer dead (Sarama default: 10s) - HeartbeatInterval time.Duration // Frequency of heartbeats to broker (Sarama default: 3s) - RebalanceTimeout time.Duration // Max time for all consumers to join rebalance (Sarama default: 60s) - ChannelBufferSize int // Number of messages buffered in internal channels (Sarama default: 256) + MaxProcessingTime time.Duration // Max time to process a message (default: 100ms) + SessionTimeout time.Duration // Time broker waits for heartbeat before considering consumer dead (default: 10s) + HeartbeatInterval time.Duration // Frequency of heartbeats to broker (default: 3s) + RebalanceTimeout time.Duration // Max time for all consumers to join rebalance (default: 60s) + ChannelBufferSize int // Number of messages buffered in internal channels (default: 256) ConsumerTimeout time.Duration // Max time without messages before watchdog triggers recovery (default: 90s) // OffsetReset controls what to do when offset is out of range (query param: offsetReset) @@ -96,18 +84,10 @@ type KafkaConsumerConfig struct { TLSKeyFile string // Path to client key file // Debug logging - EnableDebugLogging bool // Enable verbose Sarama (Kafka client) debug logging + EnableDebugLogging bool // Enable verbose debug logging } -// consumeWatchdog monitors Consume() state to detect when stuck in RefreshMetadata and triggers force recovery. -// When stuck is detected (Consume() started but Setup() not called for 90s), it forces recovery by -// closing the consumer group and recreating it. This simulates what happens when Kafka server restarts. -// -// The watchdog tracks two scenarios: -// 1. Consume() called but Setup() never called (stuck in RefreshMetadata before joining group) -// 2. Consume() returns with error, retry loop attempts to call Consume() again, but it hangs -// -// Note: Offset out of range errors are now handled immediately by the error handler, not by the watchdog. +// consumeWatchdog monitors Consume() state to detect when stuck and triggers force recovery. type consumeWatchdog struct { consumeStartTime atomic.Value // time.Time - when Consume() was called setupCalledTime atomic.Value // time.Time - when Setup() was called @@ -152,8 +132,6 @@ func (w *consumeWatchdog) isStuckInRefreshMetadata(threshold time.Duration) (boo return duration > threshold, duration } -// isStuckAfterError detects when Consume() returned with an error, the retry loop is attempting -// to call Consume() again, but it's been stuck for longer than the threshold without Setup() being called. // This catches the case where offset errors cause Consume() to hang in RefreshMetadata on retry. func (w *consumeWatchdog) isStuckAfterError(threshold time.Duration) (bool, time.Duration) { // Check if Consume() has ended (returned with error or success) @@ -192,26 +170,26 @@ func (w *consumeWatchdog) isStuckAfterError(threshold time.Duration) (bool, time return duration > threshold, duration } -// KafkaConsumerGroup implements KafkaConsumerGroupI interface. +// KafkaConsumerGroup implements KafkaConsumerGroupI interface using franz-go. type KafkaConsumerGroup struct { - Config KafkaConsumerConfig - ConsumerGroup sarama.ConsumerGroup - cancel atomic.Value - watchdog *consumeWatchdog // Monitors for stuck RefreshMetadata and triggers force recovery - - // For force recovery when consumer is stuck - consumerMu sync.Mutex // Protects consumer recreation - saramaConfig *sarama.Config // Stored config for recreating consumer + Config KafkaConsumerConfig + client *kgo.Client + cancel atomic.Value + watchdog *consumeWatchdog + clientMu sync.Mutex + clientOpts []kgo.Opt + + // For in-memory support + inMemoryConsumer *inmemorykafka.InMemoryConsumerGroup + isInMemory bool } -// validateTimeoutConfig validates that timeout configuration follows Sarama constraints +// validateTimeoutConfig validates that timeout configuration follows constraints func validateTimeoutConfig(cfg KafkaConsumerConfig) error { - // Only validate if custom timeouts are set (non-zero) if cfg.HeartbeatInterval <= 0 || cfg.SessionTimeout <= 0 { - return nil // Using Sarama defaults, which are already valid + return nil // Using defaults, which are already valid } - // Sarama requires: SessionTimeout >= 3 * HeartbeatInterval if cfg.SessionTimeout < 3*cfg.HeartbeatInterval { return errors.NewConfigurationError( "invalid Kafka consumer timeout configuration for topic %s: sessionTimeout (%v) must be >= 3 * heartbeatInterval (%v). Got ratio: %.2fx", @@ -226,8 +204,6 @@ func validateTimeoutConfig(cfg KafkaConsumerConfig) error { } // NewKafkaConsumerGroupFromURL creates a new KafkaConsumerGroup from a URL. -// This is a convenience function for production code that extracts settings from kafkaSettings. -// For tests, use NewKafkaConsumerGroup directly with a manually constructed config. func NewKafkaConsumerGroupFromURL(logger ulogger.Logger, url *url.URL, consumerGroupID string, autoCommit bool, kafkaSettings *settings.KafkaSettings) (*KafkaConsumerGroup, error) { if url == nil { return nil, errors.NewConfigurationError("missing kafka url") @@ -235,31 +211,26 @@ func NewKafkaConsumerGroupFromURL(logger ulogger.Logger, url *url.URL, consumerG partitions := util.GetQueryParamInt(url, "partitions", 1) - // Generate a unique group ID for the txmeta Kafka listener, to ensure that each instance of this service will process all txmeta messages. - // This is necessary because the txmeta messages are used to populate the txmeta cache, which is shared across all instances of this service. - // groupID := topic + "-" + uuid.New().String() - - // AutoCommitEnabled: - // txMetaCache : true, we CAN miss. - // rejected txs : true, we CAN miss. - // subtree validation : false. - // block persister : false. - // block validation: false. - - // Extract timeout configuration from URL query parameters (in milliseconds) - // Defaults match Sarama's defaults - can be overridden per-topic for slow processing (e.g., subtree validation) - maxProcessingTimeMs := util.GetQueryParamInt(url, "maxProcessingTime", 100) // Sarama default: 100ms - sessionTimeoutMs := util.GetQueryParamInt(url, "sessionTimeout", 10000) // Sarama default: 10s - heartbeatIntervalMs := util.GetQueryParamInt(url, "heartbeatInterval", 3000) // Sarama default: 3s - rebalanceTimeoutMs := util.GetQueryParamInt(url, "rebalanceTimeout", 60000) // Sarama default: 60s - channelBufferSize := util.GetQueryParamInt(url, "channelBufferSize", 256) // Sarama default: 256 - consumerTimeoutMs := util.GetQueryParamInt(url, "consumerTimeout", 90000) // Default: 90s (watchdog timeout for no messages) - - // Extract offset reset strategy (how to handle offset out of range errors) - // Values: "latest" (default), "earliest", or "" (empty uses Replay setting) + // AutoCommitEnabled: whether the consumer commits offsets automatically after processing. + // Per-topic semantics matter for correctness and at-least-once vs best-effort delivery: + // - txMetaCache: true, we CAN miss (best-effort populating cache). + // - rejected txs: true, we CAN miss. + // - subtree validation: false (at-least-once). + // - block persister: false. + // - block validation: false. + + // Extract timeout configuration from URL query parameters (in milliseconds). + // Defaults match common Kafka client defaults; can be overridden per-topic for slow processing (e.g. subtree validation). + maxProcessingTimeMs := util.GetQueryParamInt(url, "maxProcessingTime", 100) + sessionTimeoutMs := util.GetQueryParamInt(url, "sessionTimeout", 10000) + heartbeatIntervalMs := util.GetQueryParamInt(url, "heartbeatInterval", 3000) + rebalanceTimeoutMs := util.GetQueryParamInt(url, "rebalanceTimeout", 60000) + channelBufferSize := util.GetQueryParamInt(url, "channelBufferSize", 256) + consumerTimeoutMs := util.GetQueryParamInt(url, "consumerTimeout", 90000) + + // Offset reset strategy: how to handle offset-out-of-range (e.g. "latest", "earliest", or "" for default/Replay). offsetReset := url.Query().Get("offsetReset") - // Extract TLS and debug logging settings from kafkaSettings (if provided) var enableTLS, tlsSkipVerify, enableDebugLogging bool var tlsCAFile, tlsCertFile, tlsKeyFile string if kafkaSettings != nil { @@ -272,25 +243,21 @@ func NewKafkaConsumerGroupFromURL(logger ulogger.Logger, url *url.URL, consumerG } consumerConfig := KafkaConsumerConfig{ - Logger: logger, - URL: url, - BrokersURL: strings.Split(url.Host, ","), - Topic: strings.TrimPrefix(url.Path, "/"), - Partitions: partitions, - ConsumerGroupID: consumerGroupID, - AutoCommitEnabled: autoCommit, - // default is start from beginning - // do not ignore everything that is already queued, this is the case where we start a new consumer group for the first time - // maybe it shouldn't be called replay because it suggests that the consume will always replay messages from the beginning - Replay: util.GetQueryParamInt(url, "replay", 1) == 1, - MaxProcessingTime: time.Duration(maxProcessingTimeMs) * time.Millisecond, - SessionTimeout: time.Duration(sessionTimeoutMs) * time.Millisecond, - HeartbeatInterval: time.Duration(heartbeatIntervalMs) * time.Millisecond, - RebalanceTimeout: time.Duration(rebalanceTimeoutMs) * time.Millisecond, - ChannelBufferSize: channelBufferSize, - ConsumerTimeout: time.Duration(consumerTimeoutMs) * time.Millisecond, - OffsetReset: offsetReset, - // TLS/Auth configuration + Logger: logger, + URL: url, + BrokersURL: strings.Split(url.Host, ","), + Topic: strings.TrimPrefix(url.Path, "/"), + Partitions: partitions, + ConsumerGroupID: consumerGroupID, + AutoCommitEnabled: autoCommit, + Replay: util.GetQueryParamInt(url, "replay", 1) == 1, + MaxProcessingTime: time.Duration(maxProcessingTimeMs) * time.Millisecond, + SessionTimeout: time.Duration(sessionTimeoutMs) * time.Millisecond, + HeartbeatInterval: time.Duration(heartbeatIntervalMs) * time.Millisecond, + RebalanceTimeout: time.Duration(rebalanceTimeoutMs) * time.Millisecond, + ChannelBufferSize: channelBufferSize, + ConsumerTimeout: time.Duration(consumerTimeoutMs) * time.Millisecond, + OffsetReset: offsetReset, EnableTLS: enableTLS, TLSSkipVerify: tlsSkipVerify, TLSCAFile: tlsCAFile, @@ -299,7 +266,6 @@ func NewKafkaConsumerGroupFromURL(logger ulogger.Logger, url *url.URL, consumerG EnableDebugLogging: enableDebugLogging, } - // Validate timeout configuration if err := validateTimeoutConfig(consumerConfig); err != nil { return nil, err } @@ -309,71 +275,57 @@ func NewKafkaConsumerGroupFromURL(logger ulogger.Logger, url *url.URL, consumerG // Close gracefully shuts down the Kafka consumer group func (k *KafkaConsumerGroup) Close() error { - // Check if the consumer group was properly initialized if k == nil || k.Config.Logger == nil { return nil } k.Config.Logger.Infof("[Kafka] %s: initiating shutdown of consumer group for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) - // cancel the context first to signal all consumers to stop if k.cancel.Load() != nil { k.Config.Logger.Debugf("[Kafka] %s: canceling context for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) k.cancel.Load().(context.CancelFunc)() } - // Then close the consumer group - if k.ConsumerGroup != nil { - if err := k.ConsumerGroup.Close(); err != nil { - k.Config.Logger.Errorf("[Kafka] %s: error closing consumer group for topic %s: %v", k.Config.ConsumerGroupID, k.Config.Topic, err) - return err + if k.isInMemory { + if k.inMemoryConsumer != nil { + if err := k.inMemoryConsumer.Close(); err != nil { + k.Config.Logger.Errorf("[Kafka] %s: error closing in-memory consumer for topic %s: %v", k.Config.ConsumerGroupID, k.Config.Topic, err) + return err + } + } + } else { + if k.client != nil { + k.client.Close() + k.Config.Logger.Infof("[Kafka] %s: successfully closed consumer group for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) } - - k.Config.Logger.Infof("[Kafka] %s: successfully closed consumer group for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) } return nil } -// forceRecovery forces recovery of a stuck consumer by closing and recreating the consumer group. -// This simulates what happens when you restart the Kafka server - the connection closes, -// the stuck Consume() returns with an error, and the retry loop creates a fresh consumer. -// -// This is safe because: -// - Close() unblocks the stuck Consume() call by closing internal connections -// - We use a mutex to prevent concurrent recovery attempts -// - The new consumer group is created with the same configuration -// - The retry loop automatically uses the new consumer on the next iteration +// forceRecovery forces recovery of a stuck consumer by closing and recreating the client. func (k *KafkaConsumerGroup) forceRecovery() error { - // Lock to prevent concurrent recovery attempts - k.consumerMu.Lock() - defer k.consumerMu.Unlock() + k.clientMu.Lock() + defer k.clientMu.Unlock() k.Config.Logger.Warnf("[kafka-watchdog] Forcing recovery for topic %s - closing stuck consumer and creating new one", k.Config.Topic) - // Close the existing consumer group - this will cause stuck Consume() to return - if k.ConsumerGroup != nil { - if err := k.ConsumerGroup.Close(); err != nil { - k.Config.Logger.Errorf("[kafka-watchdog] Error closing stuck consumer group: %v", err) - // Continue anyway - we'll try to create a new one - } + if k.client != nil { + k.client.Close() } - // Create a new consumer group with the same configuration - newConsumerGroup, err := sarama.NewConsumerGroup(k.Config.BrokersURL, k.Config.ConsumerGroupID, k.saramaConfig) + newClient, err := kgo.NewClient(k.clientOpts...) if err != nil { - return errors.NewServiceError("failed to recreate consumer group for %s", k.Config.Topic, err) + return errors.NewServiceError("failed to recreate consumer client for %s", k.Config.Topic, err) } - // Replace the consumer group atomically - k.ConsumerGroup = newConsumerGroup - + k.client = newClient k.Config.Logger.Infof("[kafka-watchdog] Successfully recreated consumer group for topic %s", k.Config.Topic) + return nil } -// NewKafkaConsumerGroup creates a new Kafka consumer group -// We DO NOT read autocommit parameter from the URL because the handler func has specific error handling logic. +// NewKafkaConsumerGroup creates a new Kafka consumer group using franz-go func NewKafkaConsumerGroup(cfg KafkaConsumerConfig) (*KafkaConsumerGroup, error) { if cfg.URL == nil { return nil, errors.NewConfigurationError("kafka URL is not set", nil) @@ -387,109 +339,43 @@ func NewKafkaConsumerGroup(cfg KafkaConsumerConfig) (*KafkaConsumerGroup, error) return nil, errors.NewConfigurationError("group ID is not set", nil) } - cfg.Logger.Infof("Starting Kafka consumer for topic %s in group %s (concurrency based on partition count)", cfg.Topic, cfg.ConsumerGroupID) + cfg.Logger.Infof("Starting Kafka consumer for topic %s in group %s", cfg.Topic, cfg.ConsumerGroupID) - // Initialize Prometheus metrics (idempotent) InitPrometheusMetrics() - var consumerGroup sarama.ConsumerGroup - + // Handle in-memory case if cfg.URL.Scheme == memoryScheme { - // --- Use the in-memory implementation --- - broker := inmemorykafka.GetSharedBroker() // Get the shared broker instance - // Create the InMemoryConsumerGroup which implements sarama.ConsumerGroup - consumerGroup = inmemorykafka.NewInMemoryConsumerGroup(broker, cfg.Topic, cfg.ConsumerGroupID) + broker := inmemorykafka.GetSharedBroker() + consumerGroup := inmemorykafka.NewInMemoryConsumerGroup(broker, cfg.Topic, cfg.ConsumerGroupID) cfg.Logger.Infof("Using in-memory Kafka consumer group") - // No error expected from creation here, unless topic/group were invalid (checked above) return &KafkaConsumerGroup{ - Config: cfg, - ConsumerGroup: consumerGroup, - watchdog: &consumeWatchdog{}, // Initialize watchdog for in-memory consumer + Config: cfg, + inMemoryConsumer: consumerGroup, + isInMemory: true, + watchdog: &consumeWatchdog{}, }, nil } - // --- Use the real Sarama implementation --- - var err error - - config := sarama.NewConfig() - config.Consumer.Return.Errors = true - - // Enable Sarama debug logging for consumer diagnostics (only if configured) - // By default, SARAMA logs are too verbose and not needed in production - if cfg.EnableDebugLogging { - sarama.Logger = &saramaLoggerAdapter{logger: cfg.Logger} - cfg.Logger.Infof("Kafka debug logging enabled for consumer group %s", cfg.ConsumerGroupID) - } - - // Configure consumer group timeouts from URL query parameters to prevent partition abandonment during slow processing - - // Only override Sarama defaults if explicitly set (non-zero values) - if cfg.MaxProcessingTime > 0 { - config.Consumer.MaxProcessingTime = cfg.MaxProcessingTime - } - - if cfg.SessionTimeout > 0 { - config.Consumer.Group.Session.Timeout = cfg.SessionTimeout - } - - if cfg.HeartbeatInterval > 0 { - config.Consumer.Group.Heartbeat.Interval = cfg.HeartbeatInterval - } - - if cfg.RebalanceTimeout > 0 { - config.Consumer.Group.Rebalance.Timeout = cfg.RebalanceTimeout - } - - if cfg.ChannelBufferSize > 0 { - config.ChannelBufferSize = cfg.ChannelBufferSize - } - - // Configure network and metadata timeouts to prevent hanging when broker is unavailable - // See: https://github.com/IBM/sarama/issues/2991 - RefreshMetadata doesn't respect context cancellation - // These settings ensure metadata fetch fails quickly instead of hanging forever - config.Net.DialTimeout = 10 * time.Second // Max time to establish TCP connection - config.Net.ReadTimeout = 10 * time.Second // Max time waiting for response from broker - config.Net.WriteTimeout = 10 * time.Second // Max time for write operations - config.Metadata.Timeout = 30 * time.Second // Overall timeout for metadata operations - config.Metadata.Retry.Max = 3 // Retry metadata fetch 3 times - config.Metadata.Retry.Backoff = 2 * time.Second // Wait 2s between metadata retries - - // Configure authentication if TLS is enabled - if cfg.EnableTLS { - cfg.Logger.Debugf("Configuring Kafka TLS authentication - EnableTLS: %v, SkipVerify: %v, CA: %s, Cert: %s", - cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile) - - if err := configureKafkaAuthFromFields(config, cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile, cfg.TLSKeyFile); err != nil { - return nil, errors.NewConfigurationError("failed to configure Kafka authentication", err) - } - - cfg.Logger.Debugf("Successfully configured Kafka TLS authentication for consumer group %s", cfg.ConsumerGroupID) - } - - // https://github.com/IBM/sarama/issues/1689 - // https://github.com/IBM/sarama/pull/1699 - // Default value for config.Consumer.Offsets.AutoCommit.Enable is true. - if !cfg.AutoCommitEnabled { - config.Consumer.Offsets.AutoCommit.Enable = false + // Build franz-go client options + opts := []kgo.Opt{ + kgo.SeedBrokers(cfg.BrokersURL...), + kgo.ConsumerGroup(cfg.ConsumerGroupID), + kgo.ConsumeTopics(cfg.Topic), + kgo.FetchMaxWait(cfg.MaxProcessingTime), + kgo.SessionTimeout(cfg.SessionTimeout), + kgo.HeartbeatInterval(cfg.HeartbeatInterval), + kgo.RebalanceTimeout(cfg.RebalanceTimeout), } // Configure offset reset behavior - // This determines what offset to use when: - // 1. There is no initial offset (new consumer group) - // 2. Current offset is out of range (offset expired due to retention) - // - // NOTE: ResetInvalidOffsets is true by default in Sarama (since v1.38.1) - // BUT it only works during consumer initialization. If offset becomes invalid - // during active consumption, the partition consumer will shut down and trigger - // a rebalance, then restart with Consumer.Offsets.Initial. if cfg.OffsetReset != "" { switch strings.ToLower(cfg.OffsetReset) { case "latest", "newest": - config.Consumer.Offsets.Initial = sarama.OffsetNewest + opts = append(opts, kgo.ConsumeResetOffset(kgo.NewOffset().AtEnd())) cfg.Logger.Infof("[Kafka] %s: configured to reset to latest offset when out of range", cfg.Topic) case "earliest", "oldest": - config.Consumer.Offsets.Initial = sarama.OffsetOldest + opts = append(opts, kgo.ConsumeResetOffset(kgo.NewOffset().AtStart())) cfg.Logger.Infof("[Kafka] %s: configured to reset to earliest offset when out of range", cfg.Topic) default: return nil, errors.NewConfigurationError( @@ -499,29 +385,41 @@ func NewKafkaConsumerGroup(cfg KafkaConsumerConfig) (*KafkaConsumerGroup, error) ) } } else if cfg.Replay { - // Legacy behavior: Replay setting controls initial offset - // defaults to OffsetNewest - config.Consumer.Offsets.Initial = sarama.OffsetOldest + opts = append(opts, kgo.ConsumeResetOffset(kgo.NewOffset().AtStart())) + cfg.Logger.Infof("[Kafka] %s: replay enabled, configured to consume from earliest offset", cfg.Topic) } - clusterAdmin, err := sarama.NewClusterAdmin(cfg.BrokersURL, config) - if err != nil { - return nil, errors.NewConfigurationError("error while creating cluster admin", err) + // Configure auto-commit + if !cfg.AutoCommitEnabled { + opts = append(opts, kgo.DisableAutoCommit()) + } + + // Configure TLS if enabled + if cfg.EnableTLS { + tlsConfig, err := buildFranzTLSConfig(cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile, cfg.TLSKeyFile) + if err != nil { + return nil, errors.NewConfigurationError("failed to configure TLS for kafka consumer", err) + } + opts = append(opts, kgo.DialTLSConfig(tlsConfig)) } - defer func(clusterAdmin sarama.ClusterAdmin) { - _ = clusterAdmin.Close() - }(clusterAdmin) - consumerGroup, err = sarama.NewConsumerGroup(cfg.BrokersURL, cfg.ConsumerGroupID, config) + // Enable debug logging if configured + if cfg.EnableDebugLogging { + opts = append(opts, kgo.WithLogger(&franzLoggerAdapter{logger: cfg.Logger})) + cfg.Logger.Infof("Kafka debug logging enabled for consumer group %s", cfg.ConsumerGroupID) + } + + // Create the franz-go client + client, err := kgo.NewClient(opts...) if err != nil { - return nil, errors.NewServiceError("failed to create Kafka consumer group for %s", cfg.Topic, err) + return nil, errors.NewServiceError("failed to create Kafka consumer client for %s", cfg.Topic, err) } return &KafkaConsumerGroup{ - Config: cfg, - ConsumerGroup: consumerGroup, - watchdog: &consumeWatchdog{}, // Initialize watchdog - saramaConfig: config, // Store config for force recovery + Config: cfg, + client: client, + watchdog: &consumeWatchdog{}, + clientOpts: opts, }, nil } @@ -539,11 +437,10 @@ type consumerOptions struct { } // WithRetryAndMoveOn configures error behaviour for the consumer function -// After max retries, the error is logged and the message is skipped func WithRetryAndMoveOn(maxRetries, backoffMultiplier int, backoffDurationType time.Duration) ConsumerOption { return func(o *consumerOptions) { o.withRetryAndMoveOn = true - o.withRetryAndStop = false // can't have both options set + o.withRetryAndStop = false o.maxRetries = maxRetries o.backoffMultiplier = backoffMultiplier o.backoffDurationType = backoffDurationType @@ -551,11 +448,9 @@ func WithRetryAndMoveOn(maxRetries, backoffMultiplier int, backoffDurationType t } // WithRetryAndStop configures error behaviour for the consumer function -// After max retries, the error is logged and message consumption stops -// Use this when you cannot proceed with the next message in the queue func WithRetryAndStop(maxRetries, backoffMultiplier int, backoffDurationType time.Duration, stopFn func()) ConsumerOption { return func(o *consumerOptions) { - o.withRetryAndMoveOn = false // can't have both options set + o.withRetryAndMoveOn = false o.withRetryAndStop = true o.maxRetries = maxRetries o.backoffMultiplier = backoffMultiplier @@ -565,8 +460,6 @@ func WithRetryAndStop(maxRetries, backoffMultiplier int, backoffDurationType tim } // WithLogErrorAndMoveOn configures error behaviour for the consumer function -// When an error occurs, it is logged and the message is skipped without any retries -// Use this for non-critical messages where you want visibility of failures but don't want to block processing func WithLogErrorAndMoveOn() ConsumerOption { return func(o *consumerOptions) { o.withLogErrorAndMoveOn = true @@ -580,6 +473,12 @@ func (k *KafkaConsumerGroup) Start(ctx context.Context, consumerFn func(message return } + // Handle in-memory case + if k.isInMemory { + k.startInMemory(ctx, consumerFn, opts...) + return + } + options := &consumerOptions{ withRetryAndMoveOn: false, withRetryAndStop: false, @@ -592,301 +491,104 @@ func (k *KafkaConsumerGroup) Start(ctx context.Context, consumerFn func(message opt(options) } - if options.withRetryAndMoveOn { - originalFn := consumerFn - consumerFn = func(msg *KafkaMessage) error { - _, err := retry.Retry(ctx, k.Config.Logger, func() (any, error) { - return struct{}{}, originalFn(msg) - }, - retry.WithRetryCount(options.maxRetries), - retry.WithBackoffMultiplier(options.backoffMultiplier), - retry.WithBackoffDurationType(options.backoffDurationType), - retry.WithMessage("[kafka_consumer] retrying processing kafka message...")) - - // if we can't process the message, log the error and skip to the next message - if err != nil { - key := "" - if msg != nil && msg.Key != nil { - key = string(msg.Key) - } - - k.Config.Logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), skipping", k.Config.Topic, key) - } - - return nil // give up and move on - } - } - - if options.withRetryAndStop { - originalFn := consumerFn - consumerFn = func(msg *KafkaMessage) error { - _, err := retry.Retry(ctx, k.Config.Logger, func() (any, error) { - return struct{}{}, originalFn(msg) - }, - retry.WithRetryCount(options.maxRetries), - retry.WithBackoffMultiplier(options.backoffMultiplier), - retry.WithBackoffDurationType(options.backoffDurationType), - retry.WithMessage("[kafka_consumer] retrying processing kafka message...")) - - // if we can't process the message, log the error and stop consuming any more messages - if err != nil { - if options.stopFn != nil { - key := "" - if msg != nil && msg.Key != nil { - key = string(msg.Key) - } - - k.Config.Logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), stopping", k.Config.Topic, key) - options.stopFn() - } else { - c := k.ConsumerGroup - k.ConsumerGroup = nil - - _ = c.Close() - - panic("error processing kafka message, with no stop function provided") - } - } - - return nil - } - } - - if options.withLogErrorAndMoveOn { - originalFn := consumerFn - consumerFn = func(msg *KafkaMessage) error { - err := originalFn(msg) - - // if we can't process the message, log the error and skip to the next message - if err != nil { - key := "" - if msg != nil && msg.Key != nil { - key = string(msg.Key) - } - - k.Config.Logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), skipping: %v", k.Config.Topic, key, err) - } - - return nil // always move on to the next message - } - } + // Apply retry/error handling wrappers + consumerFn = wrapConsumerFn(ctx, k.Config.Logger, k.Config.Topic, consumerFn, options) go func() { internalCtx, cancel := context.WithCancel(ctx) k.cancel.Store(cancel) defer cancel() - go func() { - for { - select { - case <-ctx.Done(): - return - default: - // Safely read current consumer (might be replaced by forceRecovery or offset reset) - k.consumerMu.Lock() - currentConsumer := k.ConsumerGroup - k.consumerMu.Unlock() - - if currentConsumer == nil { - time.Sleep(100 * time.Millisecond) - continue - } - - // Read from current consumer's error channel - select { - case err, ok := <-currentConsumer.Errors(): - if !ok { - // Channel closed (consumer was closed), loop to get new consumer - time.Sleep(100 * time.Millisecond) - continue - } - if err != nil { - // Check if this is an offset out of range error - // This happens when committed offset has been deleted due to retention - // Sarama's built-in offsetReset=latest will handle the reset when we recreate the consumer - if errors.Is(err, sarama.ErrOffsetOutOfRange) || strings.Contains(err.Error(), "offset out of range") { - k.Config.Logger.Errorf("[kafka-consumer-error] Offset out of range error detected: %v. Recreating consumer to trigger Sarama's offset reset...", err) - - // Close current consumer and recreate - // Sarama will automatically reset to latest offset per offsetReset=latest config - if recErr := k.forceRecovery(); recErr != nil { - k.Config.Logger.Errorf("[kafka-consumer-error] Force recovery after offset error failed: %v", recErr) - } else { - k.Config.Logger.Infof("[kafka-consumer-error] Successfully recovered from offset out of range error. Sarama will auto-reset to latest offset.") - } - - continue - } - - // Don't log context cancellation as an error - it's expected during shutdown - if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "context canceled") { - k.Config.Logger.Debugf("Kafka consumer shutdown: %v", err) - } else { - k.Config.Logger.Errorf("Kafka consumer error: %v", err) - } - } - case <-ctx.Done(): - return - } - } - } - }() - - topics := []string{k.Config.Topic} - - // Watchdog: Active monitoring and recovery for stuck Consume() calls - // This watchdog detects when Consume() is stuck in RefreshMetadata (Sarama bug #2991) - // and triggers force recovery by closing and recreating the consumer group. - // This simulates what happens when you restart the Kafka server in production. + // Watchdog goroutine const watchdogCheckInterval = 30 * time.Second - - // Use configured timeout or default to 90s watchdogStuckThreshold := k.Config.ConsumerTimeout if watchdogStuckThreshold == 0 { watchdogStuckThreshold = 90 * time.Second } - go func() { - ticker := time.NewTicker(watchdogCheckInterval) - defer ticker.Stop() + go k.runWatchdog(internalCtx, watchdogCheckInterval, watchdogStuckThreshold) - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - // Check for initial RefreshMetadata hang (before first successful Setup) - stuck, duration := k.watchdog.isStuckInRefreshMetadata(watchdogStuckThreshold) - if stuck { - // Get watchdog state for logging - startTime, _ := k.watchdog.consumeStartTime.Load().(time.Time) - setupTime, _ := k.watchdog.setupCalledTime.Load().(time.Time) - - k.Config.Logger.Errorf( - "[kafka-consumer-watchdog][topic:%s][group:%s] Consume() stuck for %v (threshold: %v). "+ - "StartTime=%v SetupCalled=%v. Forcing recovery...", - k.Config.Topic, k.Config.ConsumerGroupID, duration, watchdogStuckThreshold, - startTime.Format(time.RFC3339), setupTime.IsZero(), - ) - - // Record metrics - prometheusKafkaWatchdogRecoveryAttempts.WithLabelValues(k.Config.Topic, k.Config.ConsumerGroupID).Inc() - prometheusKafkaWatchdogStuckDuration.WithLabelValues(k.Config.Topic).Observe(duration.Seconds()) - - // Attempt force recovery - if err := k.forceRecovery(); err != nil { - k.Config.Logger.Errorf("[kafka-consumer-watchdog][topic:%s] Force recovery failed: %v. Will retry on next watchdog check.", k.Config.Topic, err) - } else { - k.Config.Logger.Infof("[kafka-consumer-watchdog][topic:%s] Force recovery successful. Consumer should resume.", k.Config.Topic) - // Reset watchdog state - k.watchdog.markConsumeEnded() - } - continue - } + // Main consume loop + go func() { + k.Config.Logger.Debugf("[kafka] starting consumer for group %s on topic %s", k.Config.ConsumerGroupID, k.Config.Topic) - // Check for hang after error/rebalance (Consume() returned, but retry is stuck) - stuckAfterError, durationAfterError := k.watchdog.isStuckAfterError(watchdogStuckThreshold) - if stuckAfterError { - // Get watchdog state for logging - startTime, _ := k.watchdog.consumeStartTime.Load().(time.Time) - setupTime, _ := k.watchdog.setupCalledTime.Load().(time.Time) - endTime, _ := k.watchdog.consumeEndTime.Load().(time.Time) - - k.Config.Logger.Errorf( - "[kafka-consumer-watchdog][topic:%s][group:%s] Consume() stuck after error/rebalance for %v (threshold: %v). "+ - "StartTime=%v EndTime=%v SetupCalled=%v. Forcing recovery...", - k.Config.Topic, k.Config.ConsumerGroupID, durationAfterError, watchdogStuckThreshold, - startTime.Format(time.RFC3339), endTime.Format(time.RFC3339), setupTime.IsZero(), - ) - - // Record metrics - prometheusKafkaWatchdogRecoveryAttempts.WithLabelValues(k.Config.Topic, k.Config.ConsumerGroupID).Inc() - prometheusKafkaWatchdogStuckDuration.WithLabelValues(k.Config.Topic).Observe(durationAfterError.Seconds()) - - // Attempt force recovery - if err := k.forceRecovery(); err != nil { - k.Config.Logger.Errorf("[kafka-consumer-watchdog][topic:%s] Force recovery failed: %v. Will retry on next watchdog check.", k.Config.Topic, err) - } else { - k.Config.Logger.Infof("[kafka-consumer-watchdog][topic:%s] Force recovery successful after error. Consumer should resume.", k.Config.Topic) - // Reset watchdog state - k.watchdog.markConsumeEnded() - } - continue - } - } - } - }() + commitTicker := time.NewTicker(time.Minute) + defer commitTicker.Stop() - // Only spawn one consumer goroutine - Sarama handles partition concurrency internally - go func() { - k.Config.Logger.Debugf("[kafka] starting consumer for group %s on topic %s (partition-based concurrency)", k.Config.ConsumerGroupID, topics[0]) + uncommittedRecords := make([]*kgo.Record, 0) + var uncommittedMu sync.Mutex for { select { case <-internalCtx.Done(): - // Context cancelled, exit goroutine + k.commitRecords(uncommittedRecords) return default: - // Mark that we're attempting to start Consume() (before RefreshMetadata) k.watchdog.markConsumeStarted() - k.Config.Logger.Debugf("[kafka] Consumer for group %s calling Consume() on topic %s", k.Config.ConsumerGroupID, k.Config.Topic) - consumeStart := time.Now() - - // Get current consumer group (might be replaced by force recovery or offset reset) - // Use mutex to ensure we don't read while forceRecovery() is replacing it - k.consumerMu.Lock() - currentConsumer := k.ConsumerGroup - k.consumerMu.Unlock() - - if currentConsumer == nil { - // Consumer is nil - likely being recreated by error handler after offset error - // Wait for recovery to create new consumer, then retry - k.Config.Logger.Debugf("[kafka] Consumer group is nil for topic %s, waiting for recovery to create new consumer...", k.Config.Topic) - time.Sleep(1 * time.Second) - continue // Retry with new consumer + k.clientMu.Lock() + currentClient := k.client + k.clientMu.Unlock() + + if currentClient == nil { + time.Sleep(100 * time.Millisecond) + continue } - // CRITICAL: Create a NEW context for each Consume() attempt - // When forceRecovery() closes the consumer, Sarama cancels the context passed to Consume() - // If we reuse the same context, the next Consume() call will fail immediately - // We derive from internalCtx so that shutdown still works correctly - consumeCtx, consumeCancel := context.WithCancel(internalCtx) - err := currentConsumer.Consume(consumeCtx, topics, NewKafkaConsumer(k.Config, consumerFn, k.watchdog)) - consumeCancel() // Always clean up the context when Consume() returns + fetches := currentClient.PollFetches(internalCtx) + k.watchdog.markSetupCalled() - // Consume() returned - mark as no longer attempting - k.watchdog.markConsumeEnded() - consumeDuration := time.Since(consumeStart) - - if err != nil { - k.Config.Logger.Debugf("[kafka] Consumer for group %s Consume() returned after %v", k.Config.ConsumerGroupID, consumeDuration) + if fetches.IsClientClosed() { + return + } - switch { - case errors.Is(err, sarama.ErrClosedConsumerGroup): - // Check if context is cancelled - if so, this is a normal shutdown - select { - case <-internalCtx.Done(): - k.Config.Logger.Infof("[kafka] Consumer for group %s closed due to context cancellation", k.Config.ConsumerGroupID) + if errs := fetches.Errors(); len(errs) > 0 { + for _, err := range errs { + if errors.Is(err.Err, context.Canceled) { + k.Config.Logger.Debugf("Kafka consumer shutdown: %v", err.Err) return - default: - // Context still active - this might be force recovery, continue loop to use new consumer - k.Config.Logger.Infof("[kafka] Consumer for group %s closed but context still active, retrying with new consumer...", k.Config.ConsumerGroupID) - time.Sleep(1 * time.Second) // Brief pause before retrying } - case errors.Is(err, context.Canceled): - k.Config.Logger.Infof("[kafka] Consumer for group %s cancelled", k.Config.ConsumerGroupID) + k.Config.Logger.Errorf("Kafka consumer error on topic %s partition %d: %v", err.Topic, err.Partition, err.Err) + } + k.watchdog.markConsumeEnded() + continue + } + + fetches.EachRecord(func(record *kgo.Record) { + kafkaMsg := &KafkaMessage{ + Key: record.Key, + Value: record.Value, + Topic: record.Topic, + Partition: record.Partition, + Offset: record.Offset, + Timestamp: record.Timestamp, + } + + if err := consumerFn(kafkaMsg); err != nil { + k.Config.Logger.Errorf("[kafka_consumer] failed to process message (topic: %s, partition: %d, offset: %d): %v", + record.Topic, record.Partition, record.Offset, err) return - default: - // Log error and wait before retrying to prevent tight loop when broker is down - k.Config.Logger.Errorf("Error from consumer: %v (after %v), retrying in 5s...", err, consumeDuration) - time.Sleep(5 * time.Second) } - } else { - // Consume() returned successfully - this is normal (rebalance, coordinator change, etc.) - // Continue looping to call Consume() again - k.Config.Logger.Debugf("[kafka] Consumer for group %s Consume() completed successfully after %v", k.Config.ConsumerGroupID, consumeDuration) + + if !k.Config.AutoCommitEnabled { + uncommittedMu.Lock() + uncommittedRecords = append(uncommittedRecords, record) + uncommittedMu.Unlock() + } + }) + + select { + case <-commitTicker.C: + uncommittedMu.Lock() + if len(uncommittedRecords) > 0 { + k.commitRecords(uncommittedRecords) + uncommittedRecords = uncommittedRecords[:0] + } + uncommittedMu.Unlock() + default: } + + k.watchdog.markConsumeEnded() } } }() @@ -894,218 +596,254 @@ func (k *KafkaConsumerGroup) Start(ctx context.Context, consumerFn func(message signals := make(chan os.Signal, 1) signal.Notify(signals, os.Interrupt, syscall.SIGTERM) - go func() { - select { - case <-signals: - cancel() - case <-internalCtx.Done(): - // Context cancelled, exit gracefully without cancelling again - return - } - }() - select { case <-signals: k.Config.Logger.Infof("[kafka] Received signal, shutting down consumers for group %s", k.Config.ConsumerGroupID) - cancel() // Ensure the context is canceled + cancel() case <-internalCtx.Done(): k.Config.Logger.Infof("[kafka] Context done, shutting down consumer for %s", k.Config.ConsumerGroupID) } - if k.ConsumerGroup != nil { - if err := k.ConsumerGroup.Close(); err != nil { - k.Config.Logger.Errorf("[Kafka] %s: error closing client: %v", k.Config.ConsumerGroupID, err) - } + if k.client != nil { + k.client.Close() } }() } -func (k *KafkaConsumerGroup) BrokersURL() []string { - return k.Config.BrokersURL -} - -// PauseAll suspends fetching from all partitions without triggering a rebalance. -// Heartbeats continue to be sent to the broker, so the consumer remains part of the group. -func (k *KafkaConsumerGroup) PauseAll() { - if k.ConsumerGroup != nil { - k.ConsumerGroup.PauseAll() - k.Config.Logger.Debugf("[Kafka] %s: paused all partitions for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) +// startInMemory handles the in-memory consumer case +func (k *KafkaConsumerGroup) startInMemory(ctx context.Context, consumerFn func(message *KafkaMessage) error, opts ...ConsumerOption) { + options := &consumerOptions{ + maxRetries: 3, + backoffMultiplier: 2, + backoffDurationType: time.Second, } -} - -// ResumeAll resumes all partitions which have been paused. -func (k *KafkaConsumerGroup) ResumeAll() { - if k.ConsumerGroup != nil { - k.ConsumerGroup.ResumeAll() - k.Config.Logger.Debugf("[Kafka] %s: resumed all partitions for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) + for _, opt := range opts { + opt(options) } -} -// KafkaConsumer represents a Sarama consumer group consumer -type KafkaConsumer struct { - consumerClosure func(*KafkaMessage) error - cfg KafkaConsumerConfig - watchdog *consumeWatchdog // Monitors for stuck RefreshMetadata and triggers force recovery -} - -func NewKafkaConsumer(cfg KafkaConsumerConfig, consumerClosureOrNil func(message *KafkaMessage) error, watchdog *consumeWatchdog) *KafkaConsumer { - consumer := &KafkaConsumer{ - consumerClosure: consumerClosureOrNil, - cfg: cfg, - watchdog: watchdog, + handler := &inMemoryConsumerHandler{ + logger: k.Config.Logger, + consumerFn: consumerFn, + options: options, + topic: k.Config.Topic, } - return consumer + go func() { + err := k.inMemoryConsumer.Consume(ctx, []string{k.Config.Topic}, handler) + if err != nil && !errors.Is(err, context.Canceled) { + k.Config.Logger.Errorf("In-memory consumer error: %v", err) + } + }() } -// Setup is run at the beginning of a new session, before ConsumeClaim -func (kc *KafkaConsumer) Setup(sarama.ConsumerGroupSession) error { - // This is called AFTER RefreshMetadata succeeds and consumer joins group - if kc.watchdog != nil { - kc.watchdog.markSetupCalled() - kc.cfg.Logger.Infof("[kafka] Consumer setup completed for topic %s - successfully joined group after RefreshMetadata", kc.cfg.Topic) +// runWatchdog monitors for stuck consumers +func (k *KafkaConsumerGroup) runWatchdog(ctx context.Context, checkInterval, threshold time.Duration) { + ticker := time.NewTicker(checkInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + stuck, duration := k.watchdog.isStuckInRefreshMetadata(threshold) + if stuck { + k.Config.Logger.Errorf("[kafka-consumer-watchdog][topic:%s][group:%s] Consumer stuck for %v. Forcing recovery...", + k.Config.Topic, k.Config.ConsumerGroupID, duration) + + prometheusKafkaWatchdogRecoveryAttempts.WithLabelValues(k.Config.Topic, k.Config.ConsumerGroupID).Inc() + prometheusKafkaWatchdogStuckDuration.WithLabelValues(k.Config.Topic).Observe(duration.Seconds()) + + if err := k.forceRecovery(); err != nil { + k.Config.Logger.Errorf("[kafka-consumer-watchdog][topic:%s] Force recovery failed: %v", k.Config.Topic, err) + } else { + k.Config.Logger.Infof("[kafka-consumer-watchdog][topic:%s] Force recovery successful", k.Config.Topic) + k.watchdog.markConsumeEnded() + } + } + } } - return nil } -// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have exited -func (kc *KafkaConsumer) Cleanup(session sarama.ConsumerGroupSession) error { - kc.cfg.Logger.Infof("[kafka-consumer-cleanup][topic:%s] Session ending - committing offsets and releasing partitions. GenerationID: %d, MemberID: %s", - kc.cfg.Topic, session.GenerationID(), session.MemberID()) +// commitRecords commits the offsets for the given records +func (k *KafkaConsumerGroup) commitRecords(records []*kgo.Record) { + if len(records) == 0 || k.client == nil { + return + } - if !kc.cfg.AutoCommitEnabled { - session.Commit() + offsets := make(map[string]map[int32]kgo.EpochOffset) + for _, r := range records { + if _, ok := offsets[r.Topic]; !ok { + offsets[r.Topic] = make(map[int32]kgo.EpochOffset) + } + offsets[r.Topic][r.Partition] = kgo.EpochOffset{ + Epoch: r.LeaderEpoch, + Offset: r.Offset + 1, + } } - return nil + k.client.CommitOffsets(context.Background(), offsets, func(_ *kgo.Client, _ *kmsg.OffsetCommitRequest, _ *kmsg.OffsetCommitResponse, err error) { + if err != nil { + k.Config.Logger.Errorf("[kafka] Failed to commit offsets: %v", err) + } + }) } -// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages(). -func (kc *KafkaConsumer) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error { - const commitInterval = time.Minute - - messageProcessedSinceLastCommit := false - messagesProcessed := atomic.Uint64{} - - var mu sync.Mutex // Add mutex to protect messageProcessedSinceLastCommit - - // Start a separate goroutine for commit ticker - if !kc.cfg.AutoCommitEnabled { - go func() { - commitTicker := time.NewTicker(commitInterval) - defer commitTicker.Stop() +// BrokersURL returns the list of Kafka broker URLs. +func (k *KafkaConsumerGroup) BrokersURL() []string { + return k.Config.BrokersURL +} - for { - select { - case <-session.Context().Done(): - err := session.Context().Err() - if err != nil { - kc.cfg.Logger.Debugf("[kafka_consumer] Context canceled in commit ticker (topic: %s): %v", - kc.cfg.Topic, err) - } +// PauseAll suspends fetching from all partitions. +func (k *KafkaConsumerGroup) PauseAll() { + if k.isInMemory { + k.inMemoryConsumer.PauseAll() + return + } + if k.client != nil { + k.client.PauseFetchTopics(k.Config.Topic) + k.Config.Logger.Debugf("[Kafka] %s: paused all partitions for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) + } +} - return - case <-commitTicker.C: - mu.Lock() - if messageProcessedSinceLastCommit { - session.Commit() +// ResumeAll resumes all partitions which have been paused. +func (k *KafkaConsumerGroup) ResumeAll() { + if k.isInMemory { + k.inMemoryConsumer.ResumeAll() + return + } + if k.client != nil { + k.client.ResumeFetchTopics(k.Config.Topic) + k.Config.Logger.Debugf("[Kafka] %s: resumed all partitions for topic %s", k.Config.ConsumerGroupID, k.Config.Topic) + } +} - messageProcessedSinceLastCommit = false - } - mu.Unlock() +// wrapConsumerFn applies retry/error handling wrappers to consumer function +func wrapConsumerFn(ctx context.Context, logger ulogger.Logger, topic string, consumerFn func(message *KafkaMessage) error, options *consumerOptions) func(message *KafkaMessage) error { + if options.withRetryAndMoveOn { + originalFn := consumerFn + consumerFn = func(msg *KafkaMessage) error { + var err error + for i := 0; i < options.maxRetries; i++ { + err = originalFn(msg) + if err == nil { + return nil } + backoff := time.Duration(options.backoffMultiplier*(i+1)) * options.backoffDurationType + logger.Warnf("[kafka_consumer] retrying processing kafka message... attempt %d/%d, backoff %v", i+1, options.maxRetries, backoff) + time.Sleep(backoff) } - }() - } - // Create a buffered channel for messages to reduce context switching - // Buffer size is configurable via URL query parameter - messages := make(chan *sarama.ConsumerMessage, kc.cfg.ChannelBufferSize) + key := "" + if msg != nil && msg.Key != nil { + key = string(msg.Key) + } + logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), skipping", topic, key) + return nil + } + } - // Start a separate goroutine to receive messages - go func() { - for message := range claim.Messages() { - select { - case messages <- message: - case <-session.Context().Done(): - // Log when context is canceled in the message forwarding goroutine - err := session.Context().Err() - if err != nil { - kc.cfg.Logger.Debugf("[kafka_consumer] Context canceled in message forwarder (topic: %s, partition: %d): %v", - claim.Topic(), claim.Partition(), err) + if options.withRetryAndStop { + originalFn := consumerFn + consumerFn = func(msg *KafkaMessage) error { + var err error + for i := 0; i < options.maxRetries; i++ { + err = originalFn(msg) + if err == nil { + return nil } + backoff := time.Duration(options.backoffMultiplier*(i+1)) * options.backoffDurationType + logger.Warnf("[kafka_consumer] retrying processing kafka message... attempt %d/%d, backoff %v", i+1, options.maxRetries, backoff) + time.Sleep(backoff) + } - return + key := "" + if msg != nil && msg.Key != nil { + key = string(msg.Key) } + logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), stopping", topic, key) + if options.stopFn != nil { + options.stopFn() + } + return nil } - }() + } - for { - select { - case <-session.Context().Done(): - err := session.Context().Err() - // Only log detailed information if it's not a normal shutdown + if options.withLogErrorAndMoveOn { + originalFn := consumerFn + consumerFn = func(msg *KafkaMessage) error { + err := originalFn(msg) if err != nil { - // Get additional context about the consumer state - partition := claim.Partition() - topic := claim.Topic() - highWatermark := claim.HighWaterMarkOffset() - - kc.cfg.Logger.Infof("[kafka_consumer] Context done for consumer (topic: %s, partition: %d, highWatermark: %d): %v. This is normal during shutdown or rebalancing.", - topic, partition, highWatermark, err) + key := "" + if msg != nil && msg.Key != nil { + key = string(msg.Key) + } + logger.Errorf("[kafka_consumer] error processing kafka message on topic %s (key: %s), skipping: %v", topic, key, err) } + return nil + } + } - return err + return consumerFn +} - case message := <-messages: - if message == nil { - continue - } +// inMemoryConsumerHandler implements the handler for in-memory consumer +type inMemoryConsumerHandler struct { + logger ulogger.Logger + consumerFn func(message *KafkaMessage) error + options *consumerOptions + topic string +} - // Process message - var err error - if kc.cfg.AutoCommitEnabled { - err = kc.handleMessagesWithAutoCommit(message) - } else { - err = kc.handleMessageWithManualCommit(session, message) +func (h *inMemoryConsumerHandler) Setup(_ inmemorykafka.ConsumerGroupSession) error { + return nil +} + +func (h *inMemoryConsumerHandler) Cleanup(_ inmemorykafka.ConsumerGroupSession) error { + return nil +} + +func (h *inMemoryConsumerHandler) ConsumeClaim(session inmemorykafka.ConsumerGroupSession, claim inmemorykafka.ConsumerGroupClaim) error { + for message := range claim.Messages() { + kafkaMsg := &KafkaMessage{ + Key: message.Key, + Value: message.Value, + Topic: message.Topic, + Partition: message.Partition, + Offset: message.Offset, + Timestamp: message.Timestamp, + } + + var err error + if h.options.withRetryAndMoveOn { + for i := 0; i < h.options.maxRetries; i++ { + err = h.consumerFn(kafkaMsg) if err == nil { - mu.Lock() - messageProcessedSinceLastCommit = true - mu.Unlock() + break } + time.Sleep(time.Duration(h.options.backoffMultiplier*(i+1)) * h.options.backoffDurationType) } - if err != nil { - kc.cfg.Logger.Errorf("[kafka_consumer] failed to process message (topic: %s, partition: %d, offset: %d): %v", - message.Topic, message.Partition, message.Offset, err) - return err + h.logger.Errorf("[kafka_consumer] error processing message, skipping: %v", err) } + continue + } - // Increment message counter for heartbeat logging - messagesProcessed.Add(1) + if h.options.withLogErrorAndMoveOn { + if err := h.consumerFn(kafkaMsg); err != nil { + h.logger.Errorf("[kafka_consumer] error processing message, skipping: %v", err) + } + continue } - } -} -// handleMessageWithManualCommit processes the message and commits the offset only if the processing of the message is successful -func (kc *KafkaConsumer) handleMessageWithManualCommit(session sarama.ConsumerGroupSession, message *sarama.ConsumerMessage) error { - msg := KafkaMessage{*message} - // kc.cfg.Logger.Infof("Processing message with offset: %v", message.Offset) + if err := h.consumerFn(kafkaMsg); err != nil { + if h.options.withRetryAndStop && h.options.stopFn != nil { + h.options.stopFn() + } + return err + } - if err := kc.consumerClosure(&msg); err != nil { - return err + session.MarkMessage(message, "") } - - // kc.logger.Infof("Committing offset: %v", message.Offset) - - // Update the message offset, processing is successful - // This doesn't commit the offset to the server, it just marks it as processed in memory on the client - // The commit is done elsewhere - session.MarkMessage(message, "") - return nil } - -func (kc *KafkaConsumer) handleMessagesWithAutoCommit(message *sarama.ConsumerMessage) error { - return kc.consumerClosure(&KafkaMessage{*message}) -} diff --git a/util/kafka/kafka_consumer_test.go b/util/kafka/kafka_consumer_test.go index 4ce89cf76d..04b914a682 100644 --- a/util/kafka/kafka_consumer_test.go +++ b/util/kafka/kafka_consumer_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/teranode/ulogger" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -67,87 +66,49 @@ func TestWithRetryAndStop(t *testing.T) { assert.True(t, stopFnCalled) } -func TestNewKafkaConsumer(t *testing.T) { +func TestNewKafkaConsumerGroup(t *testing.T) { logger := &mockLogger{} + kafkaURL, err := url.Parse("memory://localhost/test-topic") + require.NoError(t, err) cfg := KafkaConsumerConfig{ Logger: logger, + URL: kafkaURL, Topic: "test-topic", ConsumerGroupID: "test-group", AutoCommitEnabled: true, } - consumerFn := func(message *KafkaMessage) error { - return nil - } - - watchdog := &consumeWatchdog{} - consumer := NewKafkaConsumer(cfg, consumerFn, watchdog) + consumer, err := NewKafkaConsumerGroup(cfg) + require.NoError(t, err) assert.NotNil(t, consumer) - assert.Equal(t, cfg, consumer.cfg) - assert.NotNil(t, consumer.consumerClosure) - assert.NotNil(t, consumer.watchdog) + assert.Equal(t, cfg.Topic, consumer.Config.Topic) + assert.Equal(t, cfg.ConsumerGroupID, consumer.Config.ConsumerGroupID) + assert.Equal(t, cfg.AutoCommitEnabled, consumer.Config.AutoCommitEnabled) } -func TestNewKafkaConsumerNilConsumerFunction(t *testing.T) { +func TestNewKafkaConsumerGroupNilConsumerFunction(t *testing.T) { logger := &mockLogger{} + kafkaURL, err := url.Parse("memory://localhost/test-topic") + require.NoError(t, err) cfg := KafkaConsumerConfig{ Logger: logger, + URL: kafkaURL, Topic: "test-topic", ConsumerGroupID: "test-group", AutoCommitEnabled: false, } - watchdog := &consumeWatchdog{} - consumer := NewKafkaConsumer(cfg, nil, watchdog) - - assert.NotNil(t, consumer) - assert.Equal(t, cfg, consumer.cfg) - assert.Nil(t, consumer.consumerClosure) -} - -func TestKafkaConsumerSetup(t *testing.T) { - consumer := &KafkaConsumer{ - cfg: KafkaConsumerConfig{ - Topic: "test-topic", - }, - } - - err := consumer.Setup(&mockConsumerGroupSession{}) - - assert.NoError(t, err) -} - -func TestKafkaConsumerCleanupAutoCommitEnabled(t *testing.T) { - consumer := &KafkaConsumer{ - cfg: KafkaConsumerConfig{ - Logger: &mockLogger{}, - Topic: "test-topic", - AutoCommitEnabled: true, - }, - } - - session := &mockConsumerGroupSession{} - err := consumer.Cleanup(session) - - assert.NoError(t, err) - assert.False(t, session.commitCalled) // Should not call commit when auto-commit is enabled -} - -func TestKafkaConsumerCleanupManualCommit(t *testing.T) { - consumer := &KafkaConsumer{ - cfg: KafkaConsumerConfig{ - Logger: &mockLogger{}, - Topic: "test-topic", - AutoCommitEnabled: false, - }, - } - - session := &mockConsumerGroupSession{} - err := consumer.Cleanup(session) + consumer, err := NewKafkaConsumerGroup(cfg) + require.NoError(t, err) + require.NotNil(t, consumer) - assert.NoError(t, err) - assert.True(t, session.commitCalled) // Should call commit when auto-commit is disabled + // Start with nil consumerFn is invalid; Start should return without panicking (consumerFn is only used when messages arrive). + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + consumer.Start(ctx, nil) + cancel() + _ = consumer.Close() } func TestNewKafkaConsumerGroupFromURLInvalidURL(t *testing.T) { @@ -192,21 +153,15 @@ func TestNewKafkaConsumerGroupFromURLDefaultValues(t *testing.T) { func TestKafkaConsumerGroupClose(t *testing.T) { logger := &mockLogger{} - mockConsumerGroup := &mockSaramaConsumerGroup{} - - consumer := &KafkaConsumerGroup{ - Config: KafkaConsumerConfig{ - Logger: logger, - Topic: "test-topic", - ConsumerGroupID: "test-group", - }, - ConsumerGroup: mockConsumerGroup, - } + kafkaURL, err := url.Parse("memory://localhost/test-topic") + require.NoError(t, err) - err := consumer.Close() + consumer, err := NewKafkaConsumerGroupFromURL(logger, kafkaURL, "test-group", true, nil) + require.NoError(t, err) + require.NotNil(t, consumer) + err = consumer.Close() assert.NoError(t, err) - assert.True(t, mockConsumerGroup.closed) } func TestKafkaConsumerGroupBrokersURL(t *testing.T) { @@ -288,55 +243,6 @@ func (m *mockLogger) SetLogLevel(string) {} func (m *mockLogger) New(string, ...ulogger.Option) ulogger.Logger { return m } func (m *mockLogger) Duplicate(...ulogger.Option) ulogger.Logger { return m } -type mockConsumerGroupSession struct { - commitCalled bool -} - -func (m *mockConsumerGroupSession) Claims() map[string][]int32 { return nil } -func (m *mockConsumerGroupSession) MemberID() string { return "test-member" } -func (m *mockConsumerGroupSession) GenerationID() int32 { return 1 } -func (m *mockConsumerGroupSession) MarkOffset(string, int32, int64, string) { -} -func (m *mockConsumerGroupSession) ResetOffset(string, int32, int64, string) { -} -func (m *mockConsumerGroupSession) MarkMessage(*sarama.ConsumerMessage, string) {} -func (m *mockConsumerGroupSession) Context() context.Context { return context.Background() } -func (m *mockConsumerGroupSession) Commit() { - m.commitCalled = true -} - -type mockSaramaConsumerGroup struct { - closed bool -} - -// Consume implements sarama.ConsumerGroup interface -func (m *mockSaramaConsumerGroup) Consume(context.Context, []string, sarama.ConsumerGroupHandler) error { - return nil -} - -// Errors implements sarama.ConsumerGroup interface -func (m *mockSaramaConsumerGroup) Errors() <-chan error { - return make(chan error) -} - -// Close implements sarama.ConsumerGroup interface -func (m *mockSaramaConsumerGroup) Close() error { - m.closed = true - return nil -} - -// Pause and Resume methods are no-ops for the mock -func (m *mockSaramaConsumerGroup) Pause(map[string][]int32) {} - -// Resume implements sarama.ConsumerGroup interface -func (m *mockSaramaConsumerGroup) Resume(map[string][]int32) {} - -// PauseAll and ResumeAll methods are no-ops for the mock -func (m *mockSaramaConsumerGroup) PauseAll() {} - -// ResumeAll implements sarama.ConsumerGroup interface -func (m *mockSaramaConsumerGroup) ResumeAll() {} - // Watchdog tests func TestConsumeWatchdogMarkConsumeStarted(t *testing.T) { @@ -483,33 +389,6 @@ func TestConsumeWatchdogSequence_NormalFlow(t *testing.T) { assert.False(t, stuck) } -func TestForceRecovery_ClosesOldConsumer(t *testing.T) { - logger := &mockLogger{} - mockConsumerGroup := &mockSaramaConsumerGroup{} - - cfg := sarama.NewConfig() - cfg.Consumer.Return.Errors = true - - consumer := &KafkaConsumerGroup{ - Config: KafkaConsumerConfig{ - Logger: logger, - Topic: "test-topic", - ConsumerGroupID: "test-group", - BrokersURL: []string{"localhost:9092"}, - }, - ConsumerGroup: mockConsumerGroup, - saramaConfig: cfg, - watchdog: &consumeWatchdog{}, - } - - // Force recovery should close the old consumer - _ = consumer.forceRecovery() - - // The important thing is that Close() was called on the mock consumer - // (New consumer creation will fail with invalid brokers, but that's expected and logged) - assert.True(t, mockConsumerGroup.closed, "forceRecovery should close the old consumer group") -} - func TestForceRecovery_WatchdogIntegration(t *testing.T) { // Create a watchdog that appears stuck watchdog := &consumeWatchdog{} @@ -530,11 +409,6 @@ func TestForceRecovery_WatchdogIntegration(t *testing.T) { func TestForceRecovery_MutexProtectsConcurrentCalls(t *testing.T) { logger := &mockLogger{} - mockConsumerGroup := &mockSaramaConsumerGroup{} - - cfg := sarama.NewConfig() - cfg.Consumer.Return.Errors = true - consumer := &KafkaConsumerGroup{ Config: KafkaConsumerConfig{ Logger: logger, @@ -542,28 +416,19 @@ func TestForceRecovery_MutexProtectsConcurrentCalls(t *testing.T) { ConsumerGroupID: "test-group", BrokersURL: []string{"localhost:9092"}, }, - ConsumerGroup: mockConsumerGroup, - saramaConfig: cfg, - watchdog: &consumeWatchdog{}, + watchdog: &consumeWatchdog{}, } - // Launch multiple concurrent force recovery calls - // The mutex should ensure they don't interfere with each other + // Launch multiple concurrent force recovery calls; mutex should prevent panics const numConcurrent = 5 done := make(chan bool, numConcurrent) - for i := 0; i < numConcurrent; i++ { go func() { _ = consumer.forceRecovery() done <- true }() } - - // Wait for all goroutines to complete for i := 0; i < numConcurrent; i++ { <-done } - - // Should have closed the consumer (at least once) - assert.True(t, mockConsumerGroup.closed) } diff --git a/util/kafka/kafka_health.go b/util/kafka/kafka_health.go index df75dd6f63..53aadf1114 100644 --- a/util/kafka/kafka_health.go +++ b/util/kafka/kafka_health.go @@ -6,62 +6,55 @@ import ( "net/http" "time" - "github.com/IBM/sarama" + "github.com/twmb/franz-go/pkg/kgo" ) -// HealthChecker creates a function that checks the health of a Kafka cluster. -// It returns a health check function that can be used to verify the Kafka cluster's status. +// HealthChecker returns a function that checks basic connectivity to the Kafka cluster. +// It does not verify individual consumer or producer functionality. +// +// Kafka brokers do not expose a dedicated health endpoint; the usual approach is to verify +// connectivity with a metadata (or equivalent) request. This check creates a short-lived +// client, pings the cluster, and returns 200 if connectable, 503 otherwise. In production, +// producers and consumers reconnect on their own; we assume that if we can connect to the +// brokers, the cluster is healthy. // // Parameters: -// - ctx: Context for the health check operation -// - brokersURL: List of Kafka broker URLs to check +// - ctx: Context for the health check operation (unused at construction time). +// - brokersURL: List of Kafka broker URLs to check. +// +// Returns a function with signature: // -// Returns: -// - A function that performs the actual health check with the following signature: -// func(ctx context.Context, checkLiveness bool) (int, string, error) -// where: -// - int: HTTP status code (200 for healthy, 503 for unhealthy) -// - string: Health check message -// - error: Any error encountered during the health check +// func(ctx context.Context, checkLiveness bool) (int, string, error) // -// The returned health check function attempts to establish a connection to the Kafka cluster. -// If successful, it indicates the cluster is healthy. This check doesn't verify individual -// consumer or producer functionality but rather tests basic connectivity to the cluster. +// where int is HTTP status (200 healthy, 503 unhealthy), string is a message, and error is non-nil on failure. func HealthChecker(_ context.Context, brokersURL []string) func(ctx context.Context, checkLiveness bool) (int, string, error) { - /* - There isn't a built-in way to check the health of a Kafka cluster. - So, we need to connect to the cluster and check if we can connect to it. - If we can't connect to it, we return a 503. - If we can connect to it, we return a 200. - In reality this isn't testing the Kafka consumer or producer, it's testing - that the Kafka cluster is healthy. - However, in production every producer and consumer reconnects without fuss so we assume - that if we can connect to the brokers then the cluster is healthy. - Not perfect but it's something. - */ return func(ctx context.Context, checkLiveness bool) (int, string, error) { - if brokersURL == nil { + if brokersURL == nil || len(brokersURL) == 0 { return http.StatusOK, "Kafka is not configured - skipping health check", nil } - config := sarama.NewConfig() - config.Version = sarama.V2_1_0_0 - config.Admin.Retry.Max = 0 - config.Admin.Timeout = 100 * time.Millisecond - config.Net.DialTimeout = 100 * time.Millisecond - config.Net.ReadTimeout = 100 * time.Millisecond - config.Net.WriteTimeout = 100 * time.Millisecond - config.Metadata.Retry.Max = 0 - config.Metadata.Full = true - config.Metadata.AllowAutoTopicCreation = false + if checkLiveness { + return http.StatusOK, "Kafka liveness (skipped)", nil + } - kafkaClusterAdmin, err := sarama.NewClusterAdmin(brokersURL, config) + opts := []kgo.Opt{ + kgo.SeedBrokers(brokersURL...), + kgo.ConnIdleTimeout(100 * time.Millisecond), + kgo.MetadataMinAge(100 * time.Millisecond), + kgo.RetryBackoffFn(func(int) time.Duration { return 0 }), + } + + client, err := kgo.NewClient(opts...) if err != nil { return http.StatusServiceUnavailable, "Failed to connect to Kafka", err } + defer client.Close() + + pingCtx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() - if err = kafkaClusterAdmin.Close(); err != nil { - return http.StatusServiceUnavailable, "Failed to close Kafka connection", err + if err := client.Ping(pingCtx); err != nil { + return http.StatusServiceUnavailable, "Failed to connect to Kafka", err } return http.StatusOK, "Kafka is healthy", nil diff --git a/util/kafka/kafka_health_test.go b/util/kafka/kafka_health_test.go index e99e74466b..c04f62307e 100644 --- a/util/kafka/kafka_health_test.go +++ b/util/kafka/kafka_health_test.go @@ -36,15 +36,14 @@ func TestHealthCheckerNilBrokers(t *testing.T) { func TestHealthCheckerEmptyBrokers(t *testing.T) { healthCheck := HealthChecker(context.Background(), []string{}) - status, message, err := healthCheck(context.Background(), true) + status, message, err := healthCheck(context.Background(), false) - assert.Equal(t, http.StatusServiceUnavailable, status) - assert.Equal(t, "Failed to connect to Kafka", message) - assert.Error(t, err) + assert.Equal(t, http.StatusOK, status) + assert.Equal(t, "Kafka is not configured - skipping health check", message) + assert.NoError(t, err) } func TestHealthCheckerInvalidBrokers(t *testing.T) { - // Use non-existent hosts with dynamic ports to ensure connection failure unusedPort1 := getUnusedPort(t) unusedPort2 := getUnusedPort(t) brokers := []string{ @@ -53,7 +52,7 @@ func TestHealthCheckerInvalidBrokers(t *testing.T) { } healthCheck := HealthChecker(context.Background(), brokers) - status, message, err := healthCheck(context.Background(), true) + status, message, err := healthCheck(context.Background(), false) assert.Equal(t, http.StatusServiceUnavailable, status) assert.Equal(t, "Failed to connect to Kafka", message) @@ -61,23 +60,43 @@ func TestHealthCheckerInvalidBrokers(t *testing.T) { } func TestHealthCheckerLivenessParameter(t *testing.T) { + unusedPort := getUnusedPort(t) + invalidBrokers := []string{fmt.Sprintf("localhost:%d", unusedPort)} + tests := []struct { name string checkLiveness bool brokers []string expectedMsg string + expectedOK bool }{ { name: "Liveness check with nil brokers", checkLiveness: true, brokers: nil, expectedMsg: "Kafka is not configured - skipping health check", + expectedOK: true, }, { name: "Readiness check with nil brokers", checkLiveness: false, brokers: nil, expectedMsg: "Kafka is not configured - skipping health check", + expectedOK: true, + }, + { + name: "Liveness check with brokers skips ping", + checkLiveness: true, + brokers: invalidBrokers, + expectedMsg: "Kafka liveness (skipped)", + expectedOK: true, + }, + { + name: "Readiness check with invalid brokers pings and fails", + checkLiveness: false, + brokers: invalidBrokers, + expectedMsg: "Failed to connect to Kafka", + expectedOK: false, }, } @@ -87,28 +106,29 @@ func TestHealthCheckerLivenessParameter(t *testing.T) { status, message, err := healthCheck(context.Background(), tt.checkLiveness) - assert.Equal(t, http.StatusOK, status) + if tt.expectedOK { + assert.Equal(t, http.StatusOK, status) + assert.NoError(t, err) + } else { + assert.Equal(t, http.StatusServiceUnavailable, status) + assert.Error(t, err) + } assert.Equal(t, tt.expectedMsg, message) - assert.NoError(t, err) }) } } func TestHealthCheckerContextHandling(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) - // Use a port that's guaranteed to be closed unusedPort := getUnusedPort(t) brokers := []string{fmt.Sprintf("localhost:%d", unusedPort)} healthCheck := HealthChecker(ctx, brokers) - // Cancel context before calling health check cancel() - status, message, err := healthCheck(ctx, true) + status, message, err := healthCheck(ctx, false) - // Should still attempt the check despite canceled context in creation - // The actual connection attempt will fail due to invalid broker assert.Equal(t, http.StatusServiceUnavailable, status) assert.Equal(t, "Failed to connect to Kafka", message) assert.Error(t, err) @@ -179,7 +199,7 @@ func TestHealthCheckerErrorScenarios(t *testing.T) { t.Run(tt.name, func(t *testing.T) { healthCheck := HealthChecker(context.Background(), tt.brokers) - status, message, err := healthCheck(context.Background(), true) + status, message, err := healthCheck(context.Background(), false) assert.Equal(t, tt.expectedStatus, status) assert.Equal(t, tt.expectedMessage, message) diff --git a/util/kafka/kafka_producer.go b/util/kafka/kafka_producer.go index d5913acfa6..ae6368d1fc 100644 --- a/util/kafka/kafka_producer.go +++ b/util/kafka/kafka_producer.go @@ -2,16 +2,17 @@ package kafka import ( + "context" "encoding/binary" "net/url" "strings" - "github.com/IBM/sarama" safeconversion "github.com/bsv-blockchain/go-safe-conversion" "github.com/bsv-blockchain/teranode/errors" "github.com/bsv-blockchain/teranode/settings" "github.com/bsv-blockchain/teranode/util" imk "github.com/bsv-blockchain/teranode/util/kafka/in_memory_kafka" + "github.com/twmb/franz-go/pkg/kgo" ) /** @@ -24,9 +25,6 @@ kafka-console-consumer.sh --topic blocks --bootstrap-server localhost:9092 --fro // KafkaProducerI defines the interface for Kafka producer operations. type KafkaProducerI interface { - // GetClient returns the underlying consumer group client - GetClient() sarama.ConsumerGroup - // Send publishes a message with the given key and data Send(key []byte, data []byte) error @@ -34,31 +32,40 @@ type KafkaProducerI interface { Close() error } -// SyncKafkaProducer implements a synchronous Kafka producer. +// SyncKafkaProducer implements a synchronous Kafka producer using franz-go. type SyncKafkaProducer struct { - Producer sarama.SyncProducer // Underlying Sarama sync producer - Topic string // Kafka topic to produce to - Partitions int32 // Number of partitions - client sarama.ConsumerGroup // Associated consumer group client + client *kgo.Client // Underlying franz-go client + Topic string // Kafka topic to produce to + Partitions int32 // Number of partitions + + // For in-memory support + inMemoryProducer *imk.InMemorySyncProducer + isInMemory bool } // Close gracefully shuts down the sync producer. func (k *SyncKafkaProducer) Close() error { - if err := k.Producer.Close(); err != nil { - return errors.NewServiceError("failed to close Kafka producer", err) + if k.isInMemory { + return k.inMemoryProducer.Close() } - return nil -} + if k.client != nil { + if err := k.client.Flush(context.Background()); err != nil { + return errors.NewServiceError("failed to flush Kafka producer", err) + } + k.client.Close() + } -// GetClient returns the associated consumer group client. -func (k *SyncKafkaProducer) GetClient() sarama.ConsumerGroup { - return k.client + return nil } // Send publishes a message to Kafka with the specified key and data. // The partition is determined by hashing the key. func (k *SyncKafkaProducer) Send(key []byte, data []byte) error { + if k.isInMemory { + return k.sendInMemory(key, data) + } + kPartitionsUint32, err := safeconversion.Int32ToUint32(k.Partitions) if err != nil { return err @@ -71,17 +78,27 @@ func (k *SyncKafkaProducer) Send(key []byte, data []byte) error { return err } - _, _, err = k.Producer.SendMessage(&sarama.ProducerMessage{ + record := &kgo.Record{ Topic: k.Topic, - Key: sarama.ByteEncoder(key), - Value: sarama.ByteEncoder(data), + Key: key, + Value: data, Partition: partitionInt32, - }) + } + + results := k.client.ProduceSync(context.Background(), record) + if err := results.FirstErr(); err != nil { + return err + } - return err + return nil +} + +// sendInMemory handles sending for in-memory producer +func (k *SyncKafkaProducer) sendInMemory(key []byte, data []byte) error { + return k.inMemoryProducer.Send(k.Topic, key, data) } -// NewKafkaProducer creates a new Kafka producer from the given URL. +// NewKafkaProducer creates a new Kafka producer from the given URL using franz-go. // It also creates the topic if it doesn't exist with the specified configuration. // For "memory" scheme, it uses an in-memory implementation. // @@ -90,96 +107,98 @@ func (k *SyncKafkaProducer) Send(key []byte, data []byte) error { // - kafkaSettings: Kafka settings for TLS and debug logging (can be nil for defaults) // // Returns: -// - ClusterAdmin: Kafka cluster administrator interface (nil for memory scheme) // - KafkaProducerI: Configured Kafka producer // - error: Any error encountered during setup -func NewKafkaProducer(kafkaURL *url.URL, kafkaSettings *settings.KafkaSettings) (sarama.ClusterAdmin, KafkaProducerI, error) { +func NewKafkaProducer(kafkaURL *url.URL, kafkaSettings *settings.KafkaSettings) (KafkaProducerI, error) { + return NewKafkaProducerWithContext(context.Background(), kafkaURL, kafkaSettings) +} + +// NewKafkaProducerWithContext creates a new Kafka producer from the given URL using franz-go with context. +func NewKafkaProducerWithContext(ctx context.Context, kafkaURL *url.URL, kafkaSettings *settings.KafkaSettings) (KafkaProducerI, error) { topic := kafkaURL.Path[1:] // Handle in-memory producer case if kafkaURL.Scheme == memoryScheme { - // Get the shared broker instance broker := imk.GetSharedBroker() - // Create the in-memory sync producer (implements sarama.SyncProducer) - inMemSaramaProducer := imk.NewInMemorySyncProducer(broker) - // No error expected from mock creation + inMemProducer := imk.NewInMemorySyncProducer(broker) - // Wrap the sarama.SyncProducer in our SyncKafkaProducer to satisfy KafkaProducerI producer := &SyncKafkaProducer{ - Producer: inMemSaramaProducer, - Topic: topic, + Topic: topic, + inMemoryProducer: inMemProducer, + isInMemory: true, } - return nil, producer, nil // Return wrapper type + return producer, nil } - // Proceed with real Kafka connection + // Proceed with real franz-go connection brokersURL := strings.Split(kafkaURL.Host, ",") - config := sarama.NewConfig() - config.Version = sarama.V2_1_0_0 - - // Note: Debug logging not supported for sync producer as it doesn't have a logger parameter - // If needed, add a logger parameter to NewKafkaProducer function - - // Apply authentication settings if kafkaSettings provided and TLS is enabled - if kafkaSettings != nil && kafkaSettings.EnableTLS { - if err := configureKafkaAuthFromFields(config, kafkaSettings.EnableTLS, kafkaSettings.TLSSkipVerify, - kafkaSettings.TLSCAFile, kafkaSettings.TLSCertFile, kafkaSettings.TLSKeyFile); err != nil { - return nil, nil, errors.NewConfigurationError("failed to configure Kafka authentication", err) - } - } - - clusterAdmin, err := sarama.NewClusterAdmin(brokersURL, config) - if err != nil { - return nil, nil, errors.NewServiceError("error while creating cluster admin", err) - } - partitions := util.GetQueryParamInt(kafkaURL, "partitions", 1) replicationFactor := util.GetQueryParamInt(kafkaURL, "replication", 1) - retentionPeriod := util.GetQueryParam(kafkaURL, "retention", "600000") // 10 minutes - segmentBytes := util.GetQueryParam(kafkaURL, "segment_bytes", "1073741824") // 1GB default + retentionPeriod := util.GetQueryParam(kafkaURL, "retention", "600000") + segmentBytes := util.GetQueryParam(kafkaURL, "segment_bytes", "1073741824") + flushBytes := util.GetQueryParamInt(kafkaURL, "flush_bytes", 16*1024) partitionsInt32, err := safeconversion.IntToInt32(partitions) if err != nil { - return nil, nil, err + return nil, err } replicationFactorInt16, err := safeconversion.IntToInt16(replicationFactor) if err != nil { - // Clean up cluster admin if topic creation prep fails - _ = clusterAdmin.Close() // Best effort close - return nil, nil, err - } - - if err := clusterAdmin.CreateTopic(topic, &sarama.TopicDetail{ - NumPartitions: partitionsInt32, - ReplicationFactor: replicationFactorInt16, - ConfigEntries: map[string]*string{ - "retention.ms": &retentionPeriod, // Set the retention period - "delete.retention.ms": &retentionPeriod, - "segment.ms": &retentionPeriod, - "segment.bytes": &segmentBytes, - }, - }, false); err != nil { - if !errors.Is(err, sarama.ErrTopicAlreadyExists) { - _ = clusterAdmin.Close() // Best effort close - return nil, nil, err - } + return nil, err } - flushBytes := util.GetQueryParamInt(kafkaURL, "flush_bytes", 1024) + // Build franz-go client options + opts := []kgo.Opt{ + kgo.SeedBrokers(brokersURL...), + kgo.DefaultProduceTopic(topic), + kgo.ProducerBatchMaxBytes(int32(flushBytes)), + kgo.RequiredAcks(kgo.AllISRAcks()), + kgo.RecordPartitioner(kgo.ManualPartitioner()), + kgo.RecordRetries(5), + } - producer, err := ConnectProducer(brokersURL, topic, partitionsInt32, kafkaSettings, flushBytes) + // Configure TLS if enabled + if kafkaSettings != nil && kafkaSettings.EnableTLS { + tlsConfig, err := buildFranzTLSConfig(kafkaSettings.EnableTLS, kafkaSettings.TLSSkipVerify, + kafkaSettings.TLSCAFile, kafkaSettings.TLSCertFile, kafkaSettings.TLSKeyFile) + if err != nil { + return nil, errors.NewConfigurationError("failed to configure TLS for kafka producer", err) + } + opts = append(opts, kgo.DialTLSConfig(tlsConfig)) + } + + // Create the franz-go client + client, err := kgo.NewClient(opts...) if err != nil { - _ = clusterAdmin.Close() // Best effort close - return nil, nil, errors.NewServiceError("unable to connect to kafka", err) + return nil, errors.NewServiceError("error while creating kafka client", err) + } + + // Create topic configuration + cfg := KafkaProducerConfig{ + Topic: topic, + Partitions: partitionsInt32, + ReplicationFactor: replicationFactorInt16, + RetentionPeriodMillis: retentionPeriod, + SegmentBytes: segmentBytes, + } + + // Create topic if it doesn't exist + if err := createTopicWithFranz(ctx, client, cfg); err != nil { + client.Close() + return nil, err } - return clusterAdmin, producer, nil + return &SyncKafkaProducer{ + client: client, + Partitions: partitionsInt32, + Topic: topic, + }, nil } -// ConnectProducer establishes a connection to Kafka and creates a new sync producer. +// ConnectProducer establishes a connection to Kafka and creates a new sync producer using franz-go. // // Parameters: // - brokersURL: List of Kafka broker URLs @@ -192,36 +211,39 @@ func NewKafkaProducer(kafkaURL *url.URL, kafkaSettings *settings.KafkaSettings) // - KafkaProducerI: Configured Kafka producer // - error: Any error encountered during connection func ConnectProducer(brokersURL []string, topic string, partitions int32, kafkaSettings *settings.KafkaSettings, flushBytes ...int) (KafkaProducerI, error) { - config := sarama.NewConfig() - config.Producer.Return.Successes = true - config.Producer.Return.Errors = true - config.Producer.RequiredAcks = sarama.WaitForAll - config.Producer.Retry.Max = 5 - config.Producer.Partitioner = sarama.NewManualPartitioner - - // Apply authentication settings if kafkaSettings provided and TLS is enabled - if kafkaSettings != nil && kafkaSettings.EnableTLS { - if err := configureKafkaAuthFromFields(config, kafkaSettings.EnableTLS, kafkaSettings.TLSSkipVerify, - kafkaSettings.TLSCAFile, kafkaSettings.TLSCertFile, kafkaSettings.TLSKeyFile); err != nil { - return nil, errors.NewConfigurationError("failed to configure Kafka authentication", err) - } - } - flush := 16 * 1024 if len(flushBytes) > 0 { flush = flushBytes[0] } - config.Producer.Flush.Bytes = flush + // Build franz-go client options + opts := []kgo.Opt{ + kgo.SeedBrokers(brokersURL...), + kgo.DefaultProduceTopic(topic), + kgo.ProducerBatchMaxBytes(int32(flush)), + kgo.RequiredAcks(kgo.AllISRAcks()), + kgo.RecordPartitioner(kgo.ManualPartitioner()), + kgo.RecordRetries(5), + } + + // Configure TLS if enabled + if kafkaSettings != nil && kafkaSettings.EnableTLS { + tlsConfig, err := buildFranzTLSConfig(kafkaSettings.EnableTLS, kafkaSettings.TLSSkipVerify, + kafkaSettings.TLSCAFile, kafkaSettings.TLSCertFile, kafkaSettings.TLSKeyFile) + if err != nil { + return nil, errors.NewConfigurationError("failed to configure TLS for kafka producer", err) + } + opts = append(opts, kgo.DialTLSConfig(tlsConfig)) + } - // NewSyncProducer creates a new SyncProducer using the given broker addresses and configuration. - conn, err := sarama.NewSyncProducer(brokersURL, config) + // Create the franz-go client + client, err := kgo.NewClient(opts...) if err != nil { return nil, err } return &SyncKafkaProducer{ - Producer: conn, + client: client, Partitions: partitions, Topic: topic, }, nil diff --git a/util/kafka/kafka_producer_async.go b/util/kafka/kafka_producer_async.go index edd1ac8bc1..a87b7ada2d 100644 --- a/util/kafka/kafka_producer_async.go +++ b/util/kafka/kafka_producer_async.go @@ -3,17 +3,19 @@ package kafka import ( "context" + "crypto/tls" + "crypto/x509" "fmt" "net/url" "os" "os/signal" + "strconv" "strings" "sync" "sync/atomic" "syscall" "time" - "github.com/IBM/sarama" safeconversion "github.com/bsv-blockchain/go-safe-conversion" "github.com/bsv-blockchain/teranode/errors" "github.com/bsv-blockchain/teranode/settings" @@ -22,16 +24,10 @@ import ( inmemorykafka "github.com/bsv-blockchain/teranode/util/kafka/in_memory_kafka" "github.com/bsv-blockchain/teranode/util/retry" "github.com/ordishs/go-utils" - "github.com/rcrowley/go-metrics" + "github.com/twmb/franz-go/pkg/kadm" + "github.com/twmb/franz-go/pkg/kgo" ) -// init disables go-metrics globally to prevent memory leak from exponential decay sample heap. -// This must be set before any Sarama clients are created. -// See: https://github.com/IBM/sarama/issues/1321 -func init() { - metrics.UseNilMetrics = true -} - // KafkaAsyncProducerI defines the interface for asynchronous Kafka producer operations. type KafkaAsyncProducerI interface { // Start begins the async producer operation with the given message channel @@ -69,7 +65,7 @@ type KafkaProducerConfig struct { TLSKeyFile string // Path to client key file // Debug logging - EnableDebugLogging bool // Enable verbose Sarama (Kafka client) debug logging + EnableDebugLogging bool // Enable verbose debug logging } // MessageStatus represents the status of a produced message. @@ -85,29 +81,21 @@ type Message struct { Value []byte } -// KafkaAsyncProducer implements asynchronous Kafka producer functionality. +// KafkaAsyncProducer implements asynchronous Kafka producer functionality using franz-go. type KafkaAsyncProducer struct { - Config KafkaProducerConfig // Producer configuration - Producer sarama.AsyncProducer // Underlying Sarama async producer - publishChannel chan *Message // Channel for publishing messages - closed atomic.Bool // Flag indicating if producer is closed - channelMu sync.RWMutex // Mutex to protect publishChannel access - publishWg sync.WaitGroup // WaitGroup to track publish goroutine + Config KafkaProducerConfig // Producer configuration + client *kgo.Client // Underlying franz-go client + publishChannel chan *Message // Channel for publishing messages + closed atomic.Bool // Flag indicating if producer is closed + channelMu sync.RWMutex // Mutex to protect publishChannel access + publishWg sync.WaitGroup // WaitGroup to track publish goroutine + + // For in-memory support + inMemoryProducer *inmemorykafka.InMemoryAsyncProducer + isInMemory bool } // NewKafkaAsyncProducerFromURL creates a new async producer from a URL configuration. -// This is a convenience function for production code that extracts settings from kafkaSettings. -// For tests, use NewKafkaAsyncProducer directly with a manually constructed config. -// -// Parameters: -// - ctx: Context for producer operations -// - logger: Logger instance -// - url: URL containing Kafka configuration -// - kafkaSettings: Kafka settings for TLS and debug logging (can be nil for defaults) -// -// Returns: -// - *KafkaAsyncProducer: Configured async producer -// - error: Any error encountered during setup func NewKafkaAsyncProducerFromURL(ctx context.Context, logger ulogger.Logger, url *url.URL, kafkaSettings *settings.KafkaSettings) (*KafkaAsyncProducer, error) { partitionsInt32, err := safeconversion.IntToInt32(util.GetQueryParamInt(url, "partitions", 1)) if err != nil { @@ -119,7 +107,6 @@ func NewKafkaAsyncProducerFromURL(ctx context.Context, logger ulogger.Logger, ur return nil, err } - // Extract TLS and debug logging settings from kafkaSettings (if provided) var enableTLS, tlsSkipVerify, enableDebugLogging bool var tlsCAFile, tlsCertFile, tlsKeyFile string if kafkaSettings != nil { @@ -138,18 +125,17 @@ func NewKafkaAsyncProducerFromURL(ctx context.Context, logger ulogger.Logger, ur Topic: strings.TrimPrefix(url.Path, "/"), Partitions: partitionsInt32, ReplicationFactor: replicationFactorInt16, - RetentionPeriodMillis: util.GetQueryParam(url, "retention", "600000"), // 10 minutes - SegmentBytes: util.GetQueryParam(url, "segment_bytes", "1073741824"), // 1GB default + RetentionPeriodMillis: util.GetQueryParam(url, "retention", "600000"), + SegmentBytes: util.GetQueryParam(url, "segment_bytes", "1073741824"), FlushBytes: util.GetQueryParamInt(url, "flush_bytes", 1024*1024), FlushMessages: util.GetQueryParamInt(url, "flush_messages", 50_000), FlushFrequency: util.GetQueryParamDuration(url, "flush_frequency", 10*time.Second), - // TLS/Auth configuration - EnableTLS: enableTLS, - TLSSkipVerify: tlsSkipVerify, - TLSCAFile: tlsCAFile, - TLSCertFile: tlsCertFile, - TLSKeyFile: tlsKeyFile, - EnableDebugLogging: enableDebugLogging, + EnableTLS: enableTLS, + TLSSkipVerify: tlsSkipVerify, + TLSCAFile: tlsCAFile, + TLSCertFile: tlsCertFile, + TLSKeyFile: tlsKeyFile, + EnableDebugLogging: enableDebugLogging, } producer, err := retry.Retry(ctx, logger, func() (*KafkaAsyncProducer, error) { @@ -163,147 +149,94 @@ func NewKafkaAsyncProducerFromURL(ctx context.Context, logger ulogger.Logger, ur return producer, nil } -// NewKafkaAsyncProducer creates a new async producer with the given configuration. -// -// Parameters: -// - logger: Logger instance -// - cfg: Producer configuration (includes TLS and debug logging settings) -// -// Returns: -// - *KafkaAsyncProducer: Configured async producer -// - error: Any error encountered during setup +// NewKafkaAsyncProducer creates a new async producer with the given configuration using franz-go. func NewKafkaAsyncProducer(logger ulogger.Logger, cfg KafkaProducerConfig) (*KafkaAsyncProducer, error) { logger.Debugf("Starting async kafka producer for %v", cfg.URL) - if cfg.URL.Scheme == memoryScheme { - // --- Use the in-memory implementation --- - broker := inmemorykafka.GetSharedBroker() // Use alias 'imk' - // Use a reasonable default buffer size for the mock async producer, or take from config if available - bufferSize := 256 // Default buffer size for the publish channel - producer := inmemorykafka.NewInMemoryAsyncProducer(broker, bufferSize) // Use alias 'imk' + if cfg.URL != nil && cfg.URL.Scheme == memoryScheme { + broker := inmemorykafka.GetSharedBroker() + bufferSize := 256 + producer := inmemorykafka.NewInMemoryAsyncProducer(broker, bufferSize) cfg.Logger.Infof("Using in-memory Kafka async producer") - // No error expected from mock creation client := &KafkaAsyncProducer{ - Producer: producer, - Config: cfg, + Config: cfg, + inMemoryProducer: producer, + isInMemory: true, } return client, nil } - // --- Use the real Sarama implementation --- - - config := sarama.NewConfig() - config.Producer.Flush.Bytes = cfg.FlushBytes - config.Producer.Flush.Messages = cfg.FlushMessages - config.Producer.Flush.Frequency = cfg.FlushFrequency - // config.Producer.Return.Successes = true - - // Enable Sarama debug logging if configured - if cfg.EnableDebugLogging { - sarama.Logger = &saramaLoggerAdapter{logger: logger} - logger.Infof("Kafka debug logging enabled for async producer topic %s", cfg.Topic) + // Build franz-go client options + opts := []kgo.Opt{ + kgo.SeedBrokers(cfg.BrokersURL...), + kgo.DefaultProduceTopic(cfg.Topic), + kgo.ProducerBatchMaxBytes(int32(cfg.FlushBytes)), + kgo.ProducerLinger(cfg.FlushFrequency), + kgo.MaxBufferedRecords(cfg.FlushMessages), + kgo.DisableIdempotentWrite(), } - // Apply authentication settings if TLS is enabled + // Configure TLS if enabled if cfg.EnableTLS { - cfg.Logger.Debugf("Configuring Kafka TLS authentication - EnableTLS: %v, SkipVerify: %v, CA: %s, Cert: %s", - cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile) - - if err := configureKafkaAuthFromFields(config, cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile, cfg.TLSKeyFile); err != nil { - return nil, errors.NewConfigurationError("failed to configure Kafka authentication", err) + tlsConfig, err := buildFranzTLSConfig(cfg.EnableTLS, cfg.TLSSkipVerify, cfg.TLSCAFile, cfg.TLSCertFile, cfg.TLSKeyFile) + if err != nil { + return nil, errors.NewConfigurationError("failed to configure TLS for kafka async producer", err) } - - cfg.Logger.Debugf("Successfully configured Kafka TLS authentication for async producer topic %s", cfg.Topic) + opts = append(opts, kgo.DialTLSConfig(tlsConfig)) } - cfg.Logger.Infof("Starting Kafka async producer for %s topic", cfg.Topic) - - // try turning off acks - // config.Producer.RequiredAcks = sarama.NoResponse // Equivalent to 'acks=0' - // config.Producer.Return.Successes = false - - clusterAdmin, err := sarama.NewClusterAdmin(cfg.BrokersURL, config) - if err != nil { - return nil, errors.NewConfigurationError("error while creating cluster admin", err) - } - defer func(clusterAdmin sarama.ClusterAdmin) { - _ = clusterAdmin.Close() - }(clusterAdmin) - - if err := createTopic(clusterAdmin, cfg); err != nil { - return nil, err + // Enable debug logging if configured + if cfg.EnableDebugLogging { + opts = append(opts, kgo.WithLogger(&franzLoggerAdapter{logger: logger})) } - producer, err := sarama.NewAsyncProducer(cfg.BrokersURL, config) + // Create the franz-go client + client, err := kgo.NewClient(opts...) if err != nil { return nil, errors.NewServiceError("Failed to create Kafka async producer for %s", cfg.Topic, err) } - client := &KafkaAsyncProducer{ - Producer: producer, - Config: cfg, - } - - return client, nil -} - -func (c *KafkaAsyncProducer) decodeKeyOrValue(encoder sarama.Encoder) string { - if encoder == nil { - return "" + // Create topic if it doesn't exist (uses Background; constructor is not cancelable) + if err := createTopicWithFranz(context.Background(), client, cfg); err != nil { + client.Close() + return nil, err } - bytes := encoder.(sarama.ByteEncoder) - - if len(bytes) > 80 { - return fmt.Sprintf("%x", bytes[:80]) + "... (truncated)" + producer := &KafkaAsyncProducer{ + Config: cfg, + client: client, } - return fmt.Sprintf("%x", bytes) + return producer, nil } // Start begins the async producer operation. -// It sets up message handling and error handling goroutines. func (c *KafkaAsyncProducer) Start(ctx context.Context, ch chan *Message) { if c == nil { return } + // Handle in-memory case + if c.isInMemory { + c.startInMemory(ctx, ch) + return + } + wg := sync.WaitGroup{} wg.Add(1) - c.publishWg.Add(1) // Track the publish goroutine + c.publishWg.Add(1) go func() { - context, cancel := context.WithCancel(ctx) - + internalCtx, cancel := context.WithCancel(ctx) defer cancel() c.channelMu.Lock() c.publishChannel = ch c.channelMu.Unlock() - go func() { - for s := range c.Producer.Successes() { - key := c.decodeKeyOrValue(s.Key) - value := c.decodeKeyOrValue(s.Value) - - c.Config.Logger.Debugf("Successfully sent message to topic %s, offset: %d, key: %v, value: %v", - s.Topic, s.Offset, key, value) - } - }() - - go func() { - for err := range c.Producer.Errors() { - key := c.decodeKeyOrValue(err.Msg.Key) - value := c.decodeKeyOrValue(err.Msg.Value) - - c.Config.Logger.Errorf("Failed to deliver message to topic %s: %v, Key: %v, Value: %v", - err.Msg.Topic, err.Err, key, value) - } - }() - go func() { defer c.publishWg.Done() wg.Done() @@ -317,41 +250,24 @@ func (c *KafkaAsyncProducer) Start(ctx context.Context, ch chan *Message) { break } - var key sarama.ByteEncoder - if msgBytes.Key != nil { - key = sarama.ByteEncoder(msgBytes.Key) - } - - message := &sarama.ProducerMessage{ + record := &kgo.Record{ Topic: c.Config.Topic, - Key: key, - Value: sarama.ByteEncoder(msgBytes.Value), + Key: msgBytes.Key, + Value: msgBytes.Value, } - // Check if closed again right before sending to avoid race condition - // where Close() is called between the check above and the send below if c.closed.Load() { break } - // Use a function with recover to safely handle sends to potentially closed channel - func() { - defer func() { - if r := recover(); r != nil { - // Check if this is the expected "send on closed channel" panic - panicMsg := fmt.Sprint(r) - if strings.Contains(panicMsg, "closed channel") { - // Expected during shutdown, log at debug level - c.Config.Logger.Debugf("[kafka] Recovered from send to closed channel during shutdown") - } else { - // Unexpected panic - log error and re-throw to expose the bug - c.Config.Logger.Errorf("[kafka] Unexpected panic while sending message: %v", r) - panic(r) - } - } - }() - c.Producer.Input() <- message - }() + // Produce asynchronously with callback + c.client.Produce(internalCtx, record, func(r *kgo.Record, err error) { + if err != nil { + c.Config.Logger.Errorf("Failed to deliver message to topic %s: %v, Key: %x", r.Topic, err, r.Key) + } else { + c.Config.Logger.Debugf("Successfully sent message to topic %s, partition: %d, offset: %d", r.Topic, r.Partition, r.Offset) + } + }) } }() @@ -366,15 +282,77 @@ func (c *KafkaAsyncProducer) Start(ctx context.Context, ch chan *Message) { select { case <-signals: c.Config.Logger.Infof("[kafka] Received signal, shutting down producer %v ...", c.Config.URL) - cancel() // Ensure the context is canceled - case <-context.Done(): + cancel() + case <-internalCtx.Done(): + c.Config.Logger.Infof("[kafka] Context done, shutting down producer %v ...", c.Config.URL) + } + + _ = c.Stop() + }() + + wg.Wait() +} + +// startInMemory handles the in-memory producer case +func (c *KafkaAsyncProducer) startInMemory(ctx context.Context, ch chan *Message) { + wg := sync.WaitGroup{} + wg.Add(1) + c.publishWg.Add(1) + + go func() { + internalCtx, cancel := context.WithCancel(ctx) + defer cancel() + + c.channelMu.Lock() + c.publishChannel = ch + c.channelMu.Unlock() + + go func() { + defer c.publishWg.Done() + wg.Done() + + c.channelMu.RLock() + ch := c.publishChannel + c.channelMu.RUnlock() + + for msgBytes := range ch { + if c.closed.Load() { + break + } + + c.inMemoryProducer.Produce(c.Config.Topic, msgBytes.Key, msgBytes.Value) + } + }() + + // Handle successes + go func() { + for range c.inMemoryProducer.Successes() { + c.Config.Logger.Debugf("Successfully sent message to topic %s", c.Config.Topic) + } + }() + + // Handle errors + go func() { + for err := range c.inMemoryProducer.Errors() { + c.Config.Logger.Errorf("Failed to deliver message: %v", err) + } + }() + + signals := make(chan os.Signal, 1) + signal.Notify(signals, os.Interrupt, syscall.SIGTERM) + + select { + case <-signals: + c.Config.Logger.Infof("[kafka] Received signal, shutting down producer %v ...", c.Config.URL) + cancel() + case <-internalCtx.Done(): c.Config.Logger.Infof("[kafka] Context done, shutting down producer %v ...", c.Config.URL) } _ = c.Stop() }() - wg.Wait() // don't continue until we know we know the go func has started and is ready to accept messages on the PublishChannel + wg.Wait() } // Stop gracefully shuts down the async producer. @@ -389,22 +367,30 @@ func (c *KafkaAsyncProducer) Stop() error { c.closed.Store(true) - // Close the publish channel to signal the publish goroutine to exit c.channelMu.Lock() ch := c.publishChannel if ch != nil { - c.publishChannel = nil // Set to nil BEFORE closing to prevent sends to closed channel + c.publishChannel = nil close(ch) } c.channelMu.Unlock() - // Wait for the publish goroutine to finish processing c.publishWg.Wait() - // Now it's safe to close the producer - if err := c.Producer.Close(); err != nil { - c.closed.Store(false) - return err + if c.isInMemory { + if c.inMemoryProducer != nil { + if err := c.inMemoryProducer.Close(); err != nil { + c.closed.Store(false) + return err + } + } + } else { + if c.client != nil { + if err := c.client.Flush(context.Background()); err != nil { + c.Config.Logger.Warnf("Error flushing kafka producer: %v", err) + } + c.client.Close() + } } return nil @@ -428,52 +414,124 @@ func (c *KafkaAsyncProducer) Publish(msg *Message) { return } - c.channelMu.RLock() - defer c.channelMu.RUnlock() - if c.publishChannel != nil { utils.SafeSend(c.publishChannel, msg) } } -// createTopic creates a new Kafka topic with the specified configuration. -// -// Parameters: -// - admin: Kafka cluster administrator -// - cfg: Producer configuration containing topic settings -// -// Returns: -// - error: Any error encountered during topic creation -func createTopic(admin sarama.ClusterAdmin, cfg KafkaProducerConfig) error { - err := admin.CreateTopic(cfg.Topic, &sarama.TopicDetail{ - NumPartitions: cfg.Partitions, - ReplicationFactor: cfg.ReplicationFactor, - ConfigEntries: map[string]*string{ - "retention.ms": &cfg.RetentionPeriodMillis, - "delete.retention.ms": &cfg.RetentionPeriodMillis, - "segment.ms": &cfg.RetentionPeriodMillis, - "segment.bytes": &cfg.SegmentBytes, - }, - }, false) +// createTopicWithFranz creates a new Kafka topic with the specified configuration. +func createTopicWithFranz(ctx context.Context, client *kgo.Client, cfg KafkaProducerConfig) error { + admin := kadm.NewClient(client) + retentionMs, err := strconv.ParseInt(cfg.RetentionPeriodMillis, 10, 64) if err != nil { - if errors.Is(err, sarama.ErrTopicAlreadyExists) { - err = admin.AlterConfig(sarama.TopicResource, cfg.Topic, map[string]*string{ - "retention.ms": &cfg.RetentionPeriodMillis, - "delete.retention.ms": &cfg.RetentionPeriodMillis, - "segment.ms": &cfg.RetentionPeriodMillis, - "segment.bytes": &cfg.SegmentBytes, - }, false) - - if err != nil { - return errors.NewProcessingError("unable to alter topic config", err) - } + retentionMs = 600000 + } - return nil - } + segmentBytes, err := strconv.ParseInt(cfg.SegmentBytes, 10, 64) + if err != nil { + segmentBytes = 1073741824 + } + + configs := map[string]*string{ + "retention.ms": stringPtr(fmt.Sprintf("%d", retentionMs)), + "delete.retention.ms": stringPtr(fmt.Sprintf("%d", retentionMs)), + "segment.ms": stringPtr(fmt.Sprintf("%d", retentionMs)), + "segment.bytes": stringPtr(fmt.Sprintf("%d", segmentBytes)), + } + resp, err := admin.CreateTopic(ctx, cfg.Partitions, cfg.ReplicationFactor, configs, cfg.Topic) + if err != nil { return errors.NewProcessingError("unable to create topic", err) } + if resp.Err != nil && resp.Err.Error() != "TOPIC_ALREADY_EXISTS" { + _, alterErr := admin.AlterTopicConfigs(ctx, []kadm.AlterConfig{ + {Name: "retention.ms", Value: stringPtr(fmt.Sprintf("%d", retentionMs))}, + {Name: "delete.retention.ms", Value: stringPtr(fmt.Sprintf("%d", retentionMs))}, + {Name: "segment.ms", Value: stringPtr(fmt.Sprintf("%d", retentionMs))}, + {Name: "segment.bytes", Value: stringPtr(fmt.Sprintf("%d", segmentBytes))}, + }, cfg.Topic) + if alterErr != nil { + return errors.NewProcessingError("unable to alter topic config", alterErr) + } + } + return nil } + +// buildFranzTLSConfig builds a TLS configuration for franz-go +func buildFranzTLSConfig(enableTLS bool, tlsSkipVerify bool, tlsCAFile string, tlsCertFile string, tlsKeyFile string) (*tls.Config, error) { + if !enableTLS { + return nil, nil + } + + // #nosec G402 -- InsecureSkipVerify is configurable and may be needed for testing environments + tlsConfig := &tls.Config{ + InsecureSkipVerify: tlsSkipVerify, + } + + if tlsCAFile != "" { + caCert, err := os.ReadFile(tlsCAFile) + if err != nil { + return nil, errors.New(errors.ERR_CONFIGURATION, "failed to read TLS CA file: "+tlsCAFile, err) + } + + if tlsConfig.RootCAs == nil { + tlsConfig.RootCAs = loadSystemCertPool() + } + + if !tlsConfig.RootCAs.AppendCertsFromPEM(caCert) { + return nil, errors.New(errors.ERR_CONFIGURATION, "failed to append CA certificate to RootCAs from file: "+tlsCAFile) + } + } + + if tlsCertFile != "" && tlsKeyFile != "" { + cert, err := tls.LoadX509KeyPair(tlsCertFile, tlsKeyFile) + if err != nil { + return nil, errors.New(errors.ERR_CONFIGURATION, "failed to load TLS certificate/key pair", err) + } + tlsConfig.Certificates = []tls.Certificate{cert} + } + + return tlsConfig, nil +} + +// loadSystemCertPool loads the system certificate pool +func loadSystemCertPool() *x509.CertPool { + pool, err := x509.SystemCertPool() + if err != nil { + return x509.NewCertPool() + } + return pool +} + +// franzLoggerAdapter adapts ulogger.Logger to franz-go's logger interface +type franzLoggerAdapter struct { + logger ulogger.Logger +} + +func (f *franzLoggerAdapter) Level() kgo.LogLevel { + return kgo.LogLevelDebug +} + +func (f *franzLoggerAdapter) Log(level kgo.LogLevel, msg string, keyvals ...interface{}) { + formatted := fmt.Sprintf("[FRANZ-GO] %s %v", msg, keyvals) + switch level { + case kgo.LogLevelError: + f.logger.Errorf(formatted) + case kgo.LogLevelWarn: + f.logger.Warnf(formatted) + case kgo.LogLevelInfo: + f.logger.Infof(formatted) + case kgo.LogLevelDebug: + f.logger.Debugf(formatted) + default: + f.logger.Infof(formatted) + } +} + +// stringPtr returns a pointer to the string +func stringPtr(s string) *string { + return &s +} diff --git a/util/kafka/kafka_producer_async_test.go b/util/kafka/kafka_producer_async_test.go index 212ceac5db..53ed0053f0 100644 --- a/util/kafka/kafka_producer_async_test.go +++ b/util/kafka/kafka_producer_async_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "github.com/IBM/sarama" "github.com/bsv-blockchain/teranode/ulogger" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -57,39 +56,6 @@ func TestKafkaAsyncProducerBrokersURLNilProducer(t *testing.T) { assert.Nil(t, result) } -func TestKafkaAsyncProducerDecodeKeyOrValue(t *testing.T) { - producer := &KafkaAsyncProducer{} - - tests := []struct { - name string - encoder sarama.Encoder - expected string - }{ - { - name: "Nil encoder", - encoder: nil, - expected: "", - }, - { - name: "Short data", - encoder: sarama.ByteEncoder("hello"), - expected: "68656c6c6f", - }, - { - name: "Long data gets truncated", - encoder: sarama.ByteEncoder(make([]byte, 100)), - expected: "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000... (truncated)", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := producer.decodeKeyOrValue(tt.encoder) - assert.Equal(t, tt.expected, result) - }) - } -} - func TestKafkaAsyncProducerStopNilProducer(t *testing.T) { var producer *KafkaAsyncProducer @@ -193,25 +159,26 @@ func TestNewKafkaAsyncProducerFromURLInvalidConversion(t *testing.T) { func TestNewKafkaAsyncProducerMemoryScheme(t *testing.T) { logger := &mockAsyncLogger{} cfg := KafkaProducerConfig{ - Logger: logger, - URL: &url.URL{Scheme: memoryScheme}, - Topic: "memory-topic", + Logger: logger, + URL: &url.URL{Scheme: memoryScheme, Path: "/memory-topic", Host: "localhost"}, + Topic: "memory-topic", + BrokersURL: []string{"localhost"}, } producer, err := NewKafkaAsyncProducer(logger, cfg) assert.NoError(t, err) assert.NotNil(t, producer) - assert.NotNil(t, producer.Producer) - assert.Equal(t, cfg, producer.Config) + assert.Equal(t, cfg.Topic, producer.Config.Topic) } func TestNewKafkaAsyncProducerWithKafkaSettings(t *testing.T) { logger := &mockAsyncLogger{} cfg := KafkaProducerConfig{ Logger: logger, - URL: &url.URL{Scheme: memoryScheme}, + URL: &url.URL{Scheme: memoryScheme, Path: "/test-topic", Host: "localhost"}, Topic: "test-topic", + BrokersURL: []string{"localhost"}, EnableTLS: false, TLSSkipVerify: false, EnableDebugLogging: false, diff --git a/util/kafka/kafka_producer_test.go b/util/kafka/kafka_producer_test.go index 6bed5d6f1d..790c73f00f 100644 --- a/util/kafka/kafka_producer_test.go +++ b/util/kafka/kafka_producer_test.go @@ -1,266 +1,60 @@ package kafka import ( - "context" - "encoding/binary" "fmt" "net" "net/url" "testing" - "github.com/IBM/sarama" - safeconversion "github.com/bsv-blockchain/go-safe-conversion" "github.com/bsv-blockchain/teranode/settings" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -// MockSyncProducer implements sarama.SyncProducer for testing -type MockSyncProducer struct { - messages []sarama.ProducerMessage - closed bool - sendErr error -} - -func (m *MockSyncProducer) SendMessage(msg *sarama.ProducerMessage) (partition int32, offset int64, err error) { - if m.sendErr != nil { - return 0, 0, m.sendErr - } - m.messages = append(m.messages, *msg) - return msg.Partition, int64(len(m.messages)), nil -} - -func (m *MockSyncProducer) SendMessages(msgs []*sarama.ProducerMessage) error { - for _, msg := range msgs { - _, _, err := m.SendMessage(msg) - if err != nil { - return err - } - } - return nil -} - -func (m *MockSyncProducer) Close() error { - m.closed = true - return nil -} - -func (m *MockSyncProducer) TxnStatus() sarama.ProducerTxnStatusFlag { - return sarama.ProducerTxnFlagReady -} - -func (m *MockSyncProducer) IsTransactional() bool { - return false -} - -func (m *MockSyncProducer) BeginTxn() error { - return nil -} - -func (m *MockSyncProducer) CommitTxn() error { - return nil -} - -func (m *MockSyncProducer) AbortTxn() error { - return nil -} - -func (m *MockSyncProducer) AddOffsetsToTxn(map[string][]*sarama.PartitionOffsetMetadata, string) error { - return nil -} - -func (m *MockSyncProducer) AddMessageToTxn(*sarama.ConsumerMessage, string, *string) error { - return nil -} - func TestSyncKafkaProducerClose(t *testing.T) { - mockProducer := &MockSyncProducer{} - producer := &SyncKafkaProducer{ - Producer: mockProducer, - Topic: "test-topic", - Partitions: 1, - } - - err := producer.Close() + kafkaURL, err := url.Parse("memory://localhost/test-topic") + require.NoError(t, err) + producer, err := NewKafkaProducer(kafkaURL, nil) + require.NoError(t, err) + require.NotNil(t, producer) + err = producer.Close() assert.NoError(t, err) - assert.True(t, mockProducer.closed) -} - -func TestSyncKafkaProducerGetClient(t *testing.T) { - mockProducer := &MockSyncProducer{} - mockClient := &mockConsumerGroup{} - producer := &SyncKafkaProducer{ - Producer: mockProducer, - Topic: "test-topic", - Partitions: 1, - client: mockClient, - } - - client := producer.GetClient() - - assert.Equal(t, mockClient, client) } func TestSyncKafkaProducerSend(t *testing.T) { - tests := []struct { - name string - partitions int32 - key []byte - data []byte - expectedTopic string - expectError bool - }{ - { - name: "Send with valid key and data", - partitions: 4, - key: []byte{0x01, 0x02, 0x03, 0x04}, - data: []byte("test message"), - expectedTopic: "test-topic", - expectError: false, - }, - { - name: "Send with single partition", - partitions: 1, - key: []byte{0xFF, 0xFF, 0xFF, 0xFF}, - data: []byte("single partition message"), - expectedTopic: "test-topic", - expectError: false, - }, - { - name: "Send with empty data", - partitions: 2, - key: []byte{0x00, 0x00, 0x00, 0x01}, - data: []byte{}, - expectedTopic: "test-topic", - expectError: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - mockProducer := &MockSyncProducer{} - producer := &SyncKafkaProducer{ - Producer: mockProducer, - Topic: tt.expectedTopic, - Partitions: tt.partitions, - } - - err := producer.Send(tt.key, tt.data) - - if tt.expectError { - assert.Error(t, err) - } else { - assert.NoError(t, err) - require.Len(t, mockProducer.messages, 1) - - msg := mockProducer.messages[0] - assert.Equal(t, tt.expectedTopic, msg.Topic) - assert.Equal(t, sarama.ByteEncoder(tt.key), msg.Key) - assert.Equal(t, sarama.ByteEncoder(tt.data), msg.Value) - - // Verify partition calculation - kPartitionsUint32, _ := safeconversion.Int32ToUint32(tt.partitions) - expectedPartition := binary.LittleEndian.Uint32(tt.key) % kPartitionsUint32 - expectedPartitionInt32, _ := safeconversion.Uint32ToInt32(expectedPartition) - assert.Equal(t, expectedPartitionInt32, msg.Partition) - } - }) - } -} - -func TestSyncKafkaProducerSendPartitionCalculation(t *testing.T) { - mockProducer := &MockSyncProducer{} - producer := &SyncKafkaProducer{ - Producer: mockProducer, - Topic: "test-topic", - Partitions: 3, - } - - testCases := []struct { - key []byte - expectedPartition int32 - }{ - {[]byte{0x00, 0x00, 0x00, 0x00}, 0}, // 0 % 3 = 0 - {[]byte{0x01, 0x00, 0x00, 0x00}, 1}, // 1 % 3 = 1 - {[]byte{0x02, 0x00, 0x00, 0x00}, 2}, // 2 % 3 = 2 - {[]byte{0x03, 0x00, 0x00, 0x00}, 0}, // 3 % 3 = 0 - {[]byte{0x04, 0x00, 0x00, 0x00}, 1}, // 4 % 3 = 1 - } - - for i, tc := range testCases { - err := producer.Send(tc.key, []byte("test")) - require.NoError(t, err) - - msg := mockProducer.messages[i] - assert.Equal(t, tc.expectedPartition, msg.Partition) - } -} - -func TestSyncKafkaProducerSendProducerError(t *testing.T) { - mockProducer := &MockSyncProducer{ - sendErr: assert.AnError, - } - producer := &SyncKafkaProducer{ - Producer: mockProducer, - Topic: "test-topic", - Partitions: 1, - } - - err := producer.Send([]byte{0x01, 0x00, 0x00, 0x00}, []byte("test")) - - assert.Error(t, err) - assert.Equal(t, assert.AnError, err) -} - -// Mock consumer group for testing GetClient -type mockConsumerGroup struct{} - -func (m *mockConsumerGroup) Consume(context.Context, []string, sarama.ConsumerGroupHandler) error { - return nil -} - -func (m *mockConsumerGroup) Errors() <-chan error { - return nil -} + kafkaURL, err := url.Parse("memory://localhost/test-topic?partitions=4") + require.NoError(t, err) + producer, err := NewKafkaProducer(kafkaURL, nil) + require.NoError(t, err) + require.NotNil(t, producer) + defer producer.Close() -func (m *mockConsumerGroup) Close() error { - return nil + err = producer.Send([]byte{0x01, 0x02, 0x03, 0x04}, []byte("test message")) + assert.NoError(t, err) } -func (m *mockConsumerGroup) Pause(map[string][]int32) {} -func (m *mockConsumerGroup) Resume(map[string][]int32) {} -func (m *mockConsumerGroup) PauseAll() {} -func (m *mockConsumerGroup) ResumeAll() {} - func TestNewKafkaProducer_MemoryScheme(t *testing.T) { kafkaURL, err := url.Parse("memory://localhost/test-topic?partitions=2") require.NoError(t, err) - clusterAdmin, producer, err := NewKafkaProducer(kafkaURL, nil) + producer, err := NewKafkaProducer(kafkaURL, nil) assert.NoError(t, err) - assert.Nil(t, clusterAdmin) // Memory scheme returns nil cluster admin assert.NotNil(t, producer) syncProducer, ok := producer.(*SyncKafkaProducer) require.True(t, ok) assert.Equal(t, "test-topic", syncProducer.Topic) - - // Note: Memory scheme producer has 0 partitions, so sending messages would cause divide by zero - // This is a limitation of the current implementation - the Send method should handle this case - // For now, just test that the producer was created correctly - assert.Equal(t, int32(0), syncProducer.Partitions) } func TestNewKafkaProducer_InvalidScheme(t *testing.T) { kafkaURL, err := url.Parse("kafka://invalid-broker:9092/test-topic") require.NoError(t, err) - clusterAdmin, producer, err := NewKafkaProducer(kafkaURL, nil) + producer, err := NewKafkaProducer(kafkaURL, nil) assert.Error(t, err) - assert.Nil(t, clusterAdmin) assert.Nil(t, producer) } @@ -273,17 +67,13 @@ func TestNewKafkaProducer_WithKafkaSettings(t *testing.T) { TLSSkipVerify: false, } - clusterAdmin, producer, err := NewKafkaProducer(kafkaURL, kafkaSettings) + producer, err := NewKafkaProducer(kafkaURL, kafkaSettings) assert.NoError(t, err) - assert.Nil(t, clusterAdmin) // Memory scheme returns nil cluster admin assert.NotNil(t, producer) } func TestConnectProducer_ConfigurationOptions(t *testing.T) { - // This test verifies that ConnectProducer would handle various configurations - // but since it tries to connect to real brokers, we can only test the parameter validation - // Use a port that's guaranteed to be closed to ensure connection failure addr, err := net.ResolveTCPAddr("tcp", "localhost:0") require.NoError(t, err) l, err := net.ListenTCP("tcp", addr) @@ -295,17 +85,20 @@ func TestConnectProducer_ConfigurationOptions(t *testing.T) { topic := "test-topic" partitions := int32(3) - // Test with default flush bytes + // franz-go may create the client lazily, so we may get either an error or a producer producer, err := ConnectProducer(brokers, topic, partitions, nil) + if err != nil { + assert.Nil(t, producer) + } else { + require.NotNil(t, producer) + _ = producer.Close() + } - // We expect an error since we're not connecting to a real broker - assert.Error(t, err) - assert.Nil(t, producer) - - // Test with custom flush bytes producer, err = ConnectProducer(brokers, topic, partitions, nil, 2048) - - // We expect an error since we're not connecting to a real broker - assert.Error(t, err) - assert.Nil(t, producer) + if err != nil { + assert.Nil(t, producer) + } else { + require.NotNil(t, producer) + _ = producer.Close() + } }