Skip to content

Commit ab8c5a3

Browse files
jolshanmanoj-mathivanan
authored andcommitted
KAFKA-18660: Transactions Version 2 doesn't handle epoch overflow correctly (apache#18730)
Fixed the typo that used the wrong producer ID and epoch when returning so that we handle epoch overflow correctly. We also had to rearrange the concurrent transaction handling so that we don't self-fence when we start the new transaction with the new producer ID. I also tested this with a modified version of the code where epoch overflow happens on the first epoch bump (every request has a new producer id) Reviewers: Artem Livshits <[email protected]>, Jeff Kim <[email protected]>
1 parent 0eba83b commit ab8c5a3

File tree

3 files changed

+97
-19
lines changed

3 files changed

+97
-19
lines changed

core/src/main/scala/kafka/coordinator/transaction/TransactionCoordinator.scala

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -408,13 +408,16 @@ class TransactionCoordinator(txnConfig: TransactionConfig,
408408

409409
// generate the new transaction metadata with added partitions
410410
txnMetadata.inLock {
411-
if (txnMetadata.producerId != producerId) {
411+
if (txnMetadata.pendingTransitionInProgress) {
412+
// return a retriable exception to let the client backoff and retry
413+
// This check is performed first so that the pending transition can complete before subsequent checks.
414+
// With TV2, we may be transitioning over a producer epoch overflow, and the producer may be using the
415+
// new producer ID that is still only in pending state.
416+
Left(Errors.CONCURRENT_TRANSACTIONS)
417+
} else if (txnMetadata.producerId != producerId) {
412418
Left(Errors.INVALID_PRODUCER_ID_MAPPING)
413419
} else if (txnMetadata.producerEpoch != producerEpoch) {
414420
Left(Errors.PRODUCER_FENCED)
415-
} else if (txnMetadata.pendingTransitionInProgress) {
416-
// return a retriable exception to let the client backoff and retry
417-
Left(Errors.CONCURRENT_TRANSACTIONS)
418421
} else if (txnMetadata.state == PrepareCommit || txnMetadata.state == PrepareAbort) {
419422
Left(Errors.CONCURRENT_TRANSACTIONS)
420423
} else if (txnMetadata.state == Ongoing && partitions.subsetOf(txnMetadata.topicPartitions)) {
@@ -812,10 +815,13 @@ class TransactionCoordinator(txnConfig: TransactionConfig,
812815
}
813816
}
814817

815-
if (txnMetadata.producerId != producerId && !retryOnOverflow)
816-
Left(Errors.INVALID_PRODUCER_ID_MAPPING)
817-
else if (txnMetadata.pendingTransitionInProgress && txnMetadata.pendingState.get != PrepareEpochFence)
818+
if (txnMetadata.pendingTransitionInProgress && txnMetadata.pendingState.get != PrepareEpochFence) {
819+
// This check is performed first so that the pending transition can complete before the next checks.
820+
// With TV2, we may be transitioning over a producer epoch overflow, and the producer may be using the
821+
// new producer ID that is still only in pending state.
818822
Left(Errors.CONCURRENT_TRANSACTIONS)
823+
} else if (txnMetadata.producerId != producerId && !retryOnOverflow)
824+
Left(Errors.INVALID_PRODUCER_ID_MAPPING)
819825
else if (!isValidEpoch)
820826
Left(Errors.PRODUCER_FENCED)
821827
else txnMetadata.state match {
@@ -940,7 +946,7 @@ class TransactionCoordinator(txnConfig: TransactionConfig,
940946
case Right((txnMetadata, newPreSendMetadata)) =>
941947
// we can respond to the client immediately and continue to write the txn markers if
942948
// the log append was successful
943-
responseCallback(Errors.NONE, txnMetadata.producerId, txnMetadata.producerEpoch)
949+
responseCallback(Errors.NONE, newPreSendMetadata.producerId, newPreSendMetadata.producerEpoch)
944950

945951
txnMarkerChannelManager.addTxnMarkersToSend(coordinatorEpoch, txnMarkerResult, txnMetadata, newPreSendMetadata)
946952
}

core/src/test/scala/integration/kafka/api/TransactionsTest.scala

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import kafka.utils.{TestInfoUtils, TestUtils}
2222
import org.apache.kafka.clients.consumer._
2323
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
2424
import org.apache.kafka.common.TopicPartition
25-
import org.apache.kafka.common.errors.{InvalidProducerEpochException, ProducerFencedException, TimeoutException}
25+
import org.apache.kafka.common.errors.{ConcurrentTransactionsException, InvalidProducerEpochException, ProducerFencedException, TimeoutException}
2626
import org.apache.kafka.common.test.api.Flaky
2727
import org.apache.kafka.coordinator.group.GroupCoordinatorConfig
2828
import org.apache.kafka.coordinator.transaction.{TransactionLogConfig, TransactionStateManagerConfig}
@@ -617,15 +617,20 @@ class TransactionsTest extends IntegrationTestHarness {
617617
// Wait for the expiration cycle to kick in.
618618
Thread.sleep(600)
619619

620-
try {
621-
// Now that the transaction has expired, the second send should fail with a InvalidProducerEpochException.
622-
producer.send(TestUtils.producerRecordWithExpectedTransactionStatus(topic1, null, "2", "2", willBeCommitted = false)).get()
623-
fail("should have raised a InvalidProducerEpochException since the transaction has expired")
624-
} catch {
625-
case _: InvalidProducerEpochException =>
626-
case e: ExecutionException =>
627-
assertTrue(e.getCause.isInstanceOf[InvalidProducerEpochException])
628-
}
620+
TestUtils.waitUntilTrue(() => {
621+
var foundException = false
622+
try {
623+
// Now that the transaction has expired, the second send should fail with a InvalidProducerEpochException. We may see some concurrentTransactionsExceptions.
624+
producer.send(TestUtils.producerRecordWithExpectedTransactionStatus(topic1, null, "2", "2", willBeCommitted = false)).get()
625+
fail("should have raised an error due to concurrent transactions or invalid producer epoch")
626+
} catch {
627+
case _: ConcurrentTransactionsException =>
628+
case _: InvalidProducerEpochException =>
629+
case e: ExecutionException =>
630+
foundException = e.getCause.isInstanceOf[InvalidProducerEpochException]
631+
}
632+
foundException
633+
}, "Never returned the expected InvalidProducerEpochException")
629634

630635
// Verify that the first message was aborted and the second one was never written at all.
631636
val nonTransactionalConsumer = nonTransactionalConsumers.head

core/src/test/scala/unit/kafka/coordinator/transaction/TransactionCoordinatorTest.scala

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,40 @@ class TransactionCoordinatorTest {
182182
assertEquals(Errors.NONE, result.error)
183183
}
184184

185+
@Test
186+
def shouldGenerateNewProducerIdIfEpochsExhaustedV2(): Unit = {
187+
initPidGenericMocks(transactionalId)
188+
189+
val txnMetadata1 = new TransactionMetadata(transactionalId, producerId, producerId, RecordBatch.NO_PRODUCER_ID, (Short.MaxValue - 1).toShort,
190+
(Short.MaxValue - 2).toShort, txnTimeoutMs, Ongoing, mutable.Set.empty, time.milliseconds(), time.milliseconds(), TV_2)
191+
// We start with txnMetadata1 so we can transform the metadata to PrepareCommit.
192+
val txnMetadata2 = new TransactionMetadata(transactionalId, producerId, producerId, RecordBatch.NO_PRODUCER_ID, (Short.MaxValue - 1).toShort,
193+
(Short.MaxValue - 2).toShort, txnTimeoutMs, Ongoing, mutable.Set.empty, time.milliseconds(), time.milliseconds(), TV_2)
194+
val transitMetadata = txnMetadata2.prepareAbortOrCommit(PrepareCommit, TV_2, producerId2, time.milliseconds(), false)
195+
txnMetadata2.completeTransitionTo(transitMetadata)
196+
197+
assertEquals(producerId, txnMetadata2.producerId)
198+
assertEquals(Short.MaxValue, txnMetadata2.producerEpoch)
199+
200+
when(transactionManager.getTransactionState(ArgumentMatchers.eq(transactionalId)))
201+
.thenReturn(Right(Some(CoordinatorEpochAndTxnMetadata(coordinatorEpoch, txnMetadata1))))
202+
.thenReturn(Right(Some(CoordinatorEpochAndTxnMetadata(coordinatorEpoch, txnMetadata2))))
203+
204+
when(transactionManager.appendTransactionToLog(
205+
ArgumentMatchers.eq(transactionalId),
206+
ArgumentMatchers.eq(coordinatorEpoch),
207+
any[TxnTransitMetadata],
208+
capturedErrorsCallback.capture(),
209+
any(),
210+
any()
211+
)).thenAnswer(_ => capturedErrorsCallback.getValue.apply(Errors.NONE))
212+
213+
coordinator.handleEndTransaction(transactionalId, producerId, (Short.MaxValue - 1).toShort, TransactionResult.COMMIT, TV_2, endTxnCallback)
214+
assertEquals(producerId2, newProducerId)
215+
assertEquals(0, newEpoch)
216+
assertEquals(Errors.NONE, error)
217+
}
218+
185219
@Test
186220
def shouldRespondWithNotCoordinatorOnInitPidWhenNotCoordinator(): Unit = {
187221
when(transactionManager.validateTransactionTimeoutMs(anyInt()))
@@ -519,7 +553,7 @@ class TransactionCoordinatorTest {
519553
.thenReturn(Right(Some(CoordinatorEpochAndTxnMetadata(coordinatorEpoch, txnMetadata))))
520554

521555
val nextProducerEpoch = if (isRetry) producerEpoch - 1 else producerEpoch
522-
coordinator.handleEndTransaction(transactionalId, producerId, nextProducerEpoch.toShort , TransactionResult.ABORT, clientTransactionVersion, endTxnCallback)
556+
coordinator.handleEndTransaction(transactionalId, producerId, nextProducerEpoch.toShort, TransactionResult.ABORT, clientTransactionVersion, endTxnCallback)
523557
if (isRetry) {
524558
assertEquals(Errors.PRODUCER_FENCED, error)
525559
} else {
@@ -770,6 +804,39 @@ class TransactionCoordinatorTest {
770804
verify(transactionManager, times(2)).getTransactionState(ArgumentMatchers.eq(transactionalId))
771805
}
772806

807+
@Test
808+
def shouldReturnConcurrentTxnOnAddPartitionsIfEndTxnV2EpochOverflowAndNotComplete(): Unit = {
809+
val prepareWithPending = new TransactionMetadata(transactionalId, producerId, producerId,
810+
producerId2, Short.MaxValue, (Short.MaxValue - 1).toShort, 1, PrepareCommit, collection.mutable.Set.empty[TopicPartition], 0, time.milliseconds(), TV_2)
811+
val txnTransitMetadata = prepareWithPending.prepareComplete(time.milliseconds())
812+
813+
when(transactionManager.getTransactionState(ArgumentMatchers.eq(transactionalId)))
814+
.thenReturn(Right(Some(CoordinatorEpochAndTxnMetadata(coordinatorEpoch, prepareWithPending))))
815+
816+
// Return CONCURRENT_TRANSACTIONS while transaction is still completing
817+
coordinator.handleAddPartitionsToTransaction(transactionalId, producerId2, 0, partitions, errorsCallback, TV_2)
818+
assertEquals(Errors.CONCURRENT_TRANSACTIONS, error)
819+
verify(transactionManager).getTransactionState(ArgumentMatchers.eq(transactionalId))
820+
821+
prepareWithPending.completeTransitionTo(txnTransitMetadata)
822+
assertEquals(CompleteCommit, prepareWithPending.state)
823+
when(transactionManager.getTransactionState(ArgumentMatchers.eq(transactionalId)))
824+
.thenReturn(Right(Some(CoordinatorEpochAndTxnMetadata(coordinatorEpoch, prepareWithPending))))
825+
when(transactionManager.appendTransactionToLog(
826+
ArgumentMatchers.eq(transactionalId),
827+
ArgumentMatchers.eq(coordinatorEpoch),
828+
any[TxnTransitMetadata],
829+
capturedErrorsCallback.capture(),
830+
any(),
831+
any())
832+
).thenAnswer(_ => capturedErrorsCallback.getValue.apply(Errors.NONE))
833+
834+
coordinator.handleAddPartitionsToTransaction(transactionalId, producerId2, 0, partitions, errorsCallback, TV_2)
835+
836+
assertEquals(Errors.NONE, error)
837+
verify(transactionManager, times(2)).getTransactionState(ArgumentMatchers.eq(transactionalId))
838+
}
839+
773840
@ParameterizedTest
774841
@ValueSource(shorts = Array(0, 2))
775842
def shouldAppendPrepareCommitToLogOnEndTxnWhenStatusIsOngoingAndResultIsCommit(transactionVersion: Short): Unit = {

0 commit comments

Comments
 (0)