diff --git a/clientconn.go b/clientconn.go index c0c2c9a76abf..ab6fd60577ec 100644 --- a/clientconn.go +++ b/clientconn.go @@ -332,7 +332,7 @@ func (cc *ClientConn) addTraceEvent(msg string) { Severity: channelz.CtInfo, } } - channelz.AddTraceEvent(logger, cc.channelz, 0, ted) + channelz.AddTraceEvent(logger, cc.channelz, 1, ted) } type idler ClientConn @@ -356,10 +356,21 @@ func (cc *ClientConn) exitIdleMode() (err error) { } cc.mu.Unlock() + // Set state to CONNECTING before building the name resolver + // so the channel does not remain in IDLE. + cc.csMgr.updateState(connectivity.Connecting) + // This needs to be called without cc.mu because this builds a new resolver // which might update state or report error inline, which would then need to // acquire cc.mu. if err := cc.resolverWrapper.start(); err != nil { + // If resolver creation fails, transition to TransientFailure. For a + // channel created with `NewClient`, the error will be returned on the + // first RPC. For a channel created with `Dial`, the error will be + // returned by `Dial`. + logger.Warningf("Failed to start resolver: %v", err) + cc.csMgr.updateState(connectivity.TransientFailure) + return err } diff --git a/dial_test.go b/dial_test.go index cb86f330d5cf..7b74aac79fa6 100644 --- a/dial_test.go +++ b/dial_test.go @@ -20,6 +20,7 @@ package grpc import ( "context" + "fmt" "net" "strings" "testing" @@ -312,3 +313,36 @@ func (s) TestResolverAddressesWithTypedNilAttribute(t *testing.T) { type stringerVal struct{ s string } func (s stringerVal) String() string { return s.s } + +const errResolverBuilderScheme = "test-resolver-build-failure" + +// errResolverBuilder is a resolver builder that returns an error from its Build +// method. +type errResolverBuilder struct { + err error +} + +func (b *errResolverBuilder) Build(resolver.Target, resolver.ClientConn, resolver.BuildOptions) (resolver.Resolver, error) { + return nil, b.err +} + +func (b *errResolverBuilder) Scheme() string { + return errResolverBuilderScheme +} + +// Tests that Dial returns an error if the resolver builder returns an error +// from its Build method. +func (s) TestDial_ResolverBuilder_Error(t *testing.T) { + resolverErr := fmt.Errorf("resolver builder error") + dopts := []DialOption{ + WithTransportCredentials(insecure.NewCredentials()), + WithResolvers(&errResolverBuilder{err: resolverErr}), + } + _, err := Dial(errResolverBuilderScheme+":///test.server", dopts...) + if err == nil { + t.Fatalf("Dial() succeeded when it should have failed") + } + if !strings.Contains(err.Error(), resolverErr.Error()) { + t.Fatalf("Dial() failed with error %v, want %v", err, resolverErr) + } +} diff --git a/internal/idle/idle.go b/internal/idle/idle.go index 2c13ee9dac75..091ed18523c0 100644 --- a/internal/idle/idle.go +++ b/internal/idle/idle.go @@ -234,9 +234,11 @@ func (m *Manager) ExitIdleMode() error { return nil } - if err := m.enforcer.ExitIdleMode(); err != nil { - return fmt.Errorf("failed to exit idle mode: %w", err) - } + // This can fail if resolver creation fails. In that case, we want to + // return the error to the caller so that the RPC can fail. But we still + // need to undo the idle entry process, and ensure that the idle timer is + // started again. + err := m.enforcer.ExitIdleMode() // Undo the idle entry process. This also respects any new RPC attempts. atomic.AddInt32(&m.activeCallsCount, math.MaxInt32) @@ -244,6 +246,10 @@ func (m *Manager) ExitIdleMode() error { // Start a new timer to fire after the configured idle timeout. m.resetIdleTimerLocked(m.timeout) + + if err != nil { + return fmt.Errorf("failed to exit idle mode: %v", err) + } return nil } diff --git a/internal/idle/idle_test.go b/internal/idle/idle_test.go index c2645bb95c05..9715a480cbd8 100644 --- a/internal/idle/idle_test.go +++ b/internal/idle/idle_test.go @@ -21,6 +21,7 @@ package idle import ( "context" "fmt" + "strings" "sync" "sync/atomic" "testing" @@ -44,13 +45,14 @@ func Test(t *testing.T) { } type testEnforcer struct { + exitIdleErr error exitIdleCh chan struct{} enterIdleCh chan struct{} } func (ti *testEnforcer) ExitIdleMode() error { ti.exitIdleCh <- struct{}{} - return nil + return ti.exitIdleErr } @@ -381,3 +383,42 @@ func (s) TestManager_IdleTimeoutRacesWithOnCallBegin(t *testing.T) { }) } } + +// TestManager_ExitIdleError tests the case where ExitIdleMode on the enforcer +// returns an error. It verifies that the idle timer is started and the channel +// eventually attempts to enter idle mode. +func (s) TestManager_ExitIdleError(t *testing.T) { + callbackCh := overrideNewTimer(t) + exitIdleErr := fmt.Errorf("exit idle error") + enforcer := newTestEnforcer() + enforcer.exitIdleErr = exitIdleErr + + mgr := NewManager(enforcer, defaultTestIdleTimeout) + defer mgr.Close() + + // Call ExitIdleMode and expect it to fail. + if err := mgr.ExitIdleMode(); err == nil || !strings.Contains(err.Error(), "exit idle error") { + t.Fatalf("mgr.ExitIdleMode() returned: %v, want error: %v", err, exitIdleErr) + } + + // Verify that ExitIdleMode was called on the enforcer. + select { + case <-enforcer.exitIdleCh: + case <-time.After(defaultTestShortTimeout): + t.Fatal("Timeout waiting for ExitIdleMode to be called on the enforcer") + } + + // The timer should have been started. Wait for it to fire. + select { + case <-callbackCh: + case <-time.After(2 * defaultTestIdleTimeout): + t.Fatal("Timeout waiting for idle timer callback to fire") + } + + // After the timer fires, the manager should attempt to enter idle mode. + select { + case <-enforcer.enterIdleCh: + case <-time.After(defaultTestShortTimeout): + t.Fatal("Timeout waiting for EnterIdleMode to be called on the enforcer") + } +} diff --git a/resolver_wrapper.go b/resolver_wrapper.go index 80e16a327cd3..e69a938a488b 100644 --- a/resolver_wrapper.go +++ b/resolver_wrapper.go @@ -79,6 +79,7 @@ func (ccr *ccResolverWrapper) start() error { Authority: ccr.cc.authority, MetricsRecorder: ccr.cc.metricsRecorderList, } + var err error // The delegating resolver is used unless: // - A custom dialer is provided via WithContextDialer dialoption or diff --git a/test/clientconn_state_transition_test.go b/test/clientconn_state_transition_test.go index de4e5b3d6894..82e55bc335d9 100644 --- a/test/clientconn_state_transition_test.go +++ b/test/clientconn_state_transition_test.go @@ -23,6 +23,7 @@ import ( "fmt" "io" "net" + "strings" "sync" "testing" "time" @@ -583,8 +584,6 @@ func (s) TestConnectivityStateSubscriber(t *testing.T) { // Test verifies that a channel starts off in IDLE and transitions to CONNECTING // when Connect() is called, and stays there when there are no resolver updates. func (s) TestStateTransitions_WithConnect_NoResolverUpdate(t *testing.T) { - t.Skip("The channel remains in IDLE until the LB policy updates the state to CONNECTING. This is a bug and the channel should transition to CONNECTING as soon as Connect() is called. See issue #7686.") - backend := stubserver.StartTestService(t, nil) defer backend.Stop() @@ -618,8 +617,6 @@ func (s) TestStateTransitions_WithConnect_NoResolverUpdate(t *testing.T) { // Test verifies that a channel starts off in IDLE and transitions to CONNECTING // when Connect() is called, and stays there when there are no resolver updates. func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) { - t.Skip("The channel remains in IDLE until the LB policy updates the state to CONNECTING. This is a bug and the channel should transition to CONNECTING as soon as an RPC call is made. See issue #7686.") - backend := stubserver.StartTestService(t, nil) defer backend.Stop() @@ -641,8 +638,7 @@ func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) { // Make an RPC call to transition the channel to CONNECTING. go func() { - _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}) - if err == nil { + if _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}); err == nil { t.Errorf("Expected RPC to fail, but it succeeded") } }() @@ -656,3 +652,116 @@ func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) { defer shortCancel() testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Connecting) } + +const testResolverBuildFailureScheme = "test-resolver-build-failure" + +// testResolverBuilder is a resolver builder that fails the first time its +// Build method is called, and succeeds thereafter. +type testResolverBuilder struct { + logger interface { + Logf(format string, args ...any) + } + buildCalled bool + manualR *manual.Resolver +} + +func (b *testResolverBuilder) Build(target resolver.Target, cc resolver.ClientConn, opts resolver.BuildOptions) (resolver.Resolver, error) { + b.logger.Logf("testResolverBuilder: Build called with target: %v", target) + if !b.buildCalled { + b.buildCalled = true + b.logger.Logf("testResolverBuilder: returning build failure") + return nil, fmt.Errorf("simulated resolver build failure") + } + return b.manualR.Build(target, cc, opts) +} + +func (b *testResolverBuilder) Scheme() string { + return testResolverBuildFailureScheme +} + +// Tests for state transitions when the resolver initially fails to build. +func (s) TestStateTransitions_ResolverBuildFailure(t *testing.T) { + tests := []struct { + name string + exitIdleWithRPC bool + }{ + { + name: "exitIdleByConnecting", + exitIdleWithRPC: false, + }, + { + name: "exitIdleByRPC", + exitIdleWithRPC: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mr := manual.NewBuilderWithScheme("whatever" + tt.name) + backend := stubserver.StartTestService(t, nil) + defer backend.Stop() + mr.InitialState(resolver.State{Addresses: []resolver.Address{{Addr: backend.Address}}}) + + dopts := []grpc.DialOption{ + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithResolvers(&testResolverBuilder{logger: t, manualR: mr}), + grpc.WithIdleTimeout(time.Second), + } + + cc, err := grpc.NewClient(testResolverBuildFailureScheme+":///", dopts...) + if err != nil { + t.Fatalf("Failed to create new client: %v", err) + } + defer cc.Close() + + // Ensure that the client is in IDLE before connecting. + if state := cc.GetState(); state != connectivity.Idle { + t.Fatalf("Expected initial state to be IDLE, got %v", state) + } + + // Subscribe to state updates. + ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + stateCh := make(chan connectivity.State, 1) + s := &funcConnectivityStateSubscriber{ + onMsg: func(s connectivity.State) { + select { + case stateCh <- s: + case <-ctx.Done(): + } + }, + } + internal.SubscribeToConnectivityStateChanges.(func(cc *grpc.ClientConn, s grpcsync.Subscriber) func())(cc, s) + + if tt.exitIdleWithRPC { + // The first attempt to kick the channel is expected to return + // the resolver build error to the RPC. + const wantErr = "simulated resolver build failure" + if _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}); err == nil || !strings.Contains(err.Error(), wantErr) { + t.Fatalf("EmptyCall RPC failed with error: %q, want %q", err, wantErr) + } + } else { + cc.Connect() + } + + wantStates := []connectivity.State{ + connectivity.Connecting, // When channel exits IDLE for the first time. + connectivity.TransientFailure, // Resolver build failure. + connectivity.Idle, // After idle timeout. + connectivity.Connecting, // When channel exits IDLE again. + connectivity.Ready, // Successful resolver build and connection to backend. + } + for _, wantState := range wantStates { + waitForState(ctx, t, stateCh, wantState) + if wantState == connectivity.Idle { + if tt.exitIdleWithRPC { + if _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}); err != nil { + t.Fatalf("EmptyCall RPC failed: %v", err) + } + } else { + cc.Connect() + } + } + } + }) + } +}