Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clientconn.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ func (cc *ClientConn) addTraceEvent(msg string) {
Severity: channelz.CtInfo,
}
}
channelz.AddTraceEvent(logger, cc.channelz, 0, ted)
channelz.AddTraceEvent(logger, cc.channelz, 1, ted)
}

type idler ClientConn
Expand Down
34 changes: 34 additions & 0 deletions dial_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package grpc

import (
"context"
"fmt"
"net"
"strings"
"testing"
Expand Down Expand Up @@ -312,3 +313,36 @@ func (s) TestResolverAddressesWithTypedNilAttribute(t *testing.T) {
type stringerVal struct{ s string }

func (s stringerVal) String() string { return s.s }

const errResolverBuildercheme = "test-resolver-build-failure"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo


// errResolverBuilder is a resolver builder that returns an error from its Build
// method.
type errResolverBuilder struct {
err error
}

func (b *errResolverBuilder) Build(resolver.Target, resolver.ClientConn, resolver.BuildOptions) (resolver.Resolver, error) {
return nil, b.err
}

func (b *errResolverBuilder) Scheme() string {
return errResolverBuildercheme
}

// Tests that Dial returns an error if the resolver builder returns an error
// from its Build method.
func (s) TestDial_ResolverBuilder_Error(t *testing.T) {
resolverErr := fmt.Errorf("resolver builder error")
dopts := []DialOption{
WithTransportCredentials(insecure.NewCredentials()),
WithResolvers(&errResolverBuilder{err: resolverErr}),
}
_, err := Dial(errResolverBuildercheme+":///test.server", dopts...)
if err == nil {
t.Fatalf("Dial() succeeded when it should have failed")
}
if !strings.Contains(err.Error(), resolverErr.Error()) {
t.Fatalf("Dial() failed with error %v, want %v", err, resolverErr)
}
}
12 changes: 9 additions & 3 deletions internal/idle/idle.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,16 +234,22 @@ func (m *Manager) ExitIdleMode() error {
return nil
}

if err := m.enforcer.ExitIdleMode(); err != nil {
return fmt.Errorf("failed to exit idle mode: %w", err)
}
// This can fail if resolver creation fails. In that case, we want to
// return the error to the caller so that the RPC can fail. But we still
// need to undo the idle entry process, and ensure that the idle timer is
// started again.
err := m.enforcer.ExitIdleMode()

// Undo the idle entry process. This also respects any new RPC attempts.
atomic.AddInt32(&m.activeCallsCount, math.MaxInt32)
m.actuallyIdle = false

// Start a new timer to fire after the configured idle timeout.
m.resetIdleTimerLocked(m.timeout)

if err != nil {
return fmt.Errorf("failed to exit idle mode: %v", err)
}
return nil
}

Expand Down
43 changes: 42 additions & 1 deletion internal/idle/idle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package idle
import (
"context"
"fmt"
"strings"
"sync"
"sync/atomic"
"testing"
Expand All @@ -44,13 +45,14 @@ func Test(t *testing.T) {
}

type testEnforcer struct {
exitIdleErr error
exitIdleCh chan struct{}
enterIdleCh chan struct{}
}

func (ti *testEnforcer) ExitIdleMode() error {
ti.exitIdleCh <- struct{}{}
return nil
return ti.exitIdleErr

}

Expand Down Expand Up @@ -381,3 +383,42 @@ func (s) TestManager_IdleTimeoutRacesWithOnCallBegin(t *testing.T) {
})
}
}

// TestManager_ExitIdleError tests the case where ExitIdleMode on the enforcer
// returns an error. It verifies that the idle timer is started and the channel
// eventually attempts to enter idle mode.
func (s) TestManager_ExitIdleError(t *testing.T) {
callbackCh := overrideNewTimer(t)
exitIdleErr := fmt.Errorf("exit idle error")
enforcer := newTestEnforcer()
enforcer.exitIdleErr = exitIdleErr

mgr := NewManager(enforcer, defaultTestIdleTimeout)
defer mgr.Close()

// Call ExitIdleMode and expect it to fail.
if err := mgr.ExitIdleMode(); err == nil || !strings.Contains(err.Error(), "exit idle error") {
t.Fatalf("mgr.ExitIdleMode() returned: %v, want error: %v", err, exitIdleErr)
}

// Verify that ExitIdleMode was called on the enforcer.
select {
case <-enforcer.exitIdleCh:
case <-time.After(defaultTestShortTimeout):
t.Fatal("Timeout waiting for ExitIdleMode to be called on the enforcer")
}

// The timer should have been started. Wait for it to fire.
select {
case <-callbackCh:
case <-time.After(2 * defaultTestIdleTimeout):
t.Fatal("Timeout waiting for idle timer callback to fire")
}

// After the timer fires, the manager should attempt to enter idle mode.
select {
case <-enforcer.enterIdleCh:
case <-time.After(defaultTestShortTimeout):
t.Fatal("Timeout waiting for EnterIdleMode to be called on the enforcer")
}
}
24 changes: 24 additions & 0 deletions resolver_wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"strings"
"sync"

"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/internal/channelz"
"google.golang.org/grpc/internal/grpcsync"
"google.golang.org/grpc/internal/pretty"
Expand Down Expand Up @@ -79,6 +80,18 @@ func (ccr *ccResolverWrapper) start() error {
Authority: ccr.cc.authority,
MetricsRecorder: ccr.cc.metricsRecorderList,
}

// https://github.com/grpc/grpc/blob/master/doc/connectivity-semantics-and-api.md
// defines CONNECTING as follows:
// - The channel is trying to establish a connection and is waiting to
// make progress on one of the steps involved in name resolution, TCP
// connection establishment or TLS handshake. This may be used as the
// initial state for channels upon creation.
//
// We are starting the name resolver here as part of exiting IDLE, so
// transitioning to CONNECTING is the right thing to do.
Comment on lines +84 to +92
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO comments should be short and to the point.

Short comments make the code take up less space, which makes it easier to read and understand. Long comments make long functions extremely long and not fit on the page.

Honestly, I think a comment for this action isn't even necessary. But if you think we need one, this could be:

// Set state to CONNECTING before building the name resolver
// so the channel does not remain in IDLE.

ccr.cc.csMgr.updateState(connectivity.Connecting)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this (and the error handling below) go into exitIdleMode instead? It might make more sense to be in the channel directly, rather than in this sub-component.


var err error
// The delegating resolver is used unless:
// - A custom dialer is provided via WithContextDialer dialoption or
Expand All @@ -90,6 +103,17 @@ func (ccr *ccResolverWrapper) start() error {
} else {
ccr.resolver, err = delegatingresolver.New(ccr.cc.parsedTarget, ccr, opts, ccr.cc.resolverBuilder, ccr.cc.dopts.enableLocalDNSResolution)
}

// If resolver creation fails, transition to TransientFailure. This is
// useful for channels created using `NewClient` and the returned error
// will be returned to the user when they try to make the first RPC.
// This is also useful when a channel is exiting IDLE state.
//
// The returned error will be returned to channels created using `Dial`.
if err != nil {
logger.Warningf("Failed to start resolver: %v", err)
ccr.cc.csMgr.updateState(connectivity.TransientFailure)
}
errCh <- err
})
return <-errCh
Expand Down
115 changes: 109 additions & 6 deletions test/clientconn_state_transition_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,8 +583,6 @@ func (s) TestConnectivityStateSubscriber(t *testing.T) {
// Test verifies that a channel starts off in IDLE and transitions to CONNECTING
// when Connect() is called, and stays there when there are no resolver updates.
func (s) TestStateTransitions_WithConnect_NoResolverUpdate(t *testing.T) {
t.Skip("The channel remains in IDLE until the LB policy updates the state to CONNECTING. This is a bug and the channel should transition to CONNECTING as soon as Connect() is called. See issue #7686.")

backend := stubserver.StartTestService(t, nil)
defer backend.Stop()

Expand Down Expand Up @@ -618,8 +616,6 @@ func (s) TestStateTransitions_WithConnect_NoResolverUpdate(t *testing.T) {
// Test verifies that a channel starts off in IDLE and transitions to CONNECTING
// when Connect() is called, and stays there when there are no resolver updates.
func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) {
t.Skip("The channel remains in IDLE until the LB policy updates the state to CONNECTING. This is a bug and the channel should transition to CONNECTING as soon as an RPC call is made. See issue #7686.")

backend := stubserver.StartTestService(t, nil)
defer backend.Stop()

Expand All @@ -641,8 +637,7 @@ func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) {

// Make an RPC call to transition the channel to CONNECTING.
go func() {
_, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{})
if err == nil {
if _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}); err == nil {
t.Errorf("Expected RPC to fail, but it succeeded")
}
}()
Expand All @@ -656,3 +651,111 @@ func (s) TestStateTransitions_WithRPC_NoResolverUpdate(t *testing.T) {
defer shortCancel()
testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Connecting)
}

const testResolverBuildFailureScheme = "test-resolver-build-failure"

// testResolverBuilder is a resolver builder that fails the first time its
// Build method is called, and succeeds thereafter.
type testResolverBuilder struct {
logger interface {
Logf(format string, args ...any)
}
buildCalled bool
manualR *manual.Resolver
}

func (b *testResolverBuilder) Build(target resolver.Target, cc resolver.ClientConn, opts resolver.BuildOptions) (resolver.Resolver, error) {
b.logger.Logf("testResolverBuilder: Build called with target: %v", target)
if !b.buildCalled {
b.buildCalled = true
b.logger.Logf("testResolverBuilder: returning build failure")
return nil, fmt.Errorf("simulated resolver build failure")
}
return b.manualR.Build(target, cc, opts)
}

func (b *testResolverBuilder) Scheme() string {
return testResolverBuildFailureScheme
}

// Tests for state transitions when the resolver initially fails to build.
func (s) TestStateTransitions_ResolverBuildFailure(t *testing.T) {
tests := []struct {
name string
exitIdleFunc func(ctx context.Context, cc *grpc.ClientConn) error
}{
{
name: "exitIdleByConnecting",
exitIdleFunc: func(_ context.Context, cc *grpc.ClientConn) error {
cc.Connect()
return nil
},
},
{
name: "exitIdleByRPC",
exitIdleFunc: func(ctx context.Context, cc *grpc.ClientConn) error {
if _, err := testgrpc.NewTestServiceClient(cc).EmptyCall(ctx, &testpb.Empty{}); err != nil {
return fmt.Errorf("EmptyCall RPC failed: %v", err)
}
return nil
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mr := manual.NewBuilderWithScheme("whatever" + tt.name)
backend := stubserver.StartTestService(t, nil)
defer backend.Stop()
mr.InitialState(resolver.State{Addresses: []resolver.Address{{Addr: backend.Address}}})

dopts := []grpc.DialOption{
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithResolvers(&testResolverBuilder{logger: t, manualR: mr}),
grpc.WithIdleTimeout(time.Second),
}

cc, err := grpc.NewClient(testResolverBuildFailureScheme+":///", dopts...)
if err != nil {
t.Fatalf("Failed to create new client: %v", err)
}
defer cc.Close()

// Ensure that the client is in IDLE before connecting.
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()
testutils.AwaitState(ctx, t, cc, connectivity.Idle)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't need an Await right? It should just check the current state, and never wait for changes, as we know it starts idle.


// Subscribe to state updates.
stateCh := make(chan connectivity.State, 1)
s := &funcConnectivityStateSubscriber{
onMsg: func(s connectivity.State) {
select {
case stateCh <- s:
case <-ctx.Done():
}
},
}
internal.SubscribeToConnectivityStateChanges.(func(cc *grpc.ClientConn, s grpcsync.Subscriber) func())(cc, s)

if state := cc.GetState(); state != connectivity.Idle {
t.Fatalf("Expected initial state to be IDLE, got %v", state)
}
Comment on lines +740 to +742
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The AwaitState above already tested this IIUC


cc.Connect()
wantStates := []connectivity.State{
connectivity.Connecting, // When channel exits IDLE for the first time.
connectivity.TransientFailure, // Resolver build failure.
connectivity.Idle, // After idle timeout.
connectivity.Connecting, // When channel exits IDLE again.
connectivity.Ready, // Successful resolver build and connection to backend.
}
for _, wantState := range wantStates {
waitForState(ctx, t, stateCh, wantState)
if wantState == connectivity.Idle {
tt.exitIdleFunc(ctx, cc)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make this test the actual RPC error when we use an RPC to exit idle?

}
}

})
}
}