From d1544fd8369d3de1f23b2b96c602a8b1af76a63d Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Thu, 12 Oct 2023 10:37:39 -0700
Subject: [PATCH] tests: add client failover on defrag test case in integration and e2e

Signed-off-by: Chao Chen
---
 tests/e2e/failover_test.go            | 159 ++++++++++++++++++++++++++
 tests/integration/v3_failover_test.go |  87 ++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 tests/e2e/failover_test.go

diff --git a/tests/e2e/failover_test.go b/tests/e2e/failover_test.go
new file mode 100644
index 000000000000..8348611d5919
--- /dev/null
+++ b/tests/e2e/failover_test.go
@@ -0,0 +1,159 @@
+// Copyright 2023 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !cluster_proxy
+
+package e2e
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"google.golang.org/grpc"
+	_ "google.golang.org/grpc/health"
+
+	clientv3 "go.etcd.io/etcd/client/v3"
+	"go.etcd.io/etcd/tests/v3/framework/config"
+	"go.etcd.io/etcd/tests/v3/framework/e2e"
+)
+
+const (
+	// in sync with how kubernetes uses etcd
+	// https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71
+	keepaliveTime    = 30 * time.Second
+	keepaliveTimeout = 10 * time.Second
+	dialTimeout      = 20 * time.Second
+
+	clientRuntime = 10 * time.Second
+	// expect no more than 5 failed requests
+	failedRequests = 5
+)
+
+// TestFailover keeps a client issuing requests for clientRuntime while a
+// failure is injected into the first cluster member, and expects no more
+// than failedRequests of those requests to fail.
+func TestFailover(t *testing.T) {
+	tcs := []struct {
+		name           string
+		clusterOptions []e2e.EPClusterOption
+		failpointName  string
+		injectFailure  func(t *testing.T, member e2e.EtcdProcess, failpointName string)
+	}{
+		{
+			name:           "defrag",
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
+			failpointName:  "defragBeforeCopy",
+			injectFailure:  triggerDefrag,
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.name, func(t *testing.T) {
+			e2e.BeforeTest(t)
+			clus, cerr := e2e.NewEtcdProcessCluster(context.TODO(), t, tc.clusterOptions...)
+			require.NoError(t, cerr)
+			t.Cleanup(func() { clus.Stop() })
+			if !failpointAvailable(clus.Procs[0], tc.failpointName) {
+				t.Skipf("fail point %s is not available", tc.failpointName)
+			}
+
+			endpoints := clus.EndpointsGRPC()
+
+			cnt, success := 0, 0
+			donec := make(chan struct{})
+			errc := make(chan error, 1)
+
+			go func() {
+				var lastErr error
+				var cc *clientv3.Client
+				defer func() {
+					if cc != nil {
+						cc.Close()
+					}
+					errc <- lastErr
+					close(donec)
+					close(errc)
+				}()
+				cc, cerr := clientv3.New(clientv3.Config{
+					DialTimeout:          dialTimeout,
+					DialKeepAliveTime:    keepaliveTime,
+					DialKeepAliveTimeout: keepaliveTimeout,
+					Endpoints:            endpoints,
+					DialOptions: []grpc.DialOption{
+						grpc.WithDisableServiceConfig(),
+						grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
+					},
+				})
+				if cerr != nil {
+					// require's FailNow must run on the test goroutine, so hand
+					// the error to it through errc instead.
+					lastErr = cerr
+					return
+				}
+				timeout := time.After(clientRuntime)
+
+				for {
+					select {
+					case <-timeout:
+						return
+					default:
+					}
+					cctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+					_, err := cc.Get(cctx, "health")
+					cancel()
+					cnt++
+					if err != nil {
+						lastErr = err
+						continue
+					}
+					success++
+				}
+			}()
+
+			tc.injectFailure(t, clus.Procs[0], tc.failpointName)
+
+			<-donec
+			err, ok := <-errc
+			if ok && err != nil {
+				t.Logf("etcd client failed to fail over, error (%v)", err)
+			}
+			// cnt stays at zero only if the client could not be created
+			require.NotZero(t, cnt, "client did not issue any requests")
+			t.Logf("request failure rate is %.2f%%, traffic volume success %d requests, total %d requests", (1-float64(success)/float64(cnt))*100, success, cnt)
+			// expect no more than 5 failed requests
+			require.InDelta(t, cnt, success, failedRequests)
+		})
+	}
+}
+
+// triggerDefrag sets the failpoint failpointName to sleep for 8s on member
+// and then runs Defragment through the member's etcdctl.
+func triggerDefrag(t *testing.T, member e2e.EtcdProcess, failpointName string) {
+	err := member.Failpoints().SetupHTTP(context.Background(), failpointName, `sleep("8s")`)
+	require.NoError(t, err)
+	err = member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute})
+	require.NoError(t, err)
+}
+
+// failpointAvailable reports whether member exposes failpoints and the one
+// called name is available.
+func failpointAvailable(member e2e.EtcdProcess, name string) bool {
+	memberFailpoints := member.Failpoints()
+	if memberFailpoints == nil {
+		return false
+	}
+	return memberFailpoints.Available(name)
+}
diff --git a/tests/integration/v3_failover_test.go b/tests/integration/v3_failover_test.go
index 9d271bd9fa90..8dd1103a30c8 100644
--- a/tests/integration/v3_failover_test.go
+++ b/tests/integration/v3_failover_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/require"
 	"google.golang.org/grpc"
 
 	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
@@ -29,6 +30,18 @@ import (
 	clientv3test "go.etcd.io/etcd/tests/v3/integration/clientv3"
 )
 
+const (
+	// in sync with how kubernetes uses etcd
+	// https://github.com/kubernetes/kubernetes/blob/release-1.28/staging/src/k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go#L59-L71
+	keepaliveTime    = 30 * time.Second
+	keepaliveTimeout = 10 * time.Second
+	dialTimeout      = 20 * time.Second
+
+	clientRuntime = 10 * time.Second
+	// expect no more than 5 failed requests
+	failedRequests = 5
+)
+
 func TestFailover(t *testing.T) {
 	cases := []struct {
 		name     string
@@ -172,3 +185,77 @@ func shouldRetry(err error) bool {
 	}
 	return false
 }
+
+// TestFailoverOnDefrag defragments the member at endpoints[0] of a 3-member
+// cluster with the defragBeforeCopy failpoint enabled, and expects no more
+// than failedRequests of a concurrently running client's requests to fail.
+func TestFailoverOnDefrag(t *testing.T) {
+	integration2.BeforeTest(t, integration2.WithFailpoint("defragBeforeCopy", `sleep(10000)`))
+	clus := integration2.NewCluster(t, &integration2.ClusterConfig{Size: 3})
+	defer clus.Terminate(t)
+	endpoints := clus.Endpoints()
+
+	cnt, success := 0, 0
+	donec := make(chan struct{})
+	errc := make(chan error, 1)
+
+	go func() {
+		var lastErr error
+		var cc *clientv3.Client
+		defer func() {
+			if cc != nil {
+				cc.Close()
+			}
+			errc <- lastErr
+			close(donec)
+			close(errc)
+		}()
+		cc, cerr := clientv3.New(clientv3.Config{
+			DialTimeout:          dialTimeout,
+			DialKeepAliveTime:    keepaliveTime,
+			DialKeepAliveTimeout: keepaliveTimeout,
+			Endpoints:            endpoints,
+			DialOptions: []grpc.DialOption{
+				grpc.WithDisableServiceConfig(),
+				grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
+			},
+		})
+		if cerr != nil {
+			// require's FailNow must run on the test goroutine, so hand
+			// the error to it through errc instead.
+			lastErr = cerr
+			return
+		}
+		timeout := time.After(clientRuntime)
+
+		for {
+			select {
+			case <-timeout:
+				return
+			default:
+			}
+			cctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+			_, err := cc.Get(cctx, "health")
+			cancel()
+			cnt++
+			if err != nil {
+				lastErr = err
+				continue
+			}
+			success++
+		}
+	}()
+	_, err := clus.Client(0).Defragment(context.Background(), endpoints[0])
+	require.NoError(t, err)
+
+	<-donec
+	err, ok := <-errc
+	if ok && err != nil {
+		t.Logf("etcd client failed to fail over, error (%v)", err)
+	}
+	// cnt stays at zero only if the client could not be created
+	require.NotZero(t, cnt, "client did not issue any requests")
+	t.Logf("request failure rate is %.2f%%, traffic volume success %d requests, total %d requests", (1-float64(success)/float64(cnt))*100, success, cnt)
+	// expect no more than 5 failed requests
+	require.InDelta(t, cnt, success, failedRequests)
+}