Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cluster id check for hash kv handler #15924

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/v3rpc/rpctypes/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ var (
ErrGRPCMemberNotLearner = status.Error(codes.FailedPrecondition, "etcdserver: can only promote a learner member")
ErrGRPCLearnerNotReady = status.Error(codes.FailedPrecondition, "etcdserver: can only promote a learner member which is in sync with leader")
ErrGRPCTooManyLearners = status.Error(codes.FailedPrecondition, "etcdserver: too many learner members in cluster")
ErrGRPCClusterIdMismatch = status.Error(codes.FailedPrecondition, "etcdserver: cluster ID mismatch")

ErrGRPCRequestTooLarge = status.Error(codes.InvalidArgument, "etcdserver: request is too large")
ErrGRPCRequestTooManyRequests = status.Error(codes.ResourceExhausted, "etcdserver: too many requests")
Expand Down Expand Up @@ -117,6 +118,7 @@ var (
ErrorDesc(ErrGRPCMemberNotLearner): ErrGRPCMemberNotLearner,
ErrorDesc(ErrGRPCLearnerNotReady): ErrGRPCLearnerNotReady,
ErrorDesc(ErrGRPCTooManyLearners): ErrGRPCTooManyLearners,
ErrorDesc(ErrGRPCClusterIdMismatch): ErrGRPCClusterIdMismatch,

ErrorDesc(ErrGRPCRequestTooLarge): ErrGRPCRequestTooLarge,
ErrorDesc(ErrGRPCRequestTooManyRequests): ErrGRPCRequestTooManyRequests,
Expand Down Expand Up @@ -204,6 +206,7 @@ var (
ErrInvalidAuthToken = Error(ErrGRPCInvalidAuthToken)
ErrAuthOldRevision = Error(ErrGRPCAuthOldRevision)
ErrInvalidAuthMgmt = Error(ErrGRPCInvalidAuthMgmt)
ErrClusterIdMismatch = Error(ErrGRPCClusterIdMismatch)

ErrNoLeader = Error(ErrGRPCNoLeader)
ErrNotLeader = Error(ErrGRPCNotLeader)
Expand Down
4 changes: 2 additions & 2 deletions server/etcdserver/api/rafthttp/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ var (
RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")

errIncompatibleVersion = errors.New("incompatible version")
errClusterIDMismatch = errors.New("cluster ID mismatch")
ErrClusterIDMismatch = errors.New("cluster ID mismatch")
)

type peerGetter interface {
Expand Down Expand Up @@ -508,7 +508,7 @@ func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, heade
zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
zap.String("remote-peer-cluster-id", gcid),
)
return errClusterIDMismatch
return ErrClusterIDMismatch
}
return nil
}
Expand Down
6 changes: 3 additions & 3 deletions server/etcdserver/api/rafthttp/stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,18 +648,18 @@ func (cr *streamReader) dial(t streamType) (io.ReadCloser, error) {
}
return nil, errIncompatibleVersion

case errClusterIDMismatch.Error():
case ErrClusterIDMismatch.Error():
if cr.lg != nil {
cr.lg.Warn(
"request sent was ignored by remote peer due to cluster ID mismatch",
zap.String("remote-peer-id", cr.peerID.String()),
zap.String("remote-peer-cluster-id", resp.Header.Get("X-Etcd-Cluster-ID")),
zap.String("local-member-id", cr.tr.ID.String()),
zap.String("local-member-cluster-id", cr.tr.ClusterID.String()),
zap.Error(errClusterIDMismatch),
zap.Error(ErrClusterIDMismatch),
)
}
return nil, errClusterIDMismatch
return nil, ErrClusterIDMismatch

default:
return nil, fmt.Errorf("unhandled error %q when precondition failed", string(b))
Expand Down
4 changes: 2 additions & 2 deletions server/etcdserver/api/rafthttp/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func checkPostResponse(lg *zap.Logger, resp *http.Response, body []byte, req *ht
)
}
return errIncompatibleVersion
case errClusterIDMismatch.Error():
case ErrClusterIDMismatch.Error():
if lg != nil {
lg.Error(
"request sent was ignored due to cluster ID mismatch",
Expand All @@ -103,7 +103,7 @@ func checkPostResponse(lg *zap.Logger, resp *http.Response, body []byte, req *ht
zap.String("local-member-cluster-id", req.Header.Get("X-Etcd-Cluster-ID")),
)
}
return errClusterIDMismatch
return ErrClusterIDMismatch
default:
return fmt.Errorf("unhandled error %q when precondition failed", string(body))
}
Expand Down
25 changes: 23 additions & 2 deletions server/etcdserver/corrupt.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
pb "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"go.etcd.io/etcd/client/pkg/v3/types"
"go.etcd.io/etcd/server/v3/etcdserver/api/rafthttp"
"go.etcd.io/etcd/server/v3/storage/mvcc"

"go.uber.org/zap"
Expand Down Expand Up @@ -149,6 +150,17 @@ func (cm *corruptionChecker) InitialCheck() error {
zap.Strings("remote-peer-endpoints", p.eps),
zap.Error(err),
)
case rpctypes.ErrClusterIdMismatch:
cm.lg.Warn(
"cluster ID mismatch",
zap.String("local-member-id", cm.hasher.MemberId().String()),
zap.Int64("local-member-revision", h.Revision),
zap.Int64("local-member-compact-revision", h.CompactRevision),
zap.Uint32("local-member-hash", h.Hash),
zap.String("remote-peer-id", p.id.String()),
zap.Strings("remote-peer-endpoints", p.eps),
zap.Error(err),
)
}
}
}
Expand Down Expand Up @@ -466,7 +478,7 @@ func (s *EtcdServer) getPeerHashKVs(rev int64) []*peerHashKVResp {
var lastErr error
for _, ep := range p.eps {
ctx, cancel := context.WithTimeout(context.Background(), s.Cfg.ReqTimeout())
resp, lastErr := HashByRev(ctx, cc, ep, rev)
resp, lastErr := HashByRev(ctx, s.cluster.ID(), cc, ep, rev)
cancel()
if lastErr == nil {
resps = append(resps, &peerHashKVResp{peerInfo: p, resp: resp, err: nil})
Expand Down Expand Up @@ -510,6 +522,10 @@ func (h *hashKVHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
http.Error(w, "bad path", http.StatusBadRequest)
return
}
if gcid := r.Header.Get("X-Etcd-Cluster-ID"); gcid != "" && gcid != h.server.cluster.ID().String() {
http.Error(w, rafthttp.ErrClusterIDMismatch.Error(), http.StatusPreconditionFailed)
return
}

defer r.Body.Close()
b, err := io.ReadAll(r.Body)
Expand Down Expand Up @@ -553,7 +569,7 @@ func (h *hashKVHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}

// HashByRev fetch hash of kv store at the given rev via http call to the given url
func HashByRev(ctx context.Context, cc *http.Client, url string, rev int64) (*pb.HashKVResponse, error) {
func HashByRev(ctx context.Context, cid types.ID, cc *http.Client, url string, rev int64) (*pb.HashKVResponse, error) {
hashReq := &pb.HashKVRequest{Revision: rev}
hashReqBytes, err := json.Marshal(hashReq)
if err != nil {
Expand All @@ -566,6 +582,7 @@ func HashByRev(ctx context.Context, cc *http.Client, url string, rev int64) (*pb
}
req = req.WithContext(ctx)
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-Etcd-Cluster-ID", cid.String())
req.Cancel = ctx.Done()

resp, err := cc.Do(req)
Expand All @@ -585,6 +602,10 @@ func HashByRev(ctx context.Context, cc *http.Client, url string, rev int64) (*pb
if strings.Contains(string(b), mvcc.ErrFutureRev.Error()) {
return nil, rpctypes.ErrFutureRev
}
} else if resp.StatusCode == http.StatusPreconditionFailed {
if strings.Contains(string(b), rafthttp.ErrClusterIDMismatch.Error()) {
return nil, rpctypes.ErrClusterIdMismatch
}
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("unknown error: %s", string(b))
Expand Down
119 changes: 119 additions & 0 deletions server/etcdserver/corrupt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,23 @@
package etcdserver

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/httptest"
"strconv"
"strings"
CaojiamingAlan marked this conversation as resolved.
Show resolved Hide resolved
"testing"
"time"

"go.uber.org/zap"

"go.etcd.io/etcd/server/v3/lease"
betesting "go.etcd.io/etcd/server/v3/storage/backend/testing"

"github.com/stretchr/testify/assert"
"go.uber.org/zap/zaptest"

Expand Down Expand Up @@ -86,6 +98,13 @@ func TestInitialCheck(t *testing.T) {
hasher: fakeHasher{hashByRevResponses: []hashByRev{{hash: mvcc.KeyValueHash{Hash: 1, CompactRevision: 1}}}, peerHashes: []*peerHashKVResp{{resp: &pb.HashKVResponse{Header: &pb.ResponseHeader{}, Hash: 2, CompactRevision: 2}}}},
expectActions: []string{"MemberId()", "ReqTimeout()", "HashByRev(0)", "PeerHashByRev(0)", "MemberId()", "MemberId()"},
},
{
name: "Cluster ID Mismatch does not fail CorruptionChecker.InitialCheck()",
hasher: fakeHasher{
peerHashes: []*peerHashKVResp{{err: rpctypes.ErrClusterIdMismatch}},
},
expectActions: []string{"MemberId()", "ReqTimeout()", "HashByRev(0)", "PeerHashByRev(0)", "MemberId()", "MemberId()"},
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
Expand Down Expand Up @@ -203,6 +222,13 @@ func TestPeriodicCheck(t *testing.T) {
expectActions: []string{"HashByRev(0)", "PeerHashByRev(0)", "ReqTimeout()", "LinearizableReadNotify()", "HashByRev(0)", "TriggerCorruptAlarm(88)"},
expectCorrupt: true,
},
{
name: "Cluster ID Mismatch does not fail CorruptionChecker.PeriodicCheck()",
hasher: fakeHasher{
peerHashes: []*peerHashKVResp{{err: rpctypes.ErrClusterIdMismatch}},
},
expectActions: []string{"HashByRev(0)", "PeerHashByRev(0)", "ReqTimeout()", "LinearizableReadNotify()", "HashByRev(0)"},
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
Expand Down Expand Up @@ -388,6 +414,14 @@ func TestCompactHashCheck(t *testing.T) {
},
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(1)", "MemberId()"},
},
{
name: "Cluster ID Mismatch does not fail CorruptionChecker.CompactHashCheck()",
hasher: fakeHasher{
hashes: []mvcc.KeyValueHash{{Revision: 1, CompactRevision: 1, Hash: 1}},
peerHashes: []*peerHashKVResp{{err: rpctypes.ErrClusterIdMismatch}},
},
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(1)", "MemberId()"},
},
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
Expand Down Expand Up @@ -473,3 +507,88 @@ func (f *fakeHasher) TriggerCorruptAlarm(memberId types.ID) {
f.actions = append(f.actions, fmt.Sprintf("TriggerCorruptAlarm(%d)", memberId))
f.alarmTriggered = true
}

func TestHashKVHandler(t *testing.T) {
var remoteClusterID = 111195
var localClusterID = 111196
var revision = 1

etcdSrv := &EtcdServer{}
etcdSrv.cluster = newTestCluster(t, nil)
etcdSrv.cluster.SetID(types.ID(localClusterID), types.ID(localClusterID))
be, _ := betesting.NewDefaultTmpBackend(t)
defer betesting.Close(t, be)
etcdSrv.kv = mvcc.New(zap.NewNop(), be, &lease.FakeLessor{}, mvcc.StoreConfig{})
ph := &hashKVHandler{
lg: zap.NewNop(),
server: etcdSrv,
}
srv := httptest.NewServer(ph)
defer srv.Close()

tests := []struct {
name string
remoteClusterID int
wcode int
wKeyWords string
}{
{
CaojiamingAlan marked this conversation as resolved.
Show resolved Hide resolved
name: "HashKV returns 200 if cluster hash matches",
remoteClusterID: localClusterID,
wcode: http.StatusOK,
wKeyWords: "",
},
{
name: "HashKV returns 400 if cluster hash doesn't matche",
remoteClusterID: remoteClusterID,
wcode: http.StatusPreconditionFailed,
wKeyWords: "cluster ID mismatch",
},
}
for i, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
hashReq := &pb.HashKVRequest{Revision: int64(revision)}
hashReqBytes, err := json.Marshal(hashReq)
if err != nil {
t.Fatalf("failed to marshal request: %v", err)
}
req, err := http.NewRequest(http.MethodGet, srv.URL+PeerHashKVPath, bytes.NewReader(hashReqBytes))
if err != nil {
t.Fatalf("failed to create request: %v", err)
}
req.Header.Set("X-Etcd-Cluster-ID", strconv.FormatUint(uint64(tt.remoteClusterID), 16))

resp, err := http.DefaultClient.Do(req)
if err != nil {
t.Fatalf("failed to get http response: %v", err)
}
body, err := io.ReadAll(resp.Body)
resp.Body.Close()
if err != nil {
t.Fatalf("unexpected io.ReadAll error: %v", err)
}
if resp.StatusCode != tt.wcode {
t.Fatalf("#%d: code = %d, want %d", i, resp.StatusCode, tt.wcode)
}
if resp.StatusCode != http.StatusOK {
if !strings.Contains(string(body), tt.wKeyWords) {
t.Errorf("#%d: body: %s, want body to contain keywords: %s", i, string(body), tt.wKeyWords)
}
return
}

hashKVResponse := pb.HashKVResponse{}
err = json.Unmarshal(body, &hashKVResponse)
if err != nil {
t.Fatalf("unmarshal response error: %v", err)
}
hashValue, _, err := etcdSrv.KV().HashStorage().HashByRev(int64(revision))
if err != nil {
t.Fatalf("etcd server hash failed: %v", err)
}
if hashKVResponse.Hash != hashValue.Hash {
t.Fatalf("hash value inconsistent: %d != %d", hashKVResponse.Hash, hashValue)
}
})
}
}
Loading
Loading