From 52613b262bef8a04d94269b6813fca93420c262e Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Thu, 27 Apr 2017 11:19:29 -0700 Subject: [PATCH 1/2] raft: Set the RecentActive flag for newly added nodes I found that enabling the CheckQuorum flag led to spurious leader elections when new nodes joined. It looks like in the time between a new node joining the cluster, and that node first communicating with the leader, the quorum check could fail because the new node looks inactive. To solve this, set the RecentActive flag when nodes are first added. This gives a grace period for the node to communicate before it causes the quorum check to fail. Signed-off-by: Aaron Lehmann --- raft/raft.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/raft/raft.go b/raft/raft.go index 7be4407ee2b..29f20398203 100644 --- a/raft/raft.go +++ b/raft/raft.go @@ -1159,6 +1159,10 @@ func (r *raft) addNode(id uint64) { } r.setProgress(id, 0, r.raftLog.lastIndex()+1) + // When a node is first added, we should mark it as recently active. + // Otherwise, CheckQuorum may cause us to step down if it is invoked + // before the added node has a chance to communicate with us. + r.prs[id].RecentActive = true } func (r *raft) removeNode(id uint64) { From 9451fa1f9c1ac5b9579e59fc8019ce49612dae05 Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Thu, 4 May 2017 14:58:05 -0700 Subject: [PATCH 2/2] raft: Add unit test TestAddNodeCheckQuorum This test verifies that adding a node does not cause the leader to step down until at least one full ElectionTick cycle elapses. Signed-off-by: Aaron Lehmann --- raft/raft_test.go | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/raft/raft_test.go b/raft/raft_test.go index 2dffe7acc6d..fae05b4deae 100644 --- a/raft/raft_test.go +++ b/raft/raft_test.go @@ -2565,6 +2565,41 @@ func TestAddNode(t *testing.T) { } } +// TestAddNodeCheckQuorum tests that addNode does not trigger a leader election +// immediately when checkQuorum is set. +func TestAddNodeCheckQuorum(t *testing.T) { + r := newTestRaft(1, []uint64{1}, 10, 1, NewMemoryStorage()) + r.pendingConf = true + r.checkQuorum = true + + r.becomeCandidate() + r.becomeLeader() + + for i := 0; i < r.electionTimeout-1; i++ { + r.tick() + } + + r.addNode(2) + + // This tick will reach electionTimeout, which triggers a quorum check. + r.tick() + + // Node 1 should still be the leader after a single tick. + if r.state != StateLeader { + t.Errorf("state = %v, want %v", r.state, StateLeader) + } + + // After another electionTimeout ticks without hearing from node 2, + // node 1 should step down. + for i := 0; i < r.electionTimeout; i++ { + r.tick() + } + + if r.state != StateFollower { + t.Errorf("state = %v, want %v", r.state, StateFollower) + } +} + // TestRemoveNode tests that removeNode could update pendingConf, nodes and // and removed list correctly. func TestRemoveNode(t *testing.T) {