From 05fbb566b9a342b881954c4c1086faecd7d670af Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Mon, 12 Aug 2019 13:51:18 +0200 Subject: [PATCH] storage: avoid RaftGroupDeletedError from RHS in splitTrigger The right hand side of a split can be readded before the split trigger fires, in which case the split trigger fails. See [bug description]. I [suggested] a test to reprduce this bug "properly", so we should look into that. In the meantime, it'll be good to see that this passes tests. I verified manually that setting `minReplicaID` to some large number before the call to `rightRng.initRaftMuLockedReplicaMuLocked` reproduces the symptoms prior to this commit, but that doesn't come as a surprise nor does it prove that the fix works flawlessly. [bug description]: https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329 [suggested]: https://github.com/cockroachdb/cockroach/pull/39034#issuecomment-520371326 Fixes #21146. Release note (bug fix): Fixed a rare panic (message: "raft group deleted") that could occur during splits. --- pkg/storage/store.go | 47 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 05a276f6b239..a0414e91cd59 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -2134,8 +2134,44 @@ func splitPostApply( log.Fatalf(ctx, "unable to find RHS replica: %+v", err) } { - rightRng.mu.Lock() // Already holding raftMu, see above. + rightRng.mu.Lock() + // The right hand side of the split may have been removed and re-added + // in the meantime, and the replicaID in RightDesc may be stale. + // Consequently the call below may fail with a RaftGroupDeletedError. In + // general, this protects earlier incarnations of the replica that were + // since replicaGC'ed from reneging on promises made earlier (for + // example, once the HardState is removed, a replica could cast a + // different vote for the same term). + // + // It is safe to circumvent that in this case because the RHS must have + // remained uninitialized (the LHS blocks all user data, and since our + // Raft logs start at a nonzero index a snapshot must go through before + // any log entries are appended). This means the data in that range is + // just a HardState which we "logically" snapshot by assigning it data + // formerly located within the LHS. + // + // Note that if we ever have a way to replicaGC uninitialized replicas, + // the RHS may have been gc'ed and so the HardState would be gone. In + // that case, the requirement that the HardState remains would have been + // violated if the bypass below were used, which is why we place an + // assertion. + // + // See: + // https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329 + tombstoneKey := keys.RaftTombstoneKey(rightRng.RangeID) + var tombstone roachpb.RaftTombstone + if ok, err := engine.MVCCGetProto( + ctx, r.store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, + ); err != nil { + log.Fatalf(ctx, "unable to load tombstone for RHS: %+v", err) + } else if ok { + log.Fatalf(ctx, "split trigger found right-hand side with tombstone %+v: %v", tombstone, rightRng) + } + // NB: the safety argument above implies that we don't have to worry + // about restoring the existing minReplicaID if it's nonzero. No + // promises have been made, so none need to be kept. + rightRng.mu.minReplicaID = 0 err := rightRng.initRaftMuLockedReplicaMuLocked(&split.RightDesc, r.store.Clock(), 0) rightRng.mu.Unlock() if err != nil { @@ -2536,6 +2572,15 @@ func (s *Store) removeReplicaImpl( return err } + if !rep.IsInitialized() { + // The split trigger relies on the fact that it can bypass the tombstone + // check for the RHS, but this is only true as long as we never delete + // its HardState. + // + // See the comment in splitPostApply for details. + log.Fatalf(ctx, "can not replicaGC uninitialized replicas") + } + // During merges, the context might have the subsuming range, so we explicitly // log the replica to be removed. log.Infof(ctx, "removing replica r%d/%d", rep.RangeID, replicaID)