commit ab08dad9503873768dc673a1e594c7bddc58b01b from: Vladislav Shpilevoy date: Wed Sep 07 22:52:46 2022 UTC promote: abort it when become non-candidate box.ctl.promote() bumps the term, makes the node a candidate, and waits for the term outcome. The waiting used to last until a leader was elected, the node lost the connection quorum, or the term was bumped again. There was a bug where a node could hang in box.ctl.promote() even after it became a voter. It could happen if the quorum was still there and a leader couldn't be elected in the current term at all. For instance, others could have `election_mode='off'`. The fix is to stop waiting for the term outcome if the node can't win anyway. NO_DOC=bugfix commit - dd89c57e789a5fad0d6776d4236575b9adee891f commit + ab08dad9503873768dc673a1e594c7bddc58b01b blob - /dev/null blob + fcbcdd4614530e961df42aa7836694ec870ac7d3 (mode 644) --- /dev/null +++ changelogs/unreleased/promote-hang.md @@ -0,0 +1,4 @@ +## bugfix/raft + +* Fixed a bug when a node with `election_mode='voter'` could hang in + `box.ctl.promote()` or even become a leader. blob - de53e95244a6f4ea0283bb7975ea563542726015 blob + 3182fa02476d588e2fbf0c19d0cef2e64b7704d6 --- src/box/raft.c +++ src/box/raft.c @@ -437,8 +437,13 @@ box_raft_try_promote_f(struct trigger *trig, void *eve */ if (raft->volatile_term > ctx->term) goto done; - /* Instance does not participate in terms anymore. */ - if (!raft->is_enabled) + /* + * Instance does not participate in terms as a candidate anymore. Can + * happen not only if the node was a temporary candidate and the term + * was bumped, but also if it was reconfigured during the waiting or it + * lost the connection quorum. + */ + if (!raft->is_candidate) goto done; /* The term ended with a leader being found. 
*/ if (raft->leader != REPLICA_ID_NIL) @@ -469,11 +474,6 @@ box_raft_try_promote(void) }; trigger_create(&trig, box_raft_try_promote_f, &ctx, NULL); raft_on_update(raft, &trig); - /* - * XXX: it is not a good idea not to have a timeout here. If all nodes - * are voters, the term might never end with any result nor bump to a - * new value. - */ while (!fiber_is_cancelled() && !ctx.is_done) fiber_yield(); trigger_clear(&trig); @@ -493,7 +493,9 @@ box_raft_try_promote(void) diag_set(ClientError, ER_OLD_TERM, (unsigned long long)ctx.term, (unsigned long long)raft->volatile_term); } else { - assert(!raft->is_enabled); + assert(!raft->is_candidate); + assert(box_election_mode != ELECTION_MODE_MANUAL && + box_election_mode != ELECTION_MODE_CANDIDATE); diag_set(ClientError, ER_ELECTION_DISABLED); } raft_restore(raft); blob - 2e3f5e8018e29102014efc9dbb34074e4d355ad3 blob + 9403739af67a88dec67c426d516e07262041cf96 --- test/replication-luatest/gh_6033_box_promote_demote_test.lua +++ test/replication-luatest/gh_6033_box_promote_demote_test.lua @@ -86,6 +86,14 @@ local function wal_delay_start(server, countdown) end end +local function wal_delay_wait(server) + luatest.helpers.retrying({}, server.exec, server, function() + if not box.error.injection.get('ERRINJ_WAL_DELAY') then + error('WAL still is not blocked') + end + end) +end + local function wal_delay_end(server) server:exec(function() box.error.injection.set('ERRINJ_WAL_DELAY', false) @@ -97,6 +105,7 @@ local function cluster_init(g) g.box_cfg = { election_mode = 'off', + election_timeout = box.NULL, replication_timeout = 0.1, replication_synchro_timeout = 5, replication_synchro_quorum = 1, @@ -207,6 +216,37 @@ g_common.test_raft_leader_promote = function(g) box_cfg_update({g.server_1}, {election_mode = 'off'}) demote(g.server_1) +end + +-- +-- If a node stopped being a candidate, its box.ctl.promote() should abort right +-- away. 
+-- +g_common.test_voter_during_promote = function(g) + box_cfg_update({g.server_1, g.server_2}, { + election_mode = 'manual', + election_timeout = 1000, + replication_synchro_quorum = 2, + }) + wal_delay_start(g.server_1, 0) + local promote_fid = promote_start(g.server_2) + -- Server1 hangs on new term WAL write. + wal_delay_wait(g.server_1) + luatest.assert_equals(g.server_1:election_term(), + g.server_2:election_term()) + + -- Server2 should stop the promotion without waiting for the term outcome + -- because it no longer can win anyway. + box_cfg_update({g.server_2}, {election_mode = 'voter'}) + wal_delay_end(g.server_1) + fiber_join(g.server_2, promote_fid) + + -- Nobody won. + local function get_election_state_f() + return box.info.election.state + end + luatest.assert_equals(g.server_1:exec(get_election_state_f), 'follower') + luatest.assert_equals(g.server_2:exec(get_election_state_f), 'follower') end -- Promoting and demoting should work when everything is ok.