commit - dd89c57e789a5fad0d6776d4236575b9adee891f
commit + ab08dad9503873768dc673a1e594c7bddc58b01b
blob - /dev/null
blob + fcbcdd4614530e961df42aa7836694ec870ac7d3 (mode 644)
--- /dev/null
+++ changelogs/unreleased/promote-hang.md
+## bugfix/raft
+
+* Fixed a bug where a node with `election_mode='voter'` could hang in
+ `box.ctl.promote()` or even become a leader.
blob - de53e95244a6f4ea0283bb7975ea563542726015
blob + 3182fa02476d588e2fbf0c19d0cef2e64b7704d6
--- src/box/raft.c
+++ src/box/raft.c
*/
if (raft->volatile_term > ctx->term)
goto done;
- /* Instance does not participate in terms anymore. */
- if (!raft->is_enabled)
+ /*
+ * Instance does not participate in terms as a candidate anymore. This can
+ * happen not only if the node was a temporary candidate and the term
+ * was bumped, but also if it was reconfigured while waiting or if it
+ * lost the connection quorum.
+ */
+ if (!raft->is_candidate)
goto done;
/* The term ended with a leader being found. */
if (raft->leader != REPLICA_ID_NIL)
};
trigger_create(&trig, box_raft_try_promote_f, &ctx, NULL);
raft_on_update(raft, &trig);
- /*
- * XXX: it is not a good idea not to have a timeout here. If all nodes
- * are voters, the term might never end with any result nor bump to a
- * new value.
- */
while (!fiber_is_cancelled() && !ctx.is_done)
fiber_yield();
trigger_clear(&trig);
diag_set(ClientError, ER_OLD_TERM, (unsigned long long)ctx.term,
(unsigned long long)raft->volatile_term);
} else {
- assert(!raft->is_enabled);
+ assert(!raft->is_candidate);
+ assert(box_election_mode != ELECTION_MODE_MANUAL &&
+ box_election_mode != ELECTION_MODE_CANDIDATE);
diag_set(ClientError, ER_ELECTION_DISABLED);
}
raft_restore(raft);
blob - 2e3f5e8018e29102014efc9dbb34074e4d355ad3
blob + 9403739af67a88dec67c426d516e07262041cf96
--- test/replication-luatest/gh_6033_box_promote_demote_test.lua
+++ test/replication-luatest/gh_6033_box_promote_demote_test.lua
end
end
+local function wal_delay_wait(server)
+ luatest.helpers.retrying({}, server.exec, server, function()
+ if not box.error.injection.get('ERRINJ_WAL_DELAY') then
+ error('WAL still is not blocked')
+ end
+ end)
+end
+
local function wal_delay_end(server)
server:exec(function()
box.error.injection.set('ERRINJ_WAL_DELAY', false)
g.box_cfg = {
election_mode = 'off',
+ election_timeout = box.NULL,
replication_timeout = 0.1,
replication_synchro_timeout = 5,
replication_synchro_quorum = 1,
box_cfg_update({g.server_1}, {election_mode = 'off'})
demote(g.server_1)
+end
+
+--
+-- If a node stopped being a candidate, its box.ctl.promote() should abort right
+-- away.
+--
+g_common.test_voter_during_promote = function(g)
+ box_cfg_update({g.server_1, g.server_2}, {
+ election_mode = 'manual',
+ election_timeout = 1000,
+ replication_synchro_quorum = 2,
+ })
+ wal_delay_start(g.server_1, 0)
+ local promote_fid = promote_start(g.server_2)
+ -- Server1 hangs on new term WAL write.
+ wal_delay_wait(g.server_1)
+ luatest.assert_equals(g.server_1:election_term(),
+ g.server_2:election_term())
+
+ -- Server2 should stop the promotion without waiting for the term outcome
+ -- because it can no longer win anyway.
+ box_cfg_update({g.server_2}, {election_mode = 'voter'})
+ wal_delay_end(g.server_1)
+ fiber_join(g.server_2, promote_fid)
+
+ -- Nobody won.
+ local function get_election_state_f()
+ return box.info.election.state
+ end
+ luatest.assert_equals(g.server_1:exec(get_election_state_f), 'follower')
+ luatest.assert_equals(g.server_2:exec(get_election_state_f), 'follower')
end
-- Promoting and demoting should work when everything is ok.