Commit Diff


commit - dd89c57e789a5fad0d6776d4236575b9adee891f
commit + ab08dad9503873768dc673a1e594c7bddc58b01b
blob - /dev/null
blob + fcbcdd4614530e961df42aa7836694ec870ac7d3 (mode 644)
--- /dev/null
+++ changelogs/unreleased/promote-hang.md
@@ -0,0 +1,4 @@
+## bugfix/raft
+
+* Fixed a bug where a node with `election_mode='voter'` could hang in
+  `box.ctl.promote()` or even become a leader.
blob - de53e95244a6f4ea0283bb7975ea563542726015
blob + 3182fa02476d588e2fbf0c19d0cef2e64b7704d6
--- src/box/raft.c
+++ src/box/raft.c
@@ -437,8 +437,13 @@ box_raft_try_promote_f(struct trigger *trig, void *eve
 	 */
 	if (raft->volatile_term > ctx->term)
 		goto done;
-	/* Instance does not participate in terms anymore. */
-	if (!raft->is_enabled)
+	/*
+	 * Instance does not participate in terms as a candidate anymore. Can
+	 * happen not only if the node was a temporary candidate and the term
+	 * was bumped, but also if it was reconfigured while waiting or if it
+	 * lost the connection quorum.
+	 */
+	if (!raft->is_candidate)
 		goto done;
 	/* The term ended with a leader being found. */
 	if (raft->leader != REPLICA_ID_NIL)
@@ -469,11 +474,6 @@ box_raft_try_promote(void)
 	};
 	trigger_create(&trig, box_raft_try_promote_f, &ctx, NULL);
 	raft_on_update(raft, &trig);
-	/*
-	 * XXX: it is not a good idea not to have a timeout here. If all nodes
-	 * are voters, the term might never end with any result nor bump to a
-	 * new value.
-	 */
 	while (!fiber_is_cancelled() && !ctx.is_done)
 		fiber_yield();
 	trigger_clear(&trig);
@@ -493,7 +493,9 @@ box_raft_try_promote(void)
 		diag_set(ClientError, ER_OLD_TERM, (unsigned long long)ctx.term,
 			 (unsigned long long)raft->volatile_term);
 	} else {
-		assert(!raft->is_enabled);
+		assert(!raft->is_candidate);
+		assert(box_election_mode != ELECTION_MODE_MANUAL &&
+		       box_election_mode != ELECTION_MODE_CANDIDATE);
 		diag_set(ClientError, ER_ELECTION_DISABLED);
 	}
 	raft_restore(raft);
blob - 2e3f5e8018e29102014efc9dbb34074e4d355ad3
blob + 9403739af67a88dec67c426d516e07262041cf96
--- test/replication-luatest/gh_6033_box_promote_demote_test.lua
+++ test/replication-luatest/gh_6033_box_promote_demote_test.lua
@@ -86,6 +86,14 @@ local function wal_delay_start(server, countdown)
     end
 end
 
+local function wal_delay_wait(server)
+    luatest.helpers.retrying({}, server.exec, server, function()
+        if not box.error.injection.get('ERRINJ_WAL_DELAY') then
+            error('WAL still is not blocked')
+        end
+    end)
+end
+
 local function wal_delay_end(server)
     server:exec(function()
         box.error.injection.set('ERRINJ_WAL_DELAY', false)
@@ -97,6 +105,7 @@ local function cluster_init(g)
 
     g.box_cfg = {
         election_mode = 'off',
+        election_timeout = box.NULL,
         replication_timeout = 0.1,
         replication_synchro_timeout = 5,
         replication_synchro_quorum = 1,
@@ -207,6 +216,37 @@ g_common.test_raft_leader_promote = function(g)
 
     box_cfg_update({g.server_1}, {election_mode = 'off'})
     demote(g.server_1)
+end
+
+--
+-- If a node stops being a candidate during box.ctl.promote(), the call should
+-- abort right away.
+--
+g_common.test_voter_during_promote = function(g)
+    box_cfg_update({g.server_1, g.server_2}, {
+        election_mode = 'manual',
+        election_timeout = 1000,
+        replication_synchro_quorum = 2,
+    })
+    wal_delay_start(g.server_1, 0)
+    local promote_fid = promote_start(g.server_2)
+    -- Server1 hangs on the WAL write of the new term.
+    wal_delay_wait(g.server_1)
+    luatest.assert_equals(g.server_1:election_term(),
+                          g.server_2:election_term())
+
+    -- Server2 should stop the promotion without waiting for the term outcome
+    -- because it can no longer win anyway.
+    box_cfg_update({g.server_2}, {election_mode = 'voter'})
+    wal_delay_end(g.server_1)
+    fiber_join(g.server_2, promote_fid)
+
+    -- Nobody won.
+    local function get_election_state_f()
+        return box.info.election.state
+    end
+    luatest.assert_equals(g.server_1:exec(get_election_state_f), 'follower')
+    luatest.assert_equals(g.server_2:exec(get_election_state_f), 'follower')
 end
 
 -- Promoting and demoting should work when everything is ok.