commit - ec6281002f3b6ffa7735f99199bfba4e6d880c70
commit + c9155ac8636264ce6d1b10a2937a6ad3844db50a
blob - /dev/null
blob + 37365ac053a818f041a8d8ba1b93ed17d71c8d4f (mode 644)
--- /dev/null
+++ changelogs/unreleased/gh-7253-split-brain-early-vote.md
+## bugfix/raft
+
+* Fixed a bug where a replicaset could be split into parts if a node, during
+ elections, voted for another instance while having some local WAL writes
+ still unfinished (gh-7253).
blob - 453c5080484654d9ecce374a8699f4f5ca72c71d
blob + 1fe342a6a12ed65aeb706cbb061c46733015de51
--- src/lib/raft/raft.c
+++ src/lib/raft/raft.c
static inline bool
raft_can_vote_for(const struct raft *raft, const struct vclock *v)
{
+ assert(vclock_is_set(v));
int cmp = vclock_compare_ignore0(v, raft->vclock);
return cmp == 0 || cmp == 1;
}
return true;
}
+/**
+ * Vote can be revoked if it was volatile (not yet in WAL) and thus wasn't shown
+ * to any other instance yet.
+ */
+static void
+raft_revoke_vote(struct raft *raft)
+{
+ assert(raft->volatile_vote != 0);
+ assert(raft->vote == 0);
+ struct raft_vote *v = &raft->votes[raft->self];
+ assert(v->did_vote);
+ v->did_vote = false;
+ assert(raft->voted_count > 0);
+ --raft->voted_count;
+ v = &raft->votes[raft->volatile_vote];
+ assert(v->count > 0);
+ bool was_max = v->count == raft->max_vote;
+ --v->count;
+ if (was_max) {
+ --raft->max_vote;
+ for (int i = 0; i < VCLOCK_MAX; ++i) {
+ v = &raft->votes[i];
+ if (v->count > raft->max_vote)
+ raft->max_vote = v->count;
+ }
+ }
+ raft->volatile_vote = 0;
+ vclock_clear(&raft->candidate_vclock);
+}
+
static bool
raft_has_split_vote(const struct raft *raft)
{
static void
raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term);
-/** Bump volatile vote and schedule its flush to disk. */
+/**
+ * Set a volatile vote for the given candidate and schedule its flush to disk
+ * if the vclock is still acceptable for the current instance by that time.
+ */
static void
-raft_sm_schedule_new_vote(struct raft *raft, uint32_t new_vote);
+raft_sm_schedule_new_vote(struct raft *raft, uint32_t candidate_id,
+ const struct vclock *candidate_vclock);
+
+/** Try to schedule a vote for the given candidate if applicable. */
+static void
+raft_sm_try_new_vote(struct raft *raft, uint32_t candidate_id,
+ const struct vclock *candidate_vclock);
/**
* Bump term and vote for self immediately. After that is persisted, the
"already voted in this term");
break;
}
- /* Vclock is not NULL, validated above. */
- if (!raft_can_vote_for(raft, req->vclock)) {
- say_info("RAFT: vote request is skipped - the "
- "vclock is not acceptable");
- break;
- }
- /*
- * Either the term is new, or didn't vote in the current
- * term yet. Anyway can vote now.
- */
- raft_sm_schedule_new_vote(raft, req->vote);
+ raft_sm_try_new_vote(raft, req->vote, req->vclock);
break;
case RAFT_STATE_CANDIDATE:
/* Check if this is a vote for a competing candidate. */
raft_sm_wait_election_end(raft);
} else {
/* No leaders, no votes. */
- raft_sm_schedule_new_vote(raft, raft->self);
+ raft_sm_schedule_new_vote(raft, raft->self,
+ raft->vclock);
}
}
} else {
memset(&req, 0, sizeof(req));
assert(raft->volatile_term >= raft->term);
- req.term = raft->volatile_term;
+ if (raft->volatile_vote == 0)
+ goto do_dump;
+ /*
+ * Vote and term bumps are persisted separately. This serves as
+ * a flush of all transactions going to WAL right now so that the
+ * current node can correctly compare its own vclock vs
+ * candidate's one. Otherwise the local vclock can be <=
+ * candidate's now, but that can change after the WAL queue is
+ * flushed.
+ */
+ if (raft->volatile_term > raft->term)
+ goto do_dump;
+ /*
+ * Skip self. When vote was issued, own vclock could be smaller,
+ * but that doesn't matter. Can always vote for self. Not having
+ * this special case still works if the node is configured as a
+ * candidate, but the node might log that it canceled a vote for
+ * self, which is confusing.
+ */
+ if (raft->volatile_vote == raft->self)
+ goto do_dump_with_vote;
+ if (!raft_can_vote_for(raft, &raft->candidate_vclock)) {
+ say_info("RAFT: vote request for %u is canceled - the "
+ "vclock is not acceptable anymore",
+ raft->volatile_vote);
+ raft_revoke_vote(raft);
+ assert(raft_is_fully_on_disk(raft));
+ goto end_dump;
+ }
+do_dump_with_vote:
req.vote = raft->volatile_vote;
+do_dump:
+ req.term = raft->volatile_term;
/*
* Skip vclock. It is used only to be sent to network when vote
* for self. It is a job of the vclock owner to persist it
raft->volatile_term = new_term;
/* New terms means completely new Raft state. */
raft->volatile_vote = 0;
+ vclock_clear(&raft->candidate_vclock);
if (raft->leader == raft->self) {
/*
* Update leader_last_seen when resigning so that leader_idle
}
static void
-raft_sm_schedule_new_vote(struct raft *raft, uint32_t new_vote)
+raft_sm_schedule_new_vote(struct raft *raft, uint32_t candidate_id,
+ const struct vclock *candidate_vclock)
{
- say_info("RAFT: vote for %u, follow", new_vote);
+ say_info("RAFT: vote for %u, follow", candidate_id);
+ assert(raft_can_vote_for(raft, candidate_vclock));
assert(raft->volatile_vote == 0);
+ assert(!vclock_is_set(&raft->candidate_vclock));
assert(raft->leader == 0);
assert(raft->state == RAFT_STATE_FOLLOWER);
assert(!raft->votes[raft->self].did_vote);
- raft->volatile_vote = new_vote;
+ raft->volatile_vote = candidate_id;
+ vclock_copy(&raft->candidate_vclock, candidate_vclock);
raft_add_vote(raft, raft->self, raft->self);
raft_sm_pause_and_dump(raft);
/* Nothing visible is changed - no broadcast. */
+}
+
+static void
+raft_sm_try_new_vote(struct raft *raft, uint32_t candidate_id,
+ const struct vclock *candidate_vclock)
+{
+ if (!raft_can_vote_for(raft, candidate_vclock)) {
+ assert(candidate_id != raft->self);
+ say_info("RAFT: vote request for %u is skipped - the vclock "
+ "is not acceptable", candidate_id);
+ return;
+ }
+ raft_sm_schedule_new_vote(raft, candidate_id, candidate_vclock);
}
static void
assert(raft->is_candidate);
/* Everyone is a follower until its vote for self is persisted. */
raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
- raft_sm_schedule_new_vote(raft, raft->self);
+ raft_sm_schedule_new_vote(raft, raft->self, raft->vclock);
}
static void
blob - 68aaf50aaf27f17a5d011313bd7634b9930a8ec0
blob + 5d21510a922e7e6c5eb22c00bd466fbf9c90b711
--- src/lib/raft/raft.h
+++ src/lib/raft/raft.h
* subsystems, such as Raft.
*/
const struct vclock *vclock;
+ /**
+ * Vclock of the candidate which the current instance is trying to vote
+	 * for right now. It is used to double-check if the instance can still
+	 * vote for the given candidate after its own WAL queue was flushed.
+ */
+ struct vclock candidate_vclock;
/** State machine timed event trigger. */
struct ev_timer timer;
/** The moment of the last communication with the leader. */
blob - 382aeef6099381866385fc45a5d015b79c3f195c
blob + 744297f4f8e03277597e51a26138ecc1c419662b
--- test/replication/election_basic.result
+++ test/replication/election_basic.result
| ---
| ...
-test_run:wait_cond(function() return #election_tbl == 3 end)
+test_run:wait_cond(function() return #election_tbl == 4 end)
| ---
| - true
| ...
| ---
| - true
| ...
-assert(election_tbl[2].state == 'candidate')
+assert(election_tbl[2].state == 'follower')
+ | ---
+ | - true
+ | ...
+assert(election_tbl[2].term > election_tbl[1].term)
| ---
| - true
| ...
+-- Vote is visible here already, but it is volatile.
assert(election_tbl[2].vote == 1)
| ---
| - true
| ...
-assert(election_tbl[3].state == 'leader')
+assert(election_tbl[3].state == 'candidate')
+ | ---
+ | - true
+ | ...
+assert(election_tbl[3].vote == 1)
| ---
| - true
| ...
+assert(election_tbl[4].state == 'leader')
+ | ---
+ | - true
+ | ...
box.cfg{election_mode='voter'}
| ---
| ...
-test_run:wait_cond(function() return #election_tbl == 4 end)
+test_run:wait_cond(function() return #election_tbl == 5 end)
| ---
| - true
| ...
-assert(election_tbl[4].state == 'follower')
+assert(election_tbl[5].state == 'follower')
| ---
| - true
| ...
box.cfg{election_mode='off'}
| ---
| ...
-test_run:wait_cond(function() return #election_tbl == 5 end)
+test_run:wait_cond(function() return #election_tbl == 6 end)
| ---
| - true
| ...
box.cfg{election_mode='manual'}
| ---
| ...
-test_run:wait_cond(function() return #election_tbl == 6 end)
+test_run:wait_cond(function() return #election_tbl == 7 end)
| ---
| - true
| ...
-assert(election_tbl[6].state == 'follower')
+assert(election_tbl[7].state == 'follower')
| ---
| - true
| ...
| ---
| ...
-test_run:wait_cond(function() return #election_tbl == 9 end)
+test_run:wait_cond(function() return #election_tbl == 10 end)
| ---
| - true
| ...
-assert(election_tbl[7].state == 'follower')
+assert(election_tbl[8].state == 'follower')
| ---
| - true
| ...
-assert(election_tbl[7].term == election_tbl[6].term + 1)
+assert(election_tbl[8].term == election_tbl[7].term + 1)
| ---
| - true
| ...
-assert(election_tbl[8].state == 'candidate')
+-- Vote is visible here already, but it is volatile.
+assert(election_tbl[8].vote == 1)
| ---
| - true
| ...
-assert(election_tbl[9].state == 'leader')
+assert(election_tbl[9].state == 'candidate')
| ---
| - true
| ...
+assert(election_tbl[9].vote == 1)
+ | ---
+ | - true
+ | ...
+assert(election_tbl[10].state == 'leader')
+ | ---
+ | - true
+ | ...
test_run:cmd('stop server replica')
| ---
blob - 47f3d318eab4f99ac6146fb7fdd1a3eecdc80d80
blob + a1a0aa678102293ba12c9c58c0c1168286e7638d
--- test/replication/election_basic.test.lua
+++ test/replication/election_basic.test.lua
box.cfg{replication_synchro_quorum=2}
box.cfg{election_mode='candidate'}
-test_run:wait_cond(function() return #election_tbl == 3 end)
+test_run:wait_cond(function() return #election_tbl == 4 end)
assert(election_tbl[1].state == 'follower')
-assert(election_tbl[2].state == 'candidate')
+assert(election_tbl[2].state == 'follower')
+assert(election_tbl[2].term > election_tbl[1].term)
+-- Vote is visible here already, but it is volatile.
assert(election_tbl[2].vote == 1)
-assert(election_tbl[3].state == 'leader')
+assert(election_tbl[3].state == 'candidate')
+assert(election_tbl[3].vote == 1)
+assert(election_tbl[4].state == 'leader')
box.cfg{election_mode='voter'}
-test_run:wait_cond(function() return #election_tbl == 4 end)
-assert(election_tbl[4].state == 'follower')
+test_run:wait_cond(function() return #election_tbl == 5 end)
+assert(election_tbl[5].state == 'follower')
box.cfg{election_mode='off'}
-test_run:wait_cond(function() return #election_tbl == 5 end)
+test_run:wait_cond(function() return #election_tbl == 6 end)
box.cfg{election_mode='manual'}
-test_run:wait_cond(function() return #election_tbl == 6 end)
-assert(election_tbl[6].state == 'follower')
+test_run:wait_cond(function() return #election_tbl == 7 end)
+assert(election_tbl[7].state == 'follower')
box.ctl.promote()
-test_run:wait_cond(function() return #election_tbl == 9 end)
-assert(election_tbl[7].state == 'follower')
-assert(election_tbl[7].term == election_tbl[6].term + 1)
-assert(election_tbl[8].state == 'candidate')
-assert(election_tbl[9].state == 'leader')
+test_run:wait_cond(function() return #election_tbl == 10 end)
+assert(election_tbl[8].state == 'follower')
+assert(election_tbl[8].term == election_tbl[7].term + 1)
+-- Vote is visible here already, but it is volatile.
+assert(election_tbl[8].vote == 1)
+assert(election_tbl[9].state == 'candidate')
+assert(election_tbl[9].vote == 1)
+assert(election_tbl[10].state == 'leader')
test_run:cmd('stop server replica')
test_run:cmd('delete server replica')
blob - 6ea5208ebd74a83407b6e035e0fafe700279f964
blob + eb55a436a9c50129d081dcb095cd5a04450c616b
--- test/replication-luatest/gh_7253_election_long_wal_write_test.lua
+++ test/replication-luatest/gh_7253_election_long_wal_write_test.lua
local function wait_fullmesh(g)
wait_pair_sync(g.server1, g.server2)
+ wait_pair_sync(g.server2, g.server3)
+ wait_pair_sync(g.server3, g.server1)
end
local function block_next_wal_write_f()
box.error.injection.set('ERRINJ_WAL_DELAY', false)
end
+local function get_election_state_f()
+ return box.info.election.state
+end
+
local function server_block_next_wal_write(server)
server:exec(block_next_wal_write_f)
end
server:exec(unblock_wal_write_f)
end
+local function server_get_election_state(server)
+ return server:exec(get_election_state_f)
+end
+
local function check_wal_is_blocked_f()
if not box.error.injection.get('ERRINJ_WAL_DELAY') then
error('WAL still is not paused')
replication = {
server.build_instance_uri('server1'),
server.build_instance_uri('server2'),
+ server.build_instance_uri('server3'),
},
}
box_cfg.election_mode = 'manual'
g.server2 = g.cluster:build_and_add_server({
alias = 'server2', box_cfg = box_cfg
})
+ g.server3 = g.cluster:build_and_add_server({
+ alias = 'server3', box_cfg = box_cfg
+ })
g.cluster:start()
g.server1:exec(function()
g.cluster:drop()
g.server1 = nil
g.server2 = nil
+ g.server3 = nil
end)
g.test_fence_during_confirm_wal_write = function(g)
+ -- Prevent server3 intervention.
+ server_block_next_wal_write(g.server3)
--
-- Server1 starts a txn.
--
election_timeout = box.NULL,
}
end)
+ server_wait_wal_is_blocked(g.server3)
+ server_unblock_wal_write(g.server3)
g.server1:exec(function()
box.ctl.promote()
end)
wait_fullmesh(g)
end
+
+g.test_vote_during_txn_wal_write = function(g)
+ --
+ -- Make the following topology:
+ -- server1 <-> server2 <-> server3
+ --
+ g.server1:exec(function()
+ local replication = table.copy(box.cfg.replication)
+ rawset(_G, 'old_replication', table.copy(replication))
+ table.remove(replication, 3)
+ box.cfg{replication = replication}
+ end)
+ g.server3:exec(function()
+ local replication = table.copy(box.cfg.replication)
+ rawset(_G, 'old_replication', table.copy(replication))
+ table.remove(replication, 1)
+ box.cfg{replication = replication}
+ end)
+ --
+ -- Server2 gets a foreign txn going to WAL too long.
+ --
+ server_block_next_wal_write(g.server2)
+ local f = fiber.new(g.server1.exec, g.server1, function()
+ box.space.test:replace({1})
+ end)
+ f:set_joinable(true)
+ server_wait_wal_is_blocked(g.server2)
+ --
+ -- Server3 tries to become a leader by requesting a vote from server2.
+ --
+ local term = g.server2:election_term()
+ fiber.create(g.server3.exec, g.server3, function()
+ box.cfg{
+ election_mode = 'manual',
+ election_timeout = 1000,
+ }
+ pcall(box.ctl.promote)
+ end)
+ g.server2:wait_election_term(term + 1)
+ --
+ -- Server2 shouldn't have persisted a vote yet. Instead, when it finishes
+ -- the txn WAL write, it sees that its vclock is > server3's one and it
+ -- cancels the vote.
+ --
+ server_unblock_wal_write(g.server2)
+ --
+ -- Server1 gets the new term via server2.
+ --
+ g.server1:wait_election_term(term + 1)
+ g.server3:wait_vclock_of(g.server2)
+ t.assert_equals(server_get_election_state(g.server1), 'follower')
+ t.assert_equals(server_get_election_state(g.server2), 'follower')
+ t.assert_not_equals(server_get_election_state(g.server3), 'leader')
+ -- Restore server3 original params.
+ g.server3:exec(function()
+ box.cfg{
+ election_mode = 'voter',
+ election_timeout = box.NULL,
+ }
+ end)
+ -- Restore server1 leadership in the new term.
+ g.server1:exec(function()
+ box.ctl.promote()
+ end)
+ --
+ -- Server1's txn ends fine. Server3 wasn't able to roll it back via own
+ -- PROMOTE.
+ --
+ local ok, err = f:join(wait_timeout)
+ t.assert_equals(err, nil)
+ t.assert(ok)
+ t.assert(g.server1:exec(function()
+ return box.space.test:get{1} ~= nil
+ end))
+ --
+ -- Cleanup.
+ --
+ g.server3:exec(function()
+ box.cfg{replication = _G.old_replication}
+ _G.old_replication = nil
+ end)
+ g.server1:exec(function()
+ box.cfg{replication = _G.old_replication}
+ _G.old_replication = nil
+ end)
+ wait_fullmesh(g)
+end
blob - b89198335d74eab5eff7d3dc0496214586de8a7b
blob + b63183e888151b9c1f28e5385e73822cbf35727f
--- test/unit/raft.c
+++ test/unit/raft.c
static void
raft_test_leader_election(void)
{
- raft_start_test(24);
+ raft_start_test(26);
struct raft_node node;
raft_node_create(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "elections with a new term");
is(raft_vote_count(&node.raft), 1, "single vote for self");
ok(node.update_count > 0, "trigger worked");
/* Check if all async work is done properly. */
- is(node.journal.size, 1, "1 record in the journal");
- ok(raft_node_journal_check_row(&node,
+ is(node.journal.size, 2, "2 records in the journal");
+ ok(raft_node_journal_check_row(
+ &node,
0 /* Index. */,
2 /* Term. */,
+ 0 /* Vote. */
+ ), "term is on disk");
+ ok(raft_node_journal_check_row(
+ &node,
+ 1 /* Index. */,
+ 2 /* Term. */,
1 /* Vote. */
- ), "term and vote are on disk");
+ ), "vote is on disk");
- is(node.net.count, 1, "1 pending message");
+ is(node.net.count, 2, "2 pending messages");
ok(raft_node_net_check_msg(&node,
0 /* Index. */,
+ RAFT_STATE_FOLLOWER /* State. */,
+ 2 /* Term. */,
+ 0 /* Vote. */,
+ NULL /* Vclock. */
+ ), "term bump is sent");
+ ok(raft_node_net_check_msg(
+ &node,
+ 1 /* Index. */,
RAFT_STATE_CANDIDATE /* State. */,
2 /* Term. */,
1 /* Vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "vote request is sent");
raft_node_net_drop(&node);
/* New leader should do a broadcast when elected. */
ok(!node.has_work, "no work - broadcast should be done");
- is(node.journal.size, 1, "no new rows in the journal - state change "
+ is(node.journal.size, 2, "no new rows in the journal - state change "
"is not persisted");
is(node.net.count, 1, "1 pending message");
ok(raft_node_net_check_msg(&node,
RAFT_STATE_CANDIDATE /* State. */,
2 /* Term. */,
1 /* Vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "remote checkpoint of a candidate");
raft_checkpoint_local(&node.raft, &msg);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "election is finished");
/* Leader's checkpoint. */
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "restart always as a follower");
is(raft_vote_count(&node.raft), 1, "vote count is restored correctly");
2 /* Vote. */,
2 /* Volatile term. */,
2 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "voted for 2");
is(raft_node_send_vote_request(&node,
2 /* Vote. */,
2 /* Volatile term. */,
2 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "still kept vote for 2");
/* If the candidate didn't become a leader, start own election. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "became candidate");
raft_node_destroy(&node);
0 /* Vote. */,
3 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "term bump to be able to vote again");
is(raft_node_send_vote_request(&node,
3 /* Term. */,
0 /* Vote. */,
4 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "term is bumped, but vote request is ignored");
raft_node_cfg_is_enabled(&node, true);
0 /* Vote. */,
4 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 3, 1: 5, 2: 5}" /* Vclock. */
+ "{0: 4, 1: 5, 2: 5}" /* Vclock. */
), "vclock is bumped");
is(raft_node_send_vote_request(&node,
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "term is bumped and became candidate");
raft_node_destroy(&node);
}
static void
+raft_test_vote_during_wal_write(void)
+{
+ raft_start_test(11);
+ struct raft_node node;
+
+ /*
+ * Vote request from another node causes WAL flush before the current
+ * node can make a vote decision.
+ */
+
+ raft_node_create(&node);
+ /*
+ * Server1 wins elections in the current term.
+ */
+ raft_run_next_event();
+ is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate");
+ is(raft_node_send_vote_response(
+ &node,
+ 2 /* Term. */,
+ 1 /* Vote. */,
+ 2 /* Source. */
+ ), 0, "vote response from 2");
+ is(raft_node_send_vote_response(
+ &node,
+ 2 /* Term. */,
+ 1 /* Vote. */,
+ 3 /* Source. */
+ ), 0, "vote response from 3");
+ raft_node_journal_follow(&node, 1, 3);
+ raft_node_journal_follow(&node, 2, 5);
+ ok(raft_node_check_full_state(
+ &node,
+ RAFT_STATE_LEADER /* State. */,
+ 1 /* Leader. */,
+ 2 /* Term. */,
+ 1 /* Vote. */,
+ 2 /* Volatile term. */,
+ 1 /* Volatile vote. */,
+ "{0: 2, 1: 3, 2: 5}" /* Vclock. */
+ ), "became leader");
+ /*
+ * Server1 WAL is blocked and it gets a vote request with a matching
+ * vclock.
+ */
+ raft_node_block(&node);
+ is(raft_node_send_vote_request(
+ &node,
+ 3 /* Term. */,
+ "{1: 3, 2: 5}" /* Vclock. */,
+ 2 /* Source. */
+ ), 0, "vote request in a new term but WAL is blocked");
+ /*
+ * A WAL write ends, which was started before the vote request arrived.
+ */
+ raft_node_journal_follow(&node, 1, 1);
+ raft_node_unblock(&node);
+ /*
+ * Server1 rejects the vote request then, because its own vclock became
+ * bigger after the WAL sync. Instead, it voted for self.
+ */
+ ok(raft_node_check_full_state(
+ &node,
+ RAFT_STATE_CANDIDATE /* State. */,
+ 0 /* Leader. */,
+ 3 /* Term. */,
+ 1 /* Vote. */,
+ 3 /* Volatile term. */,
+ 1 /* Volatile vote. */,
+ "{0: 4, 1: 4, 2: 5}" /* Vclock. */
+ ), "canceled the vote for other node and voted for self");
+
+ raft_node_destroy(&node);
+ raft_node_create(&node);
+
+ /*
+ * Vote request for self works always even if there were some pending
+ * rows in the WAL queue when the vote was issued.
+ */
+
+ raft_run_next_event();
+ is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate");
+ is(node.raft.term, 2, "term is 2");
+ raft_node_block(&node);
+ /* Start new term on election timeout, but can't persist anything. */
+ raft_run_next_event();
+ is(node.raft.term, 2, "term is 2");
+ is(node.raft.volatile_term, 3, "volatile term is 3");
+ /* WAL queue is flushed and there was some data before the vote. */
+ raft_node_journal_follow(&node, 1, 10);
+ raft_node_unblock(&node);
+ ok(raft_node_check_full_state(
+ &node,
+ RAFT_STATE_CANDIDATE /* State. */,
+ 0 /* Leader. */,
+ 3 /* Term. */,
+ 1 /* Vote. */,
+ 3 /* Volatile term. */,
+ 1 /* Volatile vote. */,
+ "{0: 4, 1: 10}" /* Vclock. */
+ ), "vote for self worked even though the WAL had non-empty queue");
+
+ raft_node_destroy(&node);
+
+ raft_finish_test();
+}
+
+static void
raft_test_leader_resign(void)
{
raft_start_test(24);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "became candidate");
raft_node_destroy(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "became leader");
raft_node_net_drop(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "the leader has resigned");
ok(raft_node_net_check_msg(&node,
0 /* Index. */,
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "new election is waiting for WAL write");
/* Now another node wins the election earlier. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "the leader is accepted");
/*
1 /* Vote. */,
4 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "the leader has resigned, new election is scheduled");
raft_node_unblock(&node);
1 /* Vote. */,
4 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "the leader is elected");
raft_node_destroy(&node);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "enter candidate state when no heartbeats from the leader");
/* Non-candidate ignores heartbeats. */
1 /* Vote. */,
5 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 4}" /* Vclock. */
+ "{0: 5}" /* Vclock. */
), "nothing changed - waiting for WAL write");
raft_node_unblock(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "enter candidate state");
ts = raft_time();
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "re-enter candidate state");
/* Reconfiguration works when done during election. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "still in the same term - new election timeout didn't expire");
raft_run_next_event();
1 /* Vote. */,
4 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 6}" /* Vclock. */
), "re-enter candidate state");
/* Decrease election timeout to earlier than now. */
1 /* Vote. */,
5 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 4}" /* Vclock. */
+ "{0: 8}" /* Vclock. */
), "re-enter candidate state");
/*
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "enter candidate state");
raft_node_cfg_election_quorum(&node, 3);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "enter leader state after another quorum lowering");
/* Quorum 1 allows to become leader right after WAL write. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "became leader again immediately with 1 self vote");
raft_node_destroy(&node);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "enter candidate state when the new death timeout expires");
/* Decrease timeout to earlier than now. */
1 /* Vote. */,
4 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 5}" /* Vclock. */
), "enter candidate state");
raft_node_destroy(&node);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "became candidate");
/* Multiple enabling does not break anything. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "nothing changed");
/* Leader disable makes it forget he was a leader. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "resigned from leader state");
/* Multiple disabling does not break anything. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "nothing changed");
/* Disabled node still bumps the term when needed. */
0 /* Vote. */,
4 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "term bump when disabled");
raft_node_destroy(&node);
2 /* Vote. */,
2 /* Volatile term. */,
2 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "wal write is finished");
double ts = raft_time();
1 /* Vote. */,
4 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 4}" /* Vclock. */
+ "{0: 6}" /* Vclock. */
), "new term is started with vote for self");
/*
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "new term is started with vote for self");
raft_node_destroy(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "new term is started with vote for self");
raft_node_stop(&node);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "started new term");
raft_node_destroy(&node);
1 /* Vote. */,
2 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 1}" /* Vclock. */
+ "{0: 2}" /* Vclock. */
), "elections with a new term");
/* Make so node 1 has votes 1 and 2. Node 3 has votes 3 and 4. */
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "a new term");
ok(node.raft.timer.repeat >= node.raft.election_timeout, "timeout is "
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "a new term");
/*
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "elections once no one sees the leader");
raft_node_cfg_election_quorum(&node, 1);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "become leader on quorum change");
raft_cfg_is_candidate_later(&node.raft, false);
1 /* Vote. */,
3 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 2}" /* Vclock. */
+ "{0: 3}" /* Vclock. */
), "cfg_is_candidate_later doesn't disrupt leader");
is(raft_node_send_follower(
0 /* Vote. */,
4 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "term bump after cfg_is_candidate_later makes node a voter.");
raft_cfg_is_candidate_later(&node.raft, true);
0 /* Vote. */,
4 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 3}" /* Vclock. */
+ "{0: 4}" /* Vclock. */
), "cfg_is_candidate_later doesn't transfer voter to a candidate");
is(raft_node_send_follower(
1 /* Vote. */,
5 /* Volatile term. */,
1 /* Volatile vote. */,
- "{0: 5}" /* Vclock. */
+ "{0: 6}" /* Vclock. */
), "Term bump with cfg_is_candidate_later transfers voter to candiate");
is(raft_leader_idle(&node.raft), 0,
0 /* Vote. */,
6 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 6}" /* Vclock. */
+ "{0: 7}" /* Vclock. */
), "no elections on start when someone sees the leader");
ok(!raft_ev_is_active(&node.raft.timer),
0 /* Vote. */,
6 /* Volatile term. */,
0 /* Volatile vote. */,
- "{0: 6}" /* Vclock. */
+ "{0: 7}" /* Vclock. */
), "no elections on becoming candidate when someone sees the leader");
ok(!raft_ev_is_active(&node.raft.timer),
static int
main_f(va_list ap)
{
- raft_start_test(18);
+ raft_start_test(19);
(void) ap;
fakeev_init();
raft_test_bad_msg();
raft_test_vote();
raft_test_vote_skip();
+ raft_test_vote_during_wal_write();
raft_test_leader_resign();
raft_test_split_brain();
raft_test_heartbeat();
blob - d61317d1ce2cddfd3331b73439052985cd51179d
blob + 14fc52759958d72f00fa643cc90b6b93b179b88d
--- test/unit/raft.result
+++ test/unit/raft.result
*** main_f ***
-1..18
+1..19
*** raft_test_leader_election ***
- 1..24
+ 1..26
ok 1 - 1 pending message at start
ok 2 - trigger worked
ok 3 - broadcast at start
ok 5 - elections with a new term
ok 6 - single vote for self
ok 7 - trigger worked
- ok 8 - 1 record in the journal
- ok 9 - term and vote are on disk
- ok 10 - 1 pending message
- ok 11 - vote request is sent
- ok 12 - vote response from 2
- ok 13 - 2 votes - 1 self and 1 foreign
- ok 14 - no work to do - not enough votes yet
- ok 15 - still candidate, waiting for elections
- ok 16 - trigger is the same
- ok 17 - vote response from 3
- ok 18 - 2 votes - 1 self and 2 foreign
- ok 19 - became leader
- ok 20 - trigger worked
- ok 21 - no work - broadcast should be done
- ok 22 - no new rows in the journal - state change is not persisted
- ok 23 - 1 pending message
- ok 24 - sent new-leader notification
+ ok 8 - 2 records in the journal
+ ok 9 - term is on disk
+ ok 10 - vote is on disk
+ ok 11 - 2 pending messages
+ ok 12 - term bump is sent
+ ok 13 - vote request is sent
+ ok 14 - vote response from 2
+ ok 15 - 2 votes - 1 self and 1 foreign
+ ok 16 - no work to do - not enough votes yet
+ ok 17 - still candidate, waiting for elections
+ ok 18 - trigger is the same
+ ok 19 - vote response from 3
+ ok 20 - 2 votes - 1 self and 2 foreign
+ ok 21 - became leader
+ ok 22 - trigger worked
+ ok 23 - no work - broadcast should be done
+ ok 24 - no new rows in the journal - state change is not persisted
+ ok 25 - 1 pending message
+ ok 26 - sent new-leader notification
ok 1 - subtests
*** raft_test_leader_election: done ***
*** raft_test_recovery ***
ok 39 - term is bumped and became candidate
ok 5 - subtests
*** raft_test_vote_skip: done ***
+ *** raft_test_vote_during_wal_write ***
+ 1..11
+ ok 1 - became candidate
+ ok 2 - vote response from 2
+ ok 3 - vote response from 3
+ ok 4 - became leader
+ ok 5 - vote request in a new term but WAL is blocked
+ ok 6 - canceled the vote for other node and voted for self
+ ok 7 - became candidate
+ ok 8 - term is 2
+ ok 9 - term is 2
+ ok 10 - volatile term is 3
+ ok 11 - vote for self worked even though the WAL had non-empty queue
+ok 6 - subtests
+ *** raft_test_vote_during_wal_write: done ***
*** raft_test_leader_resign ***
1..24
ok 1 - message is accepted
ok 22 - vote from 2
ok 23 - vote from 3
ok 24 - the leader is elected
-ok 6 - subtests
+ok 7 - subtests
*** raft_test_leader_resign: done ***
*** raft_test_split_brain ***
1..4
ok 2 - leader is found
ok 3 - second leader notification
ok 4 - split brain, the old leader is kept
-ok 7 - subtests
+ok 8 - subtests
*** raft_test_split_brain: done ***
*** raft_test_heartbeat ***
1..12
ok 10 - became leader
ok 11 - message from leader
ok 12 - nothing changed - waiting for WAL write
-ok 8 - subtests
+ok 9 - subtests
*** raft_test_heartbeat: done ***
*** raft_test_election_timeout ***
1..13
ok 11 - re-enter candidate state
ok 12 - term is bumped, timeout was truly random
ok 13 - still candidate
-ok 9 - subtests
+ok 10 - subtests
*** raft_test_election_timeout: done ***
*** raft_test_election_quorum ***
1..7
ok 5 - but still candidate
ok 6 - enter leader state after another quorum lowering
ok 7 - became leader again immediately with 1 self vote
-ok 10 - subtests
+ok 11 - subtests
*** raft_test_election_quorum: done ***
*** raft_test_death_timeout ***
1..9
ok 7 - became follower
ok 8 - death is detected immediately
ok 9 - enter candidate state
-ok 11 - subtests
+ok 12 - subtests
*** raft_test_death_timeout: done ***
*** raft_test_enable_disable ***
1..11
ok 9 - resigned from leader state
ok 10 - nothing changed
ok 11 - term bump when disabled
-ok 12 - subtests
+ok 13 - subtests
*** raft_test_enable_disable: done ***
*** raft_test_too_long_wal_write ***
1..22
ok 20 - volatile vote is self
ok 21 - new election timeout works
ok 22 - new term is started with vote for self
-ok 13 - subtests
+ok 14 - subtests
*** raft_test_too_long_wal_write: done ***
*** raft_test_promote_restore ***
1..12
ok 10 - still old term
ok 11 - not a candidate
ok 12 - not a candidate
-ok 14 - subtests
+ok 15 - subtests
*** raft_test_promote_restore: done ***
*** raft_test_bump_term_before_cfg ***
1..6
ok 4 - bump term externally
ok 5 - term write is in progress
ok 6 - started new term
-ok 15 - subtests
+ok 16 - subtests
*** raft_test_bump_term_before_cfg: done ***
*** raft_test_split_vote ***
1..64
ok 62 - planned new election after yield
ok 63 - vote response for 3 from 3
ok 64 - still waiting for yield
-ok 16 - subtests
+ok 17 - subtests
*** raft_test_split_vote: done ***
*** raft_test_pre_vote ***
1..43
ok 41 - leader accepted
ok 42 - leader seen notification accepted
ok 43 - No timer re_start on election timeout reconfig when it's not time for elections yet
-ok 17 - subtests
+ok 18 - subtests
*** raft_test_pre_vote: done ***
*** raft_test_resign ***
1..2
ok 1 - became leader
ok 2 - resigned from leader state
-ok 18 - subtests
+ok 19 - subtests
*** raft_test_resign: done ***
*** main_f: done ***