commit - e319c21ca3520de83de9faec1b722dc5da5d776f
commit + f0f9647d8b80a2bad56b9462fff28b28af2203aa
blob - /dev/null
blob + 0ef0fa34fcdab4a086c714ae37fa4457412d2ab0 (mode 644)
--- /dev/null
+++ changelogs/unreleased/gh-7486-prohibit-roll-back-due-to-replication_synchro_timeout.md
+## feature/replication
+
+* A new compat option `compat.replication_synchro_timeout` has been added.
+ This option determines whether the `replication.synchro_timeout` option rolls
+ back transactions. When set to 'new', transactions are not rolled back due to
+ a timeout. In this mode, `replication.synchro_timeout` is only used to wait
+ for confirmation in promote/demote and gc-checkpointing. If 'old' is set, the
+ behavior is the same as it was before this patch.
+* A new `replication.synchro_queue_max_size` option limits the total size of
+ transactions in the master synchronous queue.
+ `replication.synchro_queue_max_size` is measured in the number of bytes to be
+ written (0 means unlimited, which was the default behavior before).
+ Currently, this option defaults to 16 megabytes.
+ (gh-7486)
blob - 24ce034371205db858d92df3d7b6705086c17a46
blob + d5b870d386c614d6d98fabb929aa46ace6c0c410
--- src/box/lua/config/instance_config.lua
+++ src/box/lua/config/instance_config.lua
return rebase_file_path(base_dir, path)
end
--- Read a config.context[name] variable depending of its "from"
+-- Read a config.context[name] variable depending on its "from"
-- type.
local function read_context_var_noexc(base_dir, def)
if def.from == 'env' then
'candidate',
}, {
box_cfg = 'election_mode',
- -- The effective default is determined depending of
+ -- The effective default is determined depending on
-- the replication.failover option.
default = box.NULL,
}),
default = 'old',
}),
wal_cleanup_delay_deprecation = schema.enum({
+ 'old',
+ 'new',
+ }, {
+ default = 'old',
+ }),
+ replication_synchro_timeout = schema.enum({
'old',
'new',
}, {
blob - e50fc0c637d14a23f3b2c27f0d50adafe277044a
blob + 656e6b7bbd552ebf3d1e5dfe4037d981d4676df2
--- src/box/replication.cc
+++ src/box/replication.cc
#include "raft.h"
#include "relay.h"
#include "sio.h"
+#include "tweaks.h"
uint32_t instance_id = REPLICA_ID_NIL;
struct tt_uuid INSTANCE_UUID;
struct uri cfg_bootstrap_leader_uri;
char cfg_bootstrap_leader_name[NODE_NAME_SIZE_MAX];
char cfg_instance_name[NODE_NAME_SIZE_MAX];
+
+bool replication_synchro_timeout_rollback_enabled = true;
+TWEAK_BOOL(replication_synchro_timeout_rollback_enabled);
struct replicaset replicaset;
blob - ff3e59d1570e883a3e07f1f75ced973bf798a88c
blob + 9d93202976ba593caf00ddc39b6c7049541d611f
--- src/box/replication.h
+++ src/box/replication.h
* for a synchronous transaction until it is rolled back.
*/
extern double replication_synchro_timeout;
+
+/**
+ * Part of internal.tweaks.replication_synchro_timeout_rollback_enabled.
+ * Indicates whether the replication_synchro_timeout option rolls back
+ * transactions or is only used to wait for confirmation in
+ * box_(promote/demote) and gc_do_checkpoint.
+ */
+extern bool replication_synchro_timeout_rollback_enabled;
/**
* Max time to wait for appliers to synchronize before entering
blob - a86da8e94437c76d53b1df41b81db24192a4b4a9
blob + 8a65b341cd0a0e7e82b0722439c8420b7e690860
--- src/box/txn_limbo.c
+++ src/box/txn_limbo.c
assert(txn_has_flag(entry->txn, TXN_WAIT_SYNC));
double start_time = fiber_clock();
while (true) {
- double deadline = start_time + replication_synchro_timeout;
- double timeout = deadline - fiber_clock();
- int rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout);
+ int rc;
+ if (replication_synchro_timeout_rollback_enabled) {
+ double timeout = start_time +
+ replication_synchro_timeout - fiber_clock();
+ rc = fiber_cond_wait_timeout(
+ &limbo->wait_cond, timeout);
+ } else {
+ rc = fiber_cond_wait(&limbo->wait_cond);
+ }
if (txn_limbo_entry_is_complete(entry))
goto complete;
if (rc != 0 && fiber_is_cancelled())
blob - 81a9f9453520daa547beecd453f1a4f59bc59246
blob + fb28862bfd8cb816e018fed2a3b6ec60791dce52
--- src/lua/compat.lua
+++ src/lua/compat.lua
https://tarantool.io/compat/wal_cleanup_delay_deprecation
]]
+local REPLICATION_SYNCHRO_TIMEOUT_COMPAT_BRIEF = [[
+Determines whether the replication_synchro_timeout option rolls back
+transactions or it only used to wait confirmation in promote/demote and
+gc-checkpointing.
+
+https://tarantool.io/compat/replication_synchro_timeout
+]]
-- Returns an action callback that toggles a tweak.
local function tweak_action(tweak_name, old_tweak_value, new_tweak_value)
brief = WAL_CLEANUP_DELAY_DEPRECATION_BRIEF,
action = function() end,
},
+ replication_synchro_timeout = {
+ default = 'old',
+ obsolete = nil,
+ brief = REPLICATION_SYNCHRO_TIMEOUT_COMPAT_BRIEF,
+ action = tweak_action(
+ 'replication_synchro_timeout_rollback_enabled', true, false),
+ },
}
-- Array with option names in order of addition.
blob - ef5f7af6698172578d61a6416bfae3014f3ae7f5
blob + fc04a72e60879f21eff703fa23c6018591996a59
--- test/config-luatest/cluster_config_schema_test.lua
+++ test/config-luatest/cluster_config_schema_test.lua
box_cfg_replication_sync_timeout = 'new',
box_consider_system_spaces_synchronous = 'old',
box_error_serialize_verbose = 'old',
+ replication_synchro_timeout = 'old',
sql_seq_scan_default = 'new',
fiber_slice_default = 'new',
box_info_cluster_meaning = 'new',
blob - /dev/null
blob + 63c2d99efcb8582f615d5a14a5856cd26c9b9fdb (mode 644)
--- /dev/null
+++ test/replication-luatest/gh_7486_prohibit_roll_back_due_to_replication_synchro_timeout_test.lua
+local t = require('luatest')
+local server = require('luatest.server')
+
+local g = t.group('replication_synchro_timeout_does_not_roll_back')
+--
+-- gh-7486: `replication_synchro_timeout` does not roll back.
+--
+-- Passed as the `timeout` option to t.helpers.retrying() in the tests below.
+local wait_timeout = 10
+
+-- Start one server, promote it, wait until it is writable and create the
+-- synchronous space `test` with a primary index.
+g.before_all(function(cg)
+ cg.server = server:new()
+ cg.server:start()
+ cg.server:exec(function()
+ box.ctl.promote()
+ box.ctl.wait_rw()
+ -- is_sync = true makes writes to the space synchronous transactions.
+ box.schema.space.create('test', {is_sync = true})
+ box.space.test:create_index('pk')
+ end)
+end)
+
+-- Drop the server stored in cg.server once all tests of the group are done.
+g.after_all(function(cg)
+ cg.server:drop()
+end)
+
+-- With compat.replication_synchro_timeout = 'old', a synchronous transaction
+-- that does not collect a quorum within replication_synchro_timeout must fail
+-- with the quorum timeout error.
+g.test_replication_synchro_timeout_rolls_back_if_old = function(cg)
+    cg.server:exec(function()
+        -- Only one server is started for this group, so a quorum of 2 is not
+        -- expected to be collected and the insert should end by the timeout.
+        box.cfg{
+            replication_synchro_quorum = 2,
+            replication_synchro_timeout = 0.01,
+        }
+        require('compat').replication_synchro_timeout = 'old'
+        t.assert_error_msg_content_equals(
+            'Quorum collection for a synchronous transaction is timed out',
+            box.space.test.insert, box.space.test, {1})
+        -- Restore a reachable quorum. Re-applying the value 2 here would be
+        -- a no-op and would leave the instance with the quorum set above.
+        box.cfg{ replication_synchro_quorum = 1, }
+    end)
+end
+
+-- We can't make sure that replication_synchro_timeout will never roll back the
+-- transaction, but we can check that it doesn't happen during a period that
+-- is, for example, 10 times longer than replication_synchro_timeout.
+g.test_replication_synchro_timeout_does_not_roll_back_if_new = function(cg)
+ cg.server:exec(function(wait_timeout)
+ box.cfg{
+ replication_synchro_quorum = 2,
+ replication_synchro_timeout = 0.01,
+ }
+ local fiber = require('fiber')
+ require('compat').replication_synchro_timeout = 'new'
+ -- Run the insert in its own fiber so that this fiber can observe it.
+ local f = fiber.create(function() box.space.test:insert{1} end)
+ f:set_joinable(true)
+ -- Wait until the transaction shows up in the synchronous queue.
+ t.helpers.retrying({timeout = wait_timeout}, function()
+ t.assert_equals(box.info.synchro.queue.len, 1)
+ end)
+ -- 0.1 is 10 times the replication_synchro_timeout (0.01) set above.
+ fiber.sleep(0.1)
+ -- Lower the quorum to 1, then check that the insert fiber succeeded.
+ box.cfg{ replication_synchro_quorum = 1, }
+ local ok, _ = f:join()
+ t.assert(ok)
+ box.space.test:drop()
+ end, {wait_timeout})
+end