commit f0f9647d8b80a2bad56b9462fff28b28af2203aa from: Astronomax via: Serge Petrenko <35663196+sergepetrenko@users.noreply.github.com> date: Fri Oct 04 15:41:15 2024 UTC replication: prohibit roll back due to `replication_synchro_timeout` To better match the canonical Raft design, this patch prohibits automatic transaction rollback due to `replication.synchro_timeout`. A new compat option has been added for this purpose. The compat option is named `compat.replication_synchro_timeout` and is `'old'` by default. When set to 'new', the `replication.synchro_timeout` option has slightly different semantics. With these semantics, transactions are no longer rolled back at this timeout; `replication.synchro_timeout` is used only to wait for confirmation in promote/demote and gc-checkpointing. If some transaction in limbo did not have time to commit within `replication_synchro_timeout`, the corresponding operation (promote/demote or gc-checkpointing) can be aborted automatically (in this aspect, the behavior of the option is no different from what it was before). If 'old' is set, the option has the same semantics as before. In order to be able to understand from the code what value the `compat.replication_synchro_timeout` option is set to - 'old' or 'new', a special Boolean tweak `replication_synchro_timeout_rollback_enabled` was introduced. Note that PROMOTE and DEMOTE can still roll back a transaction. Only the ability to roll back by timeout has been prohibited. Closes #7486 @TarantoolBot document Title: new compat option: 'compat.replication_synchro_timeout' Product: Tarantool Since: 3.3 Root document: New page - https://www.tarantool.io/en/doc/latest/reference/reference_lua/compat/replication_synchro_timeout/ The `compat` module allows you to choose between: * the old behavior: unconfirmed synchronous transactions are rolled back after a `replication.synchro_timeout`. 
* and the new behavior: a synchronous transaction can remain in the synchro queue indefinitely until it reaches a quorum of confirmations. `replication.synchro_timeout` is used only to wait for confirmation in promote/demote and gc-checkpointing. If some transaction in limbo did not have time to commit within `replication_synchro_timeout`, the corresponding operation (promote/demote or gc-checkpointing) can be aborted automatically. commit - e319c21ca3520de83de9faec1b722dc5da5d776f commit + f0f9647d8b80a2bad56b9462fff28b28af2203aa blob - /dev/null blob + 0ef0fa34fcdab4a086c714ae37fa4457412d2ab0 (mode 644) --- /dev/null +++ changelogs/unreleased/gh-7486-prohibit-roll-back-due-to-replication_synchro_timeout.md @@ -0,0 +1,14 @@ +## feature/replication + +* A new compat option `compat.replication_synchro_timeout` has been added. + This option determines whether the `replication.synchro_timeout` option rolls + back transactions. When set to 'new', transactions are not rolled back due to + a timeout. In this mode `replication.synchro_timeout` is used to wait + confirmation in promote/demote and gc-checkpointing. If 'old' is set, the + behavior is no different from what it was before this patch appeared. +* A new `replication.synchro_queue_max_size` option limits the number of + transactions in the master synchronous queue. + `replication.synchro_queue_max_size` is measured in the number of bytes to be + written (0 means unlimited, which was the default behavior before). + Currently, this option defaults to 16 megabytes. + (gh-7486) blob - 24ce034371205db858d92df3d7b6705086c17a46 blob + d5b870d386c614d6d98fabb929aa46ace6c0c410 --- src/box/lua/config/instance_config.lua +++ src/box/lua/config/instance_config.lua @@ -404,7 +404,7 @@ local function prepare_file_path(self, iconfig, path) return rebase_file_path(base_dir, path) end --- Read a config.context[name] variable depending of its "from" +-- Read a config.context[name] variable depending on its "from" -- type. 
local function read_context_var_noexc(base_dir, def) if def.from == 'env' then @@ -1504,7 +1504,7 @@ return schema.new('instance_config', schema.record({ 'candidate', }, { box_cfg = 'election_mode', - -- The effective default is determined depending of + -- The effective default is determined depending on -- the replication.failover option. default = box.NULL, }), @@ -2451,6 +2451,12 @@ return schema.new('instance_config', schema.record({ default = 'old', }), wal_cleanup_delay_deprecation = schema.enum({ + 'old', + 'new', + }, { + default = 'old', + }), + replication_synchro_timeout = schema.enum({ 'old', 'new', }, { blob - e50fc0c637d14a23f3b2c27f0d50adafe277044a blob + 656e6b7bbd552ebf3d1e5dfe4037d981d4676df2 --- src/box/replication.cc +++ src/box/replication.cc @@ -42,6 +42,7 @@ #include "raft.h" #include "relay.h" #include "sio.h" +#include "tweaks.h" uint32_t instance_id = REPLICA_ID_NIL; struct tt_uuid INSTANCE_UUID; @@ -67,6 +68,9 @@ struct tt_uuid cfg_bootstrap_leader_uuid; struct uri cfg_bootstrap_leader_uri; char cfg_bootstrap_leader_name[NODE_NAME_SIZE_MAX]; char cfg_instance_name[NODE_NAME_SIZE_MAX]; + +bool replication_synchro_timeout_rollback_enabled = true; +TWEAK_BOOL(replication_synchro_timeout_rollback_enabled); struct replicaset replicaset; blob - ff3e59d1570e883a3e07f1f75ced973bf798a88c blob + 9d93202976ba593caf00ddc39b6c7049541d611f --- src/box/replication.h +++ src/box/replication.h @@ -195,6 +195,14 @@ extern int replication_synchro_quorum; * for a synchronous transaction until it is rolled back. */ extern double replication_synchro_timeout; + +/** + * Part of internal.tweaks.replication_synchro_timeout_rollback_enabled. + * Indicates whether the replication_synchro_timeout option rolls back + * transactions or it only used to wait confirmation in box_(promote/demote) + * and gc_do_checkpoint. 
+ */ +extern bool replication_synchro_timeout_rollback_enabled; /** * Max time to wait for appliers to synchronize before entering blob - a86da8e94437c76d53b1df41b81db24192a4b4a9 blob + 8a65b341cd0a0e7e82b0722439c8420b7e690860 --- src/box/txn_limbo.c +++ src/box/txn_limbo.c @@ -324,9 +324,15 @@ txn_limbo_wait_complete(struct txn_limbo *limbo, struc assert(txn_has_flag(entry->txn, TXN_WAIT_SYNC)); double start_time = fiber_clock(); while (true) { - double deadline = start_time + replication_synchro_timeout; - double timeout = deadline - fiber_clock(); - int rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout); + int rc; + if (replication_synchro_timeout_rollback_enabled) { + double timeout = start_time + + replication_synchro_timeout - fiber_clock(); + rc = fiber_cond_wait_timeout( + &limbo->wait_cond, timeout); + } else { + rc = fiber_cond_wait(&limbo->wait_cond); + } if (txn_limbo_entry_is_complete(entry)) goto complete; if (rc != 0 && fiber_is_cancelled()) blob - 81a9f9453520daa547beecd453f1a4f59bc59246 blob + fb28862bfd8cb816e018fed2a3b6ec60791dce52 --- src/lua/compat.lua +++ src/lua/compat.lua @@ -156,6 +156,13 @@ be retained by persistent WAL GC. https://tarantool.io/compat/wal_cleanup_delay_deprecation ]] +local REPLICATION_SYNCHRO_TIMEOUT_COMPAT_BRIEF = [[ +Determines whether the replication_synchro_timeout option rolls back +transactions or it only used to wait confirmation in promote/demote and +gc-checkpointing. + +https://tarantool.io/compat/replication_synchro_timeout +]] -- Returns an action callback that toggles a tweak. 
local function tweak_action(tweak_name, old_tweak_value, new_tweak_value) @@ -287,6 +294,13 @@ local options = { brief = WAL_CLEANUP_DELAY_DEPRECATION_BRIEF, action = function() end, }, + replication_synchro_timeout = { + default = 'old', + obsolete = nil, + brief = REPLICATION_SYNCHRO_TIMEOUT_COMPAT_BRIEF, + action = tweak_action( + 'replication_synchro_timeout_rollback_enabled', true, false), + }, } -- Array with option names in order of addition. blob - ef5f7af6698172578d61a6416bfae3014f3ae7f5 blob + fc04a72e60879f21eff703fa23c6018591996a59 --- test/config-luatest/cluster_config_schema_test.lua +++ test/config-luatest/cluster_config_schema_test.lua @@ -337,6 +337,7 @@ g.test_defaults = function() box_cfg_replication_sync_timeout = 'new', box_consider_system_spaces_synchronous = 'old', box_error_serialize_verbose = 'old', + replication_synchro_timeout = 'old', sql_seq_scan_default = 'new', fiber_slice_default = 'new', box_info_cluster_meaning = 'new', blob - /dev/null blob + 63c2d99efcb8582f615d5a14a5856cd26c9b9fdb (mode 644) --- /dev/null +++ test/replication-luatest/gh_7486_prohibit_roll_back_due_to_replication_synchro_timeout_test.lua @@ -0,0 +1,61 @@ +local t = require('luatest') +local server = require('luatest.server') + +local g = t.group('replication_synchro_timeout_does_not_roll_back') +-- +-- gh-7486: `replication_synchro_timeout` does not roll back. 
+-- +local wait_timeout = 10 + +g.before_all(function(cg) + cg.server = server:new() + cg.server:start() + cg.server:exec(function() + box.ctl.promote() + box.ctl.wait_rw() + box.schema.space.create('test', {is_sync = true}) + box.space.test:create_index('pk') + end) +end) + +g.after_all(function(cg) + cg.server:drop() +end) + +g.test_replication_synchro_timeout_rolls_back_if_old = function(cg) + cg.server:exec(function() + box.cfg{ + replication_synchro_quorum = 2, + replication_synchro_timeout = 0.01, + } + require('compat').replication_synchro_timeout = 'old' + t.assert_error_msg_content_equals( + 'Quorum collection for a synchronous transaction is timed out', + box.space.test.insert, box.space.test, {1}) + box.cfg{ replication_synchro_quorum = 2, } + end) +end + +-- We can't make sure that replication_synchro_timeout will never roll back the +-- transaction, but we can check that it won't happen in some period longer +-- than replication_synchro_timeout by 10 times, for example. +g.test_replication_synchro_timeout_does_not_roll_back_if_new = function(cg) + cg.server:exec(function(wait_timeout) + box.cfg{ + replication_synchro_quorum = 2, + replication_synchro_timeout = 0.01, + } + local fiber = require('fiber') + require('compat').replication_synchro_timeout = 'new' + local f = fiber.create(function() box.space.test:insert{1} end) + f:set_joinable(true) + t.helpers.retrying({timeout = wait_timeout}, function() + t.assert_equals(box.info.synchro.queue.len, 1) + end) + fiber.sleep(0.1) + box.cfg{ replication_synchro_quorum = 1, } + local ok, _ = f:join() + t.assert(ok) + box.space.test:drop() + end, {wait_timeout}) +end