commit 49e459ea9b039177aa70f7c32ac13f88312ee1ac from: Magomed Kostoev via: Serge Petrenko <35663196+sergepetrenko@users.noreply.github.com> date: Thu Aug 29 11:25:23 2024 UTC memtx: accelerate count in the tree Prior to this patch the count request had been performed in the memtx in a straightforward way: we created an iterator by a type and key and simply called its `next` method until it's exhausted. That means the operation had a linear complexity, which could lead to DoS situations. Also the count operation with ALL iterator hadn't been recorded in the MVCC previously and had an erroneous logic (see the #10140). This is fixed too as a side effect. This patch makes the memtx tree index use a version of BPS tree with the cardinality information enabled and takes advantage of its offset based API to implement the count operation using tree lookups. Since the method does not read each counted tuple now, MVCC subsystem would be unaware of it. In order to fix this, this patch introduces a new entity in the memtx transaction manager to track such operations: GAP_COUNT, and the corresponding `memtx_tx_track_count` function. The entity (gap item) is a record that a concurrent transaction has counted tuples matching some key and iterator type in an index. If a transaction creates such an entity, any insertion or deletion of a matching tuple in the index will be conflicted with it. This works differently for inserts and deletes: 1. If a concurrent transaction inserts a new matching tuple, then its read_gaps list is modified like the counted transaction had read the exact key of the tuple and found nothing. This creates a conflict. 2. If a concurrent transaction deletes a matching tuple, then the transaction that counted the tuple is inserted into the tuple reader list: it pretends to have read the tuple prior to the deletion. This creates a conflict. The existing stories of matching dirty tuples are handled differently. Since its's not stored directly whether the story is a product of an insertion or a replacement, any matching visible tuple is marked as read by the counting transaction and any matching invisible one is marked as gap read, so the conflicting scenarios remain the same for old stories. One thing to be noted is that using the cardinality config introduces a performance regression against the current version of memtx, this is to be mitigated in the scope of #10322. Part of #8204 Closes #10140 NO_DOC=perf improvement NO_CHANGELOG=TBD when fully implemented commit - 79fe74c2ffc3e5721e2d5622c8f08f23014b22e4 commit + 49e459ea9b039177aa70f7c32ac13f88312ee1ac blob - 2079d94b66cc897853cb436e69f5b773091e8d9a blob + f4cf550c01fc2395e41d9167d9c720d4b527d483 --- src/box/memtx_tree.cc +++ src/box/memtx_tree.cc @@ -110,6 +110,7 @@ memtx_tree_data_is_equal(const struct memtx_tree_data_ return a->tuple == b->tuple; } +#define BPS_INNER_CARD #define BPS_TREE_NAME memtx_tree #define BPS_TREE_BLOCK_SIZE (512) #define BPS_TREE_EXTENT_SIZE MEMTX_EXTENT_SIZE @@ -782,9 +783,11 @@ prepare_start_prefix_iterator(struct memtx_tree_key_da * @param type - the lookup iterator type, may be updated; * @param region - the region to allocate a new @a start_data on if required. * @param[out] iterator - the result of the lookup; + * @param[out] offset - the offset @a iterator points to; * @param[out] equals - true if the lookup gave the exact key match; - * @param[out] initial_elem - the element approached on initial lookup, stepped - * over if the iterator is reverse, see the end of this function for more info. + * @param[out] initial_elem - the optional pointer to the element approached on + * the initial lookup, stepped over if the iterator is reverse, see the end of + * this function for more information. * * @retval true on success; * @retval false if the iteration must be stopped without an error. @@ -796,7 +799,7 @@ memtx_tree_lookup(memtx_tree_t *tree, struct memtx_tree_key_data after_data, enum iterator_type *type, struct region *region, memtx_tree_iterator_t *iterator, - bool *equals, + size_t *offset, bool *equals, struct memtx_tree_data **initial_elem) { struct key_def *cmp_def = memtx_tree_cmp_def(tree); @@ -823,7 +826,7 @@ memtx_tree_lookup(memtx_tree_t *tree, /* Perform the initial lookup. */ if (start_data->key == NULL) { assert(*type == ITER_GE || *type == ITER_LE); - if (iterator_type_is_reverse(*type)) + if (iterator_type_is_reverse(*type)) { /* * For all reverse iterators we will step back, * see the and explanation code below. @@ -832,8 +835,11 @@ memtx_tree_lookup(memtx_tree_t *tree, * position to the last element. Let's use that. */ invalidate_tree_iterator(iterator); - else + *offset = memtx_tree_size(tree); + } else { *iterator = memtx_tree_first(tree); + *offset = 0; + } /* If there is at least one tuple in the tree, it is * efficiently equals to the empty key. */ @@ -858,18 +864,18 @@ memtx_tree_lookup(memtx_tree_t *tree, need_lower_bound = !need_lower_bound; if (need_lower_bound) { - *iterator = memtx_tree_lower_bound(tree, start_data, - equals); + *iterator = memtx_tree_lower_bound_get_offset( + tree, start_data, equals, offset); } else { - *iterator = memtx_tree_upper_bound(tree, start_data, - equals); + *iterator = memtx_tree_upper_bound_get_offset( + tree, start_data, equals, offset); } } /* Save the element we approached on the initial lookup. */ *initial_elem = memtx_tree_iterator_get_elem(tree, iterator); - if (iterator_type_is_reverse(*type)) + if (iterator_type_is_reverse(*type)) { /* * Because of limitations of tree search API we use * lower_bound for LT search and upper_bound for LE and @@ -882,6 +888,8 @@ memtx_tree_lookup(memtx_tree_t *tree, * last position in the tree, that's what we need. */ memtx_tree_iterator_prev(tree, iterator); + --*offset; /* Unsigned underflow possible. */ + } return true; } @@ -908,12 +916,13 @@ tree_iterator_start(struct iterator *iterator, struct struct memtx_tree_key_data start_data = it->after_data.key != NULL ? it->after_data : it->key_data; enum iterator_type type = it->type; + size_t unused; /* The flag is true if the found tuple equals to the key. */ bool equals; struct memtx_tree_data *initial_elem; if (!memtx_tree_lookup(tree, &start_data, it->after_data, &type, region, &it->tree_iterator, - &equals, &initial_elem)) + &unused, &equals, &initial_elem)) return 0; /* @@ -1163,9 +1172,120 @@ static ssize_t memtx_tree_index_count(struct index *base, enum iterator_type type, const char *key, uint32_t part_count) { - if (type == ITER_ALL) - return memtx_tree_index_size(base); /* optimization */ - return generic_index_count(base, type, key, part_count); + assert((base->def->opts.hint == INDEX_HINT_ON) == USE_HINT); + + struct region *region = &fiber()->gc; + RegionGuard region_guard(region); + + struct memtx_tree_index *index = + (struct memtx_tree_index *)base; + + if (canonicalize_lookup(base->def, &type, &key, part_count) == -1) + return -1; + + memtx_tree_t *tree = &index->tree; + struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree); + struct memtx_tree_key_data start_data; + start_data.key = key; + start_data.part_count = part_count; + if (USE_HINT) + start_data.set_hint(key_hint(key, part_count, cmp_def)); + struct memtx_tree_key_data null_after_data = {}; + memtx_tree_iterator_t unused; + size_t begin_offset; + bool equals; + struct memtx_tree_data *initial_elem; + if (!memtx_tree_lookup(tree, &start_data, null_after_data, &type, + region, &unused, &begin_offset, &equals, + &initial_elem)) + return 0; + + struct txn *txn = in_txn(); + struct space *space = space_by_id(base->def->space_id); + size_t full_size = memtx_tree_size(tree); + size_t end_offset; + + /* Fast path: not found equal with full key. */ + if (start_data.part_count == cmp_def->part_count && + !equals && (type == ITER_EQ || type == ITER_REQ)) { +/********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND START*********/ + /* + * Inform MVCC like we have attempted to read a full key and + * found nothing. Insertion of this exact key into the tree + * will conflict with us. + */ + memtx_tx_track_point(txn, space, base, start_data.key); +/*********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND END**********/ + return 0; /* No tuple matching the full key. */ + } + + /* Fast path: not found with reverse iterator. */ + if (begin_offset == (size_t)-1) { + assert(iterator_type_is_reverse(type)); + struct tuple *successor = + initial_elem ? initial_elem->tuple : NULL; +/********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND START*********/ + /* + * Inform MVCC that we have attempted to read a tuple prior + * to the successor (the first tuple in the tree or NULL if + * the tree is empty) and got nothing by our key and iterator. + * If someone writes a matching tuple at the beginning of the + * tree it will conflict with us. + */ + memtx_tx_track_gap(txn, space, base, successor, type, + start_data.key, start_data.part_count); +/*********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND END**********/ + return 0; /* No tuples prior to the first one. */ + } + + /* Fast path: not found with forward iterator. */ + if (begin_offset == full_size) { + assert(!iterator_type_is_reverse(type)); +/********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND START*********/ + /* + * Inform MVCC that we have attempted to read a tuple right to + * the rightest one in the tree (NULL successor) and thus, got + * nothing. If someone writes a tuple matching our key+iterator + * pair at the end of the tree it will conflict with us. The + * tree can be empty here. + */ + memtx_tx_track_gap(txn, space, base, NULL, type, start_data.key, + start_data.part_count); +/*********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND END**********/ + return 0; /* No tuples beyond the last one. */ + } + + /* + * Now, when we have the first tuple and its offset, let's find the + * boundary of the iteration. + */ + if (type == ITER_EQ) { + memtx_tree_upper_bound_get_offset(tree, &start_data, + NULL, &end_offset); + } else if (type == ITER_REQ) { + memtx_tree_lower_bound_get_offset(tree, &start_data, + NULL, &end_offset); + end_offset--; /* Unsigned underflow possible. */ + } else { + end_offset = iterator_type_is_reverse(type) ? -1 : full_size; + } + + size_t full_count = ((ssize_t)end_offset - begin_offset) * + iterator_direction(type); + +/********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND START*********/ + /* + * Inform MVCC that we have counted tuples in the index by our key and + * iterator. Insertion or deletion of any matching tuple anywhere in the + * index will conflict with us. + * + * It returns the amount of invisible counted tuples BTW. + */ + size_t invisible_count = memtx_tx_track_count( + txn, space, base, type, start_data.key, start_data.part_count); +/*********MVCC TRANSACTION MANAGER STORY GARBAGE COLLECTION BOUND END**********/ + + return full_count - invisible_count; } template blob - 564626d5ca584c689f81a7c7bb4468d2276b9464 blob + 98d83f14403d96856d20e21af269820b4cd6a60e --- src/box/memtx_tx.c +++ src/box/memtx_tx.c @@ -211,6 +211,13 @@ enum gap_item_type { * in successor's story, or index->read_gaps if there's no successor. */ GAP_NEARBY, + /** + * A transaction has completed a count of tuples matching a key and + * iterator. After that any consequent delete or insert of any tuple + * matching the key+iterator pair must lead to a conflict. Such an + * item will be stored in index->read_gaps. + */ + GAP_COUNT, /** * A transaction completed a full scan of unordered index. After that * any consequent write to any new place of the index must lead to @@ -268,6 +275,11 @@ struct full_scan_gap_item { }; /** + * Derived class for count gap, @sa GAP_COUNT. + */ +#define count_gap_item nearby_gap_item + +/** * Initialize common part of gap item, except for in_read_gaps member, * which initialization is specific for gap item type. */ @@ -303,6 +315,14 @@ static struct full_scan_gap_item * memtx_tx_full_scan_gap_item_new(struct txn *txn); /** + * Allocate and create count gap item. + * Note that in_read_gaps base member must be initialized later. + */ +static struct count_gap_item * +memtx_tx_count_gap_item_new(struct txn *txn, enum iterator_type type, + const char *key, uint32_t part_count); + +/** * Helper structure for searching for point_hole_item in the hash table, * @sa point_hole_item_pool. */ @@ -1785,6 +1805,88 @@ memtx_tx_handle_point_hole_write(struct space *space, } while (has_more_items); mh_point_holes_del(ht, pos, 0); +} + +static bool +memtx_tx_tuple_matches(struct key_def *def, struct tuple *tuple, + enum iterator_type type, const char *key, + uint32_t part_count) +{ + if (key == NULL) { + assert(part_count == 0); + assert(type == ITER_LE || type == ITER_GE); + + /* An empty key matches to any tuple. */ + return true; + } + + int cmp = tuple_compare_with_key(tuple, HINT_NONE, key, + part_count, HINT_NONE, def); + + bool equal_matches = type == ITER_EQ || type == ITER_REQ || + type == ITER_LE || type == ITER_GE; + bool less_matches = type == ITER_LT || type == ITER_LE; + bool greater_matches = type == ITER_GT || type == ITER_GE; + + return (equal_matches && cmp == 0) || + (greater_matches && cmp > 0) || + (less_matches && cmp < 0); +} + +/** + * Check for possible conflict relations with GAP_COUNT entries during insertion + * or deletion of tuple (with the corresponding @a story) in index @a ind. It is + * needed if and only if there was no replaced tuple in the index for insertion + * or in case of a deletion. It's the moment where we can search for count gaps + * and find conflict causes. If some transactions have counted tuples by the key + * and iterator matching the tuple - those transactions will be bind as readers + * of the tuple. + */ +static void +memtx_tx_handle_counted_write(struct space *space, struct memtx_story *story, + uint32_t ind) +{ + bool is_insert = story->del_stmt == NULL; + + assert(story->link[ind].newer_story == NULL || !is_insert); + + struct index *index = space->index[ind]; + + struct gap_item_base *item_base, *tmp; + rlist_foreach_entry_safe(item_base, &index->read_gaps, + in_read_gaps, tmp) { + if (item_base->type != GAP_COUNT) + continue; + struct count_gap_item *item = + (struct count_gap_item *)item_base; + + bool tuple_matches = memtx_tx_tuple_matches( + index->def->key_def, story->tuple, + item->type, item->key, item->part_count); + + /* + * Someone has counted tuples in the index by a key and iterator + * matching to the inserted or deleted tuple, it's a conflict. + */ + if (tuple_matches) { + if (is_insert) { + /* + * Record like the counted transaction had read + * by a key matching the tuple and got nothing + * there. Now this insertion is conflicting. + */ + memtx_tx_track_story_gap(item_base->txn, + story, ind); + } else { + /* + * Record like the counted transaction had read + * the tuple. Now this deletion is conflicting. + */ + memtx_tx_track_read_story(item_base->txn, + space, story); + } + } + } } /** @@ -2054,6 +2156,7 @@ memtx_tx_history_add_insert_stmt(struct txn_stmt *stmt /* Collect conflicts. */ memtx_tx_handle_gap_write(space, add_story, succ, i); memtx_tx_handle_point_hole_write(space, add_story, i); + memtx_tx_handle_counted_write(space, add_story, i); memtx_tx_story_link_top(add_story, NULL, i, true); } if (next != NULL) { @@ -2152,6 +2255,19 @@ memtx_tx_history_add_delete_stmt(struct txn_stmt *stmt stmt->is_own_change = del_story->add_stmt->txn == stmt->txn; memtx_tx_story_link_deleted_by(del_story, stmt); + /* + * The tuple is deleted from the space, let's see if anyone had + * counted it in the indexes the tuple is contained in. + */ + struct space *space = stmt->space; + for (uint32_t i = 0; i < space->index_count; i++) { + if (!tuple_key_is_excluded(del_story->tuple, + space->index[i]->def->key_def, + MULTIKEY_NONE)) { + memtx_tx_handle_counted_write(space, del_story, i); + } + } + /* Notify statistics. */ if (!del_story->tuple_is_retained) memtx_tx_story_track_retained_tuple(del_story); @@ -2944,6 +3060,7 @@ memtx_tx_delete_gap(struct gap_item_base *item) pool = &txm.inplace_gap_item_mempoool; break; case GAP_NEARBY: + case GAP_COUNT: pool = &txm.nearby_gap_item_mempoool; break; case GAP_FULL_SCAN: @@ -3239,6 +3356,20 @@ memtx_tx_nearby_gap_item_new(struct txn *txn, enum ite } /** + * Allocate and create count gap item. + * Note that in_read_gaps base member must be initialized later. + */ +static struct count_gap_item * +memtx_tx_count_gap_item_new(struct txn *txn, enum iterator_type type, + const char *key, uint32_t part_count) +{ + struct count_gap_item *item = + memtx_tx_nearby_gap_item_new(txn, type, key, part_count); + item->base.type = GAP_COUNT; + return item; +} + +/** * Allocate and create full scan gap item. * Note that in_read_gaps base member must be initialized later. */ @@ -3282,9 +3413,64 @@ memtx_tx_track_gap_slow(struct txn *txn, struct space rlist_add(&story->link[index->dense_id].read_gaps, &item->base.in_read_gaps); } else { + rlist_add(&index->read_gaps, &item->base.in_read_gaps); + } + memtx_tx_story_gc(); +} + +/** + * Record in TX manager that a transaction @a txn have counted @a index of @a + * space by @a key and iterator @a type. This function must be used for queries + * that count tuples in indexes (for example, index:size or index:count). + * + * @return the amount of invisible tuples counted. + */ +uint32_t +memtx_tx_track_count_slow(struct txn *txn, struct space *space, + struct index *index, enum iterator_type type, + const char *key, uint32_t part_count) +{ + if (txn != NULL && txn->status == TXN_INPROGRESS) { + struct count_gap_item *item = + memtx_tx_count_gap_item_new(txn, type, key, part_count); rlist_add(&index->read_gaps, &item->base.in_read_gaps); } + + /* + * There may be stories that we have (or have not) counted. Since we + * don't iterate over the counted tuples, the fact we have counted + * these stories is not recorded anywhere. Let's make the counting + * transaction a reader of the stories he has counted and gap reader + * of the matching stories that hadn't been counted. + * + * So rollback of counted stories will roll this TX back too, and + * commit of the matching not counted stories will conflict with it. + * + * The downside is that we'll not only conflict with insertions and + * deletions, but also with replace stories. + */ + uint32_t invisible_count = 0; + struct memtx_story *story; + memtx_tx_foreach_in_index_tuple_story(space, index, story, { + /* All tuples in the story chain share the same key. */ + if (!memtx_tx_tuple_matches(index->def->key_def, story->tuple, + type, key, part_count)) + continue; + + /* + * Track the story as read or gap read and conflict with the + * prepared transactions whose changes are invisible to us. + * + * Let's count invisible BTW, it's free. + */ + bool is_prepared_ok = detect_whether_prepared_ok(txn); + if (memtx_tx_story_clarify_impl(txn, space, story, index, + 0, is_prepared_ok) == NULL) + invisible_count++; + }); + memtx_tx_story_gc(); + return invisible_count; } /** blob - 510fd731beb6063d3047f91afadcd410a4fe12dc blob + 9443deccab053e18ce406647830ee479b0b875cc --- src/box/memtx_tx.h +++ src/box/memtx_tx.h @@ -316,6 +316,34 @@ memtx_tx_track_gap(struct txn *txn, struct space *spac return; memtx_tx_track_gap_slow(txn, space, index, successor, type, key, part_count); +} + +/** + * Helper of memtx_tx_track_count. + */ +uint32_t +memtx_tx_track_count_slow(struct txn *txn, struct space *space, + struct index *index, enum iterator_type type, + const char *key, uint32_t part_count); + +/** + * Record in TX manager that a transaction @a txn have counted @a index from @a + * space by @a key and iterator @a type. This function must be used for queries + * that count tuples in indexes (for example, index:size or index:count). + * + * NB: can trigger story garbage collection. + * + * @return the amount of invisible tuples counted. + */ +static inline uint32_t +memtx_tx_track_count(struct txn *txn, struct space *space, + struct index *index, enum iterator_type type, + const char *key, uint32_t part_count) +{ + if (!memtx_tx_manager_use_mvcc_engine) + return 0; + return memtx_tx_track_count_slow(txn, space, index, + type, key, part_count); } /** blob - /dev/null blob + 7536be36acd759c12225779703ac82828febf944 (mode 644) --- /dev/null +++ test/box-luatest/gh_8204_fast_offset_test.lua @@ -0,0 +1,512 @@ +local server = require('luatest.server') +local t = require('luatest') + +local g_generic = t.group('gh-8204-generic') +local g_mvcc = t.group('gh-8204-mvcc') + +g_generic.before_all(function() + g_generic.server = server:new({ alias = 'master' }) + g_generic.server:start() +end) + +g_mvcc.before_all(function() + g_mvcc.server = server:new({ + alias = 'master', + box_cfg = { memtx_use_mvcc_engine = true } + }) + g_mvcc.server:start() +end) + +for _, g in pairs({g_generic, g_mvcc}) do + g.after_all(function() + g.server:drop() + end) +end + +g_generic.after_each(function() + g_generic.server:exec(function() + if box.space.test then + box.space.test:drop() + end + end) +end) + +g_mvcc.after_each(function() + g_mvcc.server:exec(function() + if box.space.test then + box.space.test:drop() + end + + if box.space.make_conflicting_writer then + box.space.make_conflicting_writer:drop() + end + end) +end) + +g_generic.test_count = function() + g_generic.server:exec(function() + -- A space with a secondary key (so we can check nulls). + local s = box.schema.space.create('test') + s:create_index('pk') + local sk = s:create_index('sk', + {parts = {{2, 'uint64', is_nullable = true}, + {3, 'uint64', is_nullable = true}}}) + + -- A helper function for verbose assertion using pretty printer. + local function check(it, key, expected_count) + local pp = require('luatest.pp') + + -- The location of the callee. + local file = debug.getinfo(2, 'S').source + local line = debug.getinfo(2, 'l').currentline + + -- The stringified key. + local key_str = pp.tostring(key) + + t.assert_equals(sk:count(key, {iterator = it}), expected_count, + string.format('\nkey: %s,\niterator: %s,' .. + '\nfile: %s,\nline: %d,', + key_str, it, file, line)) + end + + -- Test the empty space. + for _, it in pairs({'lt', 'le', 'eq', 'req', 'ge', 'gt', 'all'}) do + check(it, {}, 0) + check(it, {box.NULL}, 0) + check(it, {0}, 0) + check(it, {1}, 0) + check(it, {1, box.NULL}, 0) + check(it, {1, 0}, 0) + check(it, {1, 1}, 0) + end + + -- Fill the space. + s:insert({1, 1, 1}) + s:insert({2, 1, 2}) + s:insert({3, 2, 1}) + s:insert({4, 2, 2}) + s:insert({5, 3, 1}) + s:insert({6, 3, 2}) + t.assert_equals(s:count(), 6) + + -- Empty key always returns the space size. + for _, it in pairs({'lt', 'le', 'eq', 'req', 'ge', 'gt', 'all'}) do + check(it, {}, s:count()) + end + + -- GE, ALL (it's identical to GE according to the documentation). + for _, it in pairs({'ge', 'all'}) do + check(it, {box.NULL}, 6) + check(it, {1}, 6) + check(it, {1, 1}, 6) + check(it, {1, 2}, 5) + check(it, {1, 3}, 4) + check(it, {2}, 4) + check(it, {2, box.NULL}, 4) + check(it, {2, 1}, 4) + check(it, {2, 2}, 3) + check(it, {3, 1}, 2) + check(it, {3, 2}, 1) + check(it, {3, 3}, 0) + check(it, {4}, 0) + end + + -- GT. + check('gt', {box.NULL}, 6) + check('gt', {1}, 4) + check('gt', {2}, 2) + check('gt', {2, 1}, 3) + check('gt', {2, 2}, 2) + check('gt', {2, box.NULL}, 4) + check('gt', {3, 1}, 1) + check('gt', {3, 2}, 0) + check('gt', {3, 3}, 0) + check('gt', {3}, 0) + + -- LE. + check('le', {3}, 6) + check('le', {3, 2}, 6) + check('le', {3, 1}, 5) + check('le', {3, box.NULL}, 4) + check('le', {2}, 4) + check('le', {2, 2}, 4) + check('le', {2, 1}, 3) + check('le', {2, box.NULL}, 2) + check('le', {1}, 2) + check('le', {0}, 0) + check('le', {box.NULL}, 0) + + -- LT. + check('lt', {4}, 6) + check('lt', {3, 3}, 6) + check('lt', {3, 2}, 5) + check('lt', {3, 1}, 4) + check('lt', {3}, 4) + check('lt', {2, 2}, 3) + check('lt', {2, 1}, 2) + check('lt', {2}, 2) + check('lt', {2, box.NULL}, 2) + check('lt', {1}, 0) + check('lt', {0}, 0) + check('lt', {box.NULL}, 0) + + -- EQ/REQ. + for _, it in pairs({'eq', 'req'}) do + check(it, {box.NULL}, 0) + for _, key in pairs({1, 2, 3}) do + check(it, {key}, 2) + check(it, {key, 1}, 1) + check(it, {key, 2}, 1) + check(it, {key, box.NULL}, 0) + end + end + end) +end + +g_mvcc.test_count = function() + g_mvcc.server:exec(function() + -- The test space with fast offset PK. + local s = box.schema.space.create('test') + s:create_index('pk', {parts = {{1, 'unsigned'}, {2, 'unsigned'}}}) + + -- Create a space to make tested transactions writing - only writing + -- transactions can cause conflicts with aborts. + box.schema.space.create('make_conflicting_writer') + box.space.make_conflicting_writer:create_index('pk', {sequence = true}) + + local kd = require('key_def').new(s.index.pk.parts) + + local all_iterators = {'lt', 'le', 'req', 'eq', 'ge', 'gt'} + local existing_keys = {} + local unexisting_keys = {} + local test_keys = {} + + -- Prepare proxies. + local txn_proxy = require('test.box.lua.txn_proxy') + local tx = txn_proxy.new() + local tx1 = txn_proxy.new() + local tx2 = txn_proxy.new() + + -- Proxy helpers. + local conflict = {{error = "Transaction has been aborted by conflict"}} + local success = '' + + -- Stringify a table (key or tuple) to use in lua code string. + -- E. g. array of 2 elements is transformed into "{1, 2}" string. + local function to_lua_code(table) + -- Create a raw table without metatables. + local raw_table = {} + for k, v in pairs(table) do + raw_table[k] = v + end + return require('luatest.pp').tostring(raw_table) + end + + -- Check if count on sk_fast index with given key and iterator gives the + -- expected result for the given transaction. + local function check(tx, it, key, expected_count, file, line) + -- The location of the callee. + local file = file or debug.getinfo(2, 'S').source + local line = line or debug.getinfo(2, 'l').currentline + + -- The stringified key. + local key = to_lua_code(key) + + local code = string.format('box.space.test.index.pk:count(%s, ' .. + '{iterator = "%s"})', key, it) + + local comment = string.format('\nkey: %s,\niterator: %s,' .. + '\nfile: %s,\nline: %d,', + key, it, file, line) + + local ok, res = pcall(tx, code) + t.assert(ok, comment) + t.assert_equals(res, {expected_count}, comment) + end + + -- Make the tx1 open a transaction and count by given it and key, + -- then make the tx2 insert/replace/delete (op) the given tuple, + -- then make the tx1 writing to make it abort on conflict, + -- then make the tx1 commit its transaction and expect tx1_result. + -- + -- The tuple inserted/deleted by tx2 is cleaned up. + -- The make_conflicting_writer space is updated but not restored. + local function count_do(tx1, tx2, it, key, expected_count, op, tuple, + tx1_result) + assert(op == 'insert' or op == 'delete' or op == 'replace') + + local old_len = s:len() + local tuple_existed = s:count(tuple) ~= 0 + + -- The location of the callee. + local file = debug.getinfo(2, 'S').source + local line = debug.getinfo(2, 'l').currentline + + local key_str = to_lua_code(key) + local tuple_str = to_lua_code(tuple) + + local tx2_command = string.format('box.space.test:%s(%s)', + op, tuple_str) + + local comment = string.format('\nkey: %s\niterator: %s' .. + '\noperation: %s\ntuple: %s' .. + '\nfile: %s,\nline: %s', key_str, + it, op, tuple_str, file, line) + + -- Remove past stories cause they cause unconditional conflicts, + -- whereas future statements only conflict with count if they + -- insert a new matching tuple or delete a counted one. + box.internal.memtx_tx_gc(100) + + -- Make the tx1 start a transaction and count. + tx1:begin() + check(tx1, it, key, expected_count, file, line); + + -- Make the tx2 perform the op. + t.assert_equals(tx2(tx2_command), {tuple}, comment) + + -- Make the tx1 writing to abort on conflict. + tx1('box.space.make_conflicting_writer:insert({nil})') + + -- Try to commit the tx1. + t.assert_equals(tx1:commit(), tx1_result, comment) + + -- Cleanup. + local tuple_exists = s:count(tuple) ~= 0 + if op == 'insert' and not tuple_existed and tuple_exists then + s:delete(tuple) + elseif op == 'delete' and tuple_existed and not tuple_exists then + s:insert(tuple) + end + t.assert_equals(s:len(), old_len, comment) + end + + -- Check if a tuple matches to the given iterator type and key. + local function tuple_matches(tuple, it, key) + -- An empty key matches to anything. + if #key == 0 then + return true + end + + local lt_matches = it == 'lt' or it == 'le' + local eq_matches = it == 'le' or it == 'ge' or + it == 'eq' or it == 'req' + local gt_matches = it == 'ge' or it == 'gt' + + local cmp = kd:compare_with_key(tuple, key) + return (cmp == 0 and eq_matches) or + (cmp < 0 and lt_matches) or + (cmp > 0 and gt_matches) + end + + -- Simple manual count implementation. + local function count_matching(t, it, key) + local result = 0 + for _, tuple in pairs(t) do + if tuple_matches(tuple, it, key) then + result = result + 1 + end + end + return result + end + + -- Check for consistency of count with the given key and iterator in the + -- given transaction: first performs a count, and then performs inserts + -- and deletes of keys and checks if the count result remains the same. + -- + -- If the transaction is nil, starts and commits a new one. + local function check_consistency(tx_arg, it, key, expected_count) + -- The location of the callee. + local file = debug.getinfo(2, 'S').source + local line = debug.getinfo(2, 'l').currentline + + local old_len = s:len() + local tx = tx_arg or txn_proxy.new() + + -- Start a transaction manually if no passed. + if tx_arg == nil then + tx:begin() + end + + check(tx, it, key, expected_count, file, line) + for _, new_key in pairs(unexisting_keys) do + s:insert(new_key) + check(tx, it, key, expected_count, file, line) + end + for _, old_key in pairs(existing_keys) do + s:delete(old_key) + check(tx, it, key, expected_count, file, line) + end + for _, old_key in pairs(unexisting_keys) do + s:delete(old_key) + check(tx, it, key, expected_count, file, line) + end + for _, new_key in pairs(existing_keys) do + s:insert(new_key) + check(tx, it, key, expected_count, file, line) + end + + -- Autocommit if no transaction passed. + if tx_arg == nil then + t.assert_equals(tx:commit(), success) + end + + t.assert_equals(s:len(), old_len) + t.assert_equals(s:select(), existing_keys) + end + + -- Some keys are defined to exist in the space, others - aren't. This + -- is useful for testing (one knows what can be inserted or deleted). + local function to_exist(i) + return i % 2 == 1 -- 1, 3, 5 exist, 0, 2, 4, 6 - don't. + end + + -- Check if the local tables are consistent with space contents. + local function check_space() + t.assert_equals(s:len(), #existing_keys) + t.assert_equals(s:select(), existing_keys) + end + + -- Generate key lists. + for i = 0, 6 do + for j = 0, 6 do + if to_exist(i) or j == i then + if not to_exist(i) or not to_exist(j) then + table.insert(unexisting_keys, {i, j}) + else + table.insert(existing_keys, {i, j}) + end + table.insert(test_keys, {i, j}) + end + end + table.insert(test_keys, {i}) + end + table.insert(test_keys, {}) + + -- Insert the keys to exist. + for _, key in pairs(existing_keys) do + s:insert(key) + end + check_space() + + -- No conflict (count by key & replace any key). + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + for _, tuple in pairs(existing_keys) do + count_do(tx1, tx2, it, key, expect, + 'replace', tuple, success) + end + end + end + check_space() + + -- Conflict (count by key & insert matching). + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + for _, tuple in pairs(unexisting_keys) do + if tuple_matches(tuple, it, key) then + count_do(tx1, tx2, it, key, expect, + 'insert', tuple, conflict) + end + end + end + end + check_space() + + -- Conflict (count by key & delete matching). + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + for _, tuple in pairs(existing_keys) do + if tuple_matches(tuple, it, key) then + count_do(tx1, tx2, it, key, expect, + 'delete', tuple, conflict) + end + end + end + end + check_space() + + -- No conflict (count by key & insert not matching). + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + for _, tuple in pairs(unexisting_keys) do + if not tuple_matches(tuple, it, key) then + count_do(tx1, tx2, it, key, expect, + 'insert', tuple, success) + end + end + end + end + check_space() + + -- No conflict (count by key & delete not matching). + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + for _, tuple in pairs(existing_keys) do + if not tuple_matches(tuple, it, key) then + count_do(tx1, tx2, it, key, expect, + 'delete', tuple, success) + end + end + end + end + check_space() + + -- Consistency in the read view. + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + check_consistency(nil, it, key, expect) + end + end + check_space() + + -- Consistency in the read view (in a single transaction). + tx:begin() + for _, it in pairs(all_iterators) do + for _, key in pairs(test_keys) do + local expect = count_matching(existing_keys, it, key) + check_consistency(tx, it, key, expect) + end + end + t.assert_equals(tx:commit(), success) + check_space() + end) +end + +g_mvcc.test_past_history = function() + g_mvcc.server:exec(function() + -- The test space. + local s = box.schema.space.create('test') + s:create_index('pk', {parts = {{1, 'unsigned'}, {2, 'unsigned'}}}) + + -- Prepare proxies. + local txn_proxy = require('test.box.lua.txn_proxy') + local tx1 = txn_proxy.new() + local tx2 = txn_proxy.new() + + tx1:begin() + tx2:begin() + tx1('box.space.test:replace{1, 0}') + + -- No prepared tuples - count must return 0. + local count = tx2('return box.space.test:count{1, 0}')[1] + t.assert_equals(count, 0) + + -- Commit writer. + tx1:commit() + + -- Count again - the same value (zero) must be returned. + count = tx2('return box.space.test:count{1, 0}')[1] + t.assert_equals(count, 0) + tx2:commit() + + -- Check if insert actually happened. + t.assert_equals(box.space.test:select{}, {{1, 0}}) + end) +end