Drizzled Public API Documentation

btr0cur.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1994, 2010, Innobase Oy. All Rights Reserved.
4 Copyright (C) 2008, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify it under
13 the terms of the GNU General Public License as published by the Free Software
14 Foundation; version 2 of the License.
15 
16 This program is distributed in the hope that it will be useful, but WITHOUT
17 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
19 
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 St, Fifth Floor, Boston, MA 02110-1301 USA
23 
24 *****************************************************************************/
25 
26 /**************************************************//**
27 @file btr/btr0cur.c
28 The index tree cursor
29 
30 All changes that row operations make to a B-tree or the records
31 there must go through this module! Undo log records are written here
32 of every modify or insert of a clustered index record.
33 
34 			NOTE!!!
35 To make sure we do not run out of file space during a pessimistic
36 insert or update, we have to reserve 2 x the height of the index tree
37 in pages before we start the operation, because if leaf splitting
38 has been started, it is difficult to undo, except by crashing the
39 database and doing a roll-forward.
40 
41 Created 10/16/1994 Heikki Tuuri
42 *******************************************************/
43 
44 #include "btr0cur.h"
45 
46 #ifdef UNIV_NONINL
47 #include "btr0cur.ic"
48 #endif
49 
50 #include "row0upd.h"
51 #ifndef UNIV_HOTBACKUP
52 #include "mtr0log.h"
53 #include "page0page.h"
54 #include "page0zip.h"
55 #include "rem0rec.h"
56 #include "rem0cmp.h"
57 #include "buf0lru.h"
58 #include "btr0btr.h"
59 #include "btr0sea.h"
60 #include "row0purge.h"
61 #include "row0upd.h"
62 #include "trx0rec.h"
63 #include "trx0roll.h" /* trx_is_recv() */
64 #include "que0que.h"
65 #include "row0row.h"
66 #include "srv0srv.h"
67 #include "ibuf0ibuf.h"
68 #include "lock0lock.h"
69 #include "zlib.h"
70 
72 typedef enum btr_op_enum {
73  BTR_NO_OP = 0,
74  BTR_INSERT_OP,
75  BTR_INSERT_IGNORE_UNIQUE_OP,
76  BTR_DELETE_OP,
77  BTR_DELMARK_OP
78 } btr_op_t;
79 
80 #ifdef UNIV_DEBUG
81 
83 UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
84 #endif /* UNIV_DEBUG */
85 
87 UNIV_INTERN ulint btr_cur_n_non_sea = 0;
90 UNIV_INTERN ulint btr_cur_n_sea = 0;
94 UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
98 UNIV_INTERN ulint btr_cur_n_sea_old = 0;
99 
102 #define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
103 
105 /* @{ */
106 /*--------------------------------------*/
107 #define BTR_BLOB_HDR_PART_LEN 0
109 #define BTR_BLOB_HDR_NEXT_PAGE_NO 4
111 /*--------------------------------------*/
112 #define BTR_BLOB_HDR_SIZE 8
114 /* @} */
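
The two offsets above locate the part length and the next-page number inside the 8-byte header that precedes the data of each BLOB part page; BTR_BLOB_HDR_SIZE is the total header size. A minimal sketch (documentation only, not part of btr0cur.cc) of decoding one part header, assuming blob_hdr points at the header inside a fetched BLOB page:

/* Documentation sketch, assuming blob_hdr points at a BLOB part
header; mach_read_from_4() is InnoDB's big-endian 4-byte reader. */
ulint	part_len;	/* bytes of BLOB data stored on this page */
ulint	next_page_no;	/* next page in the chain, or FIL_NULL */

part_len     = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_PART_LEN);
next_page_no = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_NEXT_PAGE_NO);
/* The BLOB data of this part starts at blob_hdr + BTR_BLOB_HDR_SIZE. */
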
115 #endif /* !UNIV_HOTBACKUP */
116 
120 const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]= {0};
121 
122 #ifndef UNIV_HOTBACKUP
123 /*******************************************************************/
127 static
128 void
129 btr_cur_unmark_extern_fields(
130 /*=========================*/
131  page_zip_des_t* page_zip,
133  rec_t* rec,
134  dict_index_t* index,
135  const ulint* offsets,
136  mtr_t* mtr);
137 /*******************************************************************/
140 static
141 void
142 btr_cur_add_path_info(
143 /*==================*/
144  btr_cur_t* cursor,
145  ulint height,
147  ulint root_height);
148 /***********************************************************/
151 static
152 void
153 btr_rec_free_updated_extern_fields(
154 /*===============================*/
155  dict_index_t* index,
157  rec_t* rec,
158  page_zip_des_t* page_zip,
160  const ulint* offsets,
161  const upd_t* update,
162  enum trx_rb_ctx rb_ctx,
163  mtr_t* mtr);
165 /***********************************************************/
167 static
168 void
169 btr_rec_free_externally_stored_fields(
170 /*==================================*/
171  dict_index_t* index,
173  rec_t* rec,
174  const ulint* offsets,
175  page_zip_des_t* page_zip,
177  enum trx_rb_ctx rb_ctx,
178  mtr_t* mtr);
181 /***********************************************************/
184 static
185 ulint
186 btr_rec_get_externally_stored_len(
187 /*==============================*/
188  rec_t* rec,
189  const ulint* offsets);
190 #endif /* !UNIV_HOTBACKUP */
191 
192 /******************************************************/
194 UNIV_INLINE
195 void
196 btr_rec_set_deleted_flag(
197 /*=====================*/
198  rec_t* rec,
199  page_zip_des_t* page_zip,
200  ulint flag)
201 {
202  if (page_rec_is_comp(rec)) {
203  rec_set_deleted_flag_new(rec, page_zip, flag);
204  } else {
205  ut_ad(!page_zip);
206  rec_set_deleted_flag_old(rec, flag);
207  }
208 }
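
A usage sketch (not from the source): delete-marking the record that a B-tree cursor is positioned on, with page_zip taken from the cursor's block (NULL for uncompressed pages):

rec_t*		rec	 = btr_cur_get_rec(cursor);
page_zip_des_t*	page_zip = buf_block_get_page_zip(
	btr_cur_get_block(cursor));

btr_rec_set_deleted_flag(rec, page_zip, TRUE);
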
209 
210 #ifndef UNIV_HOTBACKUP
211 /*==================== B-TREE SEARCH =========================*/
212 
213 /********************************************************************/
215 static
216 void
217 btr_cur_latch_leaves(
218 /*=================*/
219  page_t* page,
221  ulint space,
222  ulint zip_size,
224  ulint page_no,
225  ulint latch_mode,
226  btr_cur_t* cursor,
227  mtr_t* mtr)
228 {
229  ulint mode;
230  ulint left_page_no;
231  ulint right_page_no;
232  buf_block_t* get_block;
233 
234  ut_ad(page && mtr);
235 
236  switch (latch_mode) {
237  case BTR_SEARCH_LEAF:
238  case BTR_MODIFY_LEAF:
239  mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
240  get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
241 #ifdef UNIV_BTR_DEBUG
242  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
243 #endif /* UNIV_BTR_DEBUG */
244  get_block->check_index_page_at_flush = TRUE;
245  return;
246  case BTR_MODIFY_TREE:
247  /* x-latch also brothers from left to right */
248  left_page_no = btr_page_get_prev(page, mtr);
249 
250  if (left_page_no != FIL_NULL) {
251  get_block = btr_block_get(space, zip_size,
252  left_page_no,
253  RW_X_LATCH, mtr);
254 #ifdef UNIV_BTR_DEBUG
255  ut_a(page_is_comp(get_block->frame)
256  == page_is_comp(page));
257  ut_a(btr_page_get_next(get_block->frame, mtr)
258  == page_get_page_no(page));
259 #endif /* UNIV_BTR_DEBUG */
260  get_block->check_index_page_at_flush = TRUE;
261  }
262 
263  get_block = btr_block_get(space, zip_size, page_no,
264  RW_X_LATCH, mtr);
265 #ifdef UNIV_BTR_DEBUG
266  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
267 #endif /* UNIV_BTR_DEBUG */
268  get_block->check_index_page_at_flush = TRUE;
269 
270  right_page_no = btr_page_get_next(page, mtr);
271 
272  if (right_page_no != FIL_NULL) {
273  get_block = btr_block_get(space, zip_size,
274  right_page_no,
275  RW_X_LATCH, mtr);
276 #ifdef UNIV_BTR_DEBUG
277  ut_a(page_is_comp(get_block->frame)
278  == page_is_comp(page));
279  ut_a(btr_page_get_prev(get_block->frame, mtr)
280  == page_get_page_no(page));
281 #endif /* UNIV_BTR_DEBUG */
282  get_block->check_index_page_at_flush = TRUE;
283  }
284 
285  return;
286 
287  case BTR_SEARCH_PREV:
288  case BTR_MODIFY_PREV:
289  mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
290  /* latch also left brother */
291  left_page_no = btr_page_get_prev(page, mtr);
292 
293  if (left_page_no != FIL_NULL) {
294  get_block = btr_block_get(space, zip_size,
295  left_page_no, mode, mtr);
296  cursor->left_block = get_block;
297 #ifdef UNIV_BTR_DEBUG
298  ut_a(page_is_comp(get_block->frame)
299  == page_is_comp(page));
300  ut_a(btr_page_get_next(get_block->frame, mtr)
301  == page_get_page_no(page));
302 #endif /* UNIV_BTR_DEBUG */
303  get_block->check_index_page_at_flush = TRUE;
304  }
305 
306  get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
307 #ifdef UNIV_BTR_DEBUG
308  ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
309 #endif /* UNIV_BTR_DEBUG */
310  get_block->check_index_page_at_flush = TRUE;
311  return;
312  }
313 
314  ut_error;
315 }
316 
317 /********************************************************************/
329 UNIV_INTERN
330 void
331 btr_cur_search_to_nth_level(
332 /*========================*/
333  dict_index_t* index,
334  ulint level,
335  const dtuple_t* tuple,
338  ulint mode,
341  ulint latch_mode,
352  btr_cur_t* cursor,
354  ulint has_search_latch,
357  const char* file,
358  ulint line,
359  mtr_t* mtr)
360 {
361  page_t* page;
362  buf_block_t* block;
363  ulint space;
364  buf_block_t* guess;
365  ulint height;
366  ulint page_no;
367  ulint up_match;
368  ulint up_bytes;
369  ulint low_match;
370  ulint low_bytes;
371  ulint savepoint;
372  ulint rw_latch;
373  ulint page_mode;
374  ulint buf_mode;
375  ulint estimate;
376  ulint zip_size;
377  page_cur_t* page_cursor;
378  btr_op_t btr_op;
379  ulint root_height = 0; /* remove warning */
380 
381 #ifdef BTR_CUR_ADAPT
382  btr_search_t* info;
383 #endif
384  mem_heap_t* heap = NULL;
385  ulint offsets_[REC_OFFS_NORMAL_SIZE];
386  ulint* offsets = offsets_;
387  rec_offs_init(offsets_);
388  /* Currently, PAGE_CUR_LE is the only search mode used for searches
389  ending to upper levels */
390 
391  ut_ad(level == 0 || mode == PAGE_CUR_LE);
392  ut_ad(dict_index_check_search_tuple(index, tuple));
393  ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
394  ut_ad(dtuple_check_typed(tuple));
395 
396 #ifdef UNIV_DEBUG
397  cursor->up_match = ULINT_UNDEFINED;
398  cursor->low_match = ULINT_UNDEFINED;
399 #endif
400 
401  /* These flags are mutually exclusive, they are lumped together
402  with the latch mode for historical reasons. It's possible for
403  none of the flags to be set. */
404  switch (UNIV_EXPECT(latch_mode
405  & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
406  0)) {
407  case 0:
408  btr_op = BTR_NO_OP;
409  break;
410  case BTR_INSERT:
411  btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
412  ? BTR_INSERT_IGNORE_UNIQUE_OP
413  : BTR_INSERT_OP;
414  break;
415  case BTR_DELETE:
416  btr_op = BTR_DELETE_OP;
417  ut_a(cursor->purge_node);
418  break;
419  case BTR_DELETE_MARK:
420  btr_op = BTR_DELMARK_OP;
421  break;
422  default:
423  /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
424  should be specified at a time */
425  ut_error;
426  }
427 
428  /* Operations on the insert buffer tree cannot be buffered. */
429  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
430  /* Operations on the clustered index cannot be buffered. */
431  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
432 
433  estimate = latch_mode & BTR_ESTIMATE;
434 
435  /* Turn the flags unrelated to the latch mode off. */
436  latch_mode &= ~(BTR_INSERT
437  | BTR_DELETE_MARK
438  | BTR_DELETE
439  | BTR_ESTIMATE
440  | BTR_IGNORE_SEC_UNIQUE);
441 
442  cursor->flag = BTR_CUR_BINARY;
443  cursor->index = index;
444 
445  cursor->ibuf_cnt = ULINT_UNDEFINED;
446 
447 #ifndef BTR_CUR_ADAPT
448  guess = NULL;
449 #else
450  info = btr_search_get_info(index);
451 
452  guess = info->root_guess;
453 
454 #ifdef BTR_CUR_HASH_ADAPT
455 
456 #ifdef UNIV_SEARCH_PERF_STAT
457  info->n_searches++;
458 #endif
459  if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
460  && latch_mode <= BTR_MODIFY_LEAF
461  && info->last_hash_succ
462  && !estimate
463 #ifdef PAGE_CUR_LE_OR_EXTENDS
464  && mode != PAGE_CUR_LE_OR_EXTENDS
465 #endif /* PAGE_CUR_LE_OR_EXTENDS */
466  /* If !has_search_latch, we do a dirty read of
467  btr_search_enabled below, and btr_search_guess_on_hash()
468  will have to check it again. */
469  && UNIV_LIKELY(btr_search_enabled)
470  && btr_search_guess_on_hash(index, info, tuple, mode,
471  latch_mode, cursor,
472  has_search_latch, mtr)) {
473 
474  /* Search using the hash index succeeded */
475 
476  ut_ad(cursor->up_match != ULINT_UNDEFINED
477  || mode != PAGE_CUR_GE);
478  ut_ad(cursor->up_match != ULINT_UNDEFINED
479  || mode != PAGE_CUR_LE);
480  ut_ad(cursor->low_match != ULINT_UNDEFINED
481  || mode != PAGE_CUR_LE);
482  btr_cur_n_sea++;
483 
484  return;
485  }
486 #endif /* BTR_CUR_HASH_ADAPT */
487 #endif /* BTR_CUR_ADAPT */
488  btr_cur_n_non_sea++;
489 
490  /* If the hash search did not succeed, do binary search down the
491  tree */
492 
493  if (has_search_latch) {
494  /* Release possible search latch to obey latching order */
495  rw_lock_s_unlock(&btr_search_latch);
496  }
497 
498  /* Store the position of the tree latch we push to mtr so that we
499  know how to release it when we have latched leaf node(s) */
500 
501  savepoint = mtr_set_savepoint(mtr);
502 
503  if (latch_mode == BTR_MODIFY_TREE) {
504  mtr_x_lock(dict_index_get_lock(index), mtr);
505 
506  } else if (latch_mode == BTR_CONT_MODIFY_TREE) {
507  /* Do nothing */
508  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
509  MTR_MEMO_X_LOCK));
510  } else {
511  mtr_s_lock(dict_index_get_lock(index), mtr);
512  }
513 
514  page_cursor = btr_cur_get_page_cur(cursor);
515 
516  space = dict_index_get_space(index);
517  page_no = dict_index_get_page(index);
518 
519  up_match = 0;
520  up_bytes = 0;
521  low_match = 0;
522  low_bytes = 0;
523 
524  height = ULINT_UNDEFINED;
525 
526  /* We use these modified search modes on non-leaf levels of the
527  B-tree. These let us end up in the right B-tree leaf. In that leaf
528  we use the original search mode. */
529 
530  switch (mode) {
531  case PAGE_CUR_GE:
532  page_mode = PAGE_CUR_L;
533  break;
534  case PAGE_CUR_G:
535  page_mode = PAGE_CUR_LE;
536  break;
537  default:
538 #ifdef PAGE_CUR_LE_OR_EXTENDS
539  ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
540  || mode == PAGE_CUR_LE_OR_EXTENDS);
541 #else /* PAGE_CUR_LE_OR_EXTENDS */
542  ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
543 #endif /* PAGE_CUR_LE_OR_EXTENDS */
544  page_mode = mode;
545  break;
546  }
547 
548  /* Loop and search until we arrive at the desired level */
549 
550 search_loop:
551  buf_mode = BUF_GET;
552  rw_latch = RW_NO_LATCH;
553 
554  if (height != 0) {
555  /* We are about to fetch the root or a non-leaf page. */
556  } else if (latch_mode <= BTR_MODIFY_LEAF) {
557  rw_latch = latch_mode;
558 
559  if (btr_op != BTR_NO_OP
560  && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
561 
562  /* Try to buffer the operation if the leaf
563  page is not in the buffer pool. */
564 
565  buf_mode = btr_op == BTR_DELETE_OP
566  ? BUF_GET_IF_IN_POOL_OR_WATCH
567  : BUF_GET_IF_IN_POOL;
568  }
569  }
570 
571  zip_size = dict_table_zip_size(index->table);
572 
573 retry_page_get:
574  block = buf_page_get_gen(
575  space, zip_size, page_no, rw_latch, guess, buf_mode,
576  file, line, mtr);
577 
578  if (block == NULL) {
579  /* This must be a search to perform an insert/delete
580  mark/delete; try using the insert/delete buffer */
581 
582  ut_ad(height == 0);
583  ut_ad(cursor->thr);
584 
585  switch (btr_op) {
586  case BTR_INSERT_OP:
587  case BTR_INSERT_IGNORE_UNIQUE_OP:
588  ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
589 
590  if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
591  space, zip_size, page_no,
592  cursor->thr)) {
593 
594  cursor->flag = BTR_CUR_INSERT_TO_IBUF;
595 
596  goto func_exit;
597  }
598  break;
599 
600  case BTR_DELMARK_OP:
601  ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
602 
603  if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
604  index, space, zip_size,
605  page_no, cursor->thr)) {
606 
607  cursor->flag = BTR_CUR_DEL_MARK_IBUF;
608 
609  goto func_exit;
610  }
611 
612  break;
613 
614  case BTR_DELETE_OP:
615  ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
616 
617  if (!row_purge_poss_sec(cursor->purge_node,
618  index, tuple)) {
619 
620  /* The record cannot be purged yet. */
621  cursor->flag = BTR_CUR_DELETE_REF;
622  } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
623  index, space, zip_size,
624  page_no,
625  cursor->thr)) {
626 
627  /* The purge was buffered. */
628  cursor->flag = BTR_CUR_DELETE_IBUF;
629  } else {
630  /* The purge could not be buffered. */
631  buf_pool_watch_unset(space, page_no);
632  break;
633  }
634 
635  buf_pool_watch_unset(space, page_no);
636  goto func_exit;
637 
638  default:
639  ut_error;
640  }
641 
642  /* Insert to the insert/delete buffer did not succeed, we
643  must read the page from disk. */
644 
645  buf_mode = BUF_GET;
646 
647  goto retry_page_get;
648  }
649 
650  block->check_index_page_at_flush = TRUE;
651  page = buf_block_get_frame(block);
652 
653  if (rw_latch != RW_NO_LATCH) {
654 #ifdef UNIV_ZIP_DEBUG
655  const page_zip_des_t* page_zip
656  = buf_block_get_page_zip(block);
657  ut_a(!page_zip || page_zip_validate(page_zip, page));
658 #endif /* UNIV_ZIP_DEBUG */
659 
660  buf_block_dbg_add_level(block, SYNC_TREE_NODE);
661  }
662 
663  ut_ad(index->id == btr_page_get_index_id(page));
664 
665  if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
666  /* We are in the root node */
667 
668  height = btr_page_get_level(page, mtr);
669  root_height = height;
670  cursor->tree_height = root_height + 1;
671 
672 #ifdef BTR_CUR_ADAPT
673  if (block != guess) {
674  info->root_guess = block;
675  }
676 #endif
677  }
678 
679  if (height == 0) {
680  if (rw_latch == RW_NO_LATCH) {
681 
682  btr_cur_latch_leaves(
683  page, space, zip_size, page_no, latch_mode,
684  cursor, mtr);
685  }
686 
687  if (latch_mode != BTR_MODIFY_TREE
688  && latch_mode != BTR_CONT_MODIFY_TREE) {
689 
690  /* Release the tree s-latch */
691 
692  mtr_release_s_latch_at_savepoint(
693  mtr, savepoint, dict_index_get_lock(index));
694  }
695 
696  page_mode = mode;
697  }
698 
699  page_cur_search_with_match(
700  block, index, tuple, page_mode, &up_match, &up_bytes,
701  &low_match, &low_bytes, page_cursor);
702 
703  if (estimate) {
704  btr_cur_add_path_info(cursor, height, root_height);
705  }
706 
707  /* If this is the desired level, leave the loop */
708 
709  ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
710  mtr));
711 
712  if (level != height) {
713 
714  const rec_t* node_ptr;
715  ut_ad(height > 0);
716 
717  height--;
718  guess = NULL;
719 
720  node_ptr = page_cur_get_rec(page_cursor);
721 
722  offsets = rec_get_offsets(
723  node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
724 
725  /* Go to the child node */
726  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
727 
728  if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
729  /* We're doing a search on an ibuf tree and we're one
730  level above the leaf page. */
731 
732  ulint is_min_rec;
733 
734  ut_ad(level == 0);
735 
736  is_min_rec = rec_get_info_bits(node_ptr, 0)
737  & REC_INFO_MIN_REC_FLAG;
738 
739  if (!is_min_rec) {
740  cursor->ibuf_cnt
741  = ibuf_rec_get_counter(node_ptr);
742 
743  ut_a(cursor->ibuf_cnt <= 0xFFFF
744  || cursor->ibuf_cnt == ULINT_UNDEFINED);
745  }
746 
747  buf_mode = BUF_GET;
748  rw_latch = RW_NO_LATCH;
749  goto retry_page_get;
750  }
751 
752  goto search_loop;
753  }
754 
755  if (level != 0) {
756  /* x-latch the page */
757  page = btr_page_get(
758  space, zip_size, page_no, RW_X_LATCH, mtr);
759 
760  ut_a((ibool)!!page_is_comp(page)
761  == dict_table_is_comp(index->table));
762  } else {
763  cursor->low_match = low_match;
764  cursor->low_bytes = low_bytes;
765  cursor->up_match = up_match;
766  cursor->up_bytes = up_bytes;
767 
768 #ifdef BTR_CUR_ADAPT
769  /* We do a dirty read of btr_search_enabled here. We
770  will properly check btr_search_enabled again in
771  btr_search_build_page_hash_index() before building a
772  page hash index, while holding btr_search_latch. */
773  if (UNIV_LIKELY(btr_search_enabled)) {
774 
775  btr_search_info_update(index, cursor);
776  }
777 #endif
778  ut_ad(cursor->up_match != ULINT_UNDEFINED
779  || mode != PAGE_CUR_GE);
780  ut_ad(cursor->up_match != ULINT_UNDEFINED
781  || mode != PAGE_CUR_LE);
782  ut_ad(cursor->low_match != ULINT_UNDEFINED
783  || mode != PAGE_CUR_LE);
784  }
785 
786 func_exit:
787 
788  if (UNIV_LIKELY_NULL(heap)) {
789  mem_heap_free(heap);
790  }
791 
792  if (has_search_latch) {
793 
794  rw_lock_s_lock(&btr_search_latch);
795  }
796 }
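
For reference, a typical leaf-level lookup through this function might look like the following sketch (documentation only; the caller owns the mini-transaction, and PAGE_CUR_LE positions the cursor on the greatest record that compares less than or equal to the tuple):

void
example_leaf_lookup(dict_index_t* index, const dtuple_t* tuple)
{
	mtr_t		mtr;
	btr_cur_t	cursor;

	mtr_start(&mtr);

	btr_cur_search_to_nth_level(index, 0 /* leaf level */, tuple,
				    PAGE_CUR_LE, BTR_SEARCH_LEAF,
				    &cursor, 0 /* no search latch held */,
				    __FILE__, __LINE__, &mtr);

	/* btr_cur_get_rec(&cursor) now points to the last record whose
	key compares <= tuple; cursor.low_match gives the number of
	matching fields. Page latches are released by the commit. */

	mtr_commit(&mtr);
}
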
797 
798 /*****************************************************************/
800 UNIV_INTERN
801 void
802 btr_cur_open_at_index_side_func(
803 /*============================*/
804  ibool from_left,
806  dict_index_t* index,
807  ulint latch_mode,
808  btr_cur_t* cursor,
809  const char* file,
810  ulint line,
811  mtr_t* mtr)
812 {
813  page_cur_t* page_cursor;
814  ulint page_no;
815  ulint space;
816  ulint zip_size;
817  ulint height;
818  ulint root_height = 0; /* remove warning */
819  rec_t* node_ptr;
820  ulint estimate;
821  ulint savepoint;
822  mem_heap_t* heap = NULL;
823  ulint offsets_[REC_OFFS_NORMAL_SIZE];
824  ulint* offsets = offsets_;
825  rec_offs_init(offsets_);
826 
827  estimate = latch_mode & BTR_ESTIMATE;
828  latch_mode = latch_mode & ~BTR_ESTIMATE;
829 
830  /* Store the position of the tree latch we push to mtr so that we
831  know how to release it when we have latched the leaf node */
832 
833  savepoint = mtr_set_savepoint(mtr);
834 
835  if (latch_mode == BTR_MODIFY_TREE) {
836  mtr_x_lock(dict_index_get_lock(index), mtr);
837  } else {
838  mtr_s_lock(dict_index_get_lock(index), mtr);
839  }
840 
841  page_cursor = btr_cur_get_page_cur(cursor);
842  cursor->index = index;
843 
844  space = dict_index_get_space(index);
845  zip_size = dict_table_zip_size(index->table);
846  page_no = dict_index_get_page(index);
847 
848  height = ULINT_UNDEFINED;
849 
850  for (;;) {
851  buf_block_t* block;
852  page_t* page;
853  block = buf_page_get_gen(space, zip_size, page_no,
854  RW_NO_LATCH, NULL, BUF_GET,
855  file, line, mtr);
856  page = buf_block_get_frame(block);
857  ut_ad(index->id == btr_page_get_index_id(page));
858 
859  block->check_index_page_at_flush = TRUE;
860 
861  if (height == ULINT_UNDEFINED) {
862  /* We are in the root node */
863 
864  height = btr_page_get_level(page, mtr);
865  root_height = height;
866  }
867 
868  if (height == 0) {
869  btr_cur_latch_leaves(page, space, zip_size, page_no,
870  latch_mode, cursor, mtr);
871 
872  /* In versions <= 3.23.52 we had forgotten to
873  release the tree latch here. If in an index scan
874  we had to scan far to find a record visible to the
875  current transaction, that could starve others
876  waiting for the tree latch. */
877 
878  if ((latch_mode != BTR_MODIFY_TREE)
879  && (latch_mode != BTR_CONT_MODIFY_TREE)) {
880 
881  /* Release the tree s-latch */
882 
883  mtr_release_s_latch_at_savepoint(
884  mtr, savepoint,
885  dict_index_get_lock(index));
886  }
887  }
888 
889  if (from_left) {
890  page_cur_set_before_first(block, page_cursor);
891  } else {
892  page_cur_set_after_last(block, page_cursor);
893  }
894 
895  if (height == 0) {
896  if (estimate) {
897  btr_cur_add_path_info(cursor, height,
898  root_height);
899  }
900 
901  break;
902  }
903 
904  ut_ad(height > 0);
905 
906  if (from_left) {
907  page_cur_move_to_next(page_cursor);
908  } else {
909  page_cur_move_to_prev(page_cursor);
910  }
911 
912  if (estimate) {
913  btr_cur_add_path_info(cursor, height, root_height);
914  }
915 
916  height--;
917 
918  node_ptr = page_cur_get_rec(page_cursor);
919  offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
920  ULINT_UNDEFINED, &heap);
921  /* Go to the child node */
922  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
923  }
924 
925  if (UNIV_LIKELY_NULL(heap)) {
926  mem_heap_free(heap);
927  }
928 }
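
Callers normally reach the _func variant through a btr_cur_open_at_index_side() wrapper macro that supplies __FILE__ and __LINE__ (the macro name is assumed here from the usual convention in btr0cur.h). A sketch of starting a full index scan at the leftmost record:

mtr_t		mtr;
btr_cur_t	cursor;

mtr_start(&mtr);
/* Position the page cursor before the first user record on the
leftmost leaf page; moving the cursor forward then yields the
smallest record in the index. */
btr_cur_open_at_index_side(TRUE /* from_left */, index,
			   BTR_SEARCH_LEAF, &cursor, &mtr);
mtr_commit(&mtr);
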
929 
930 /**********************************************************************/
932 UNIV_INTERN
933 void
934 btr_cur_open_at_rnd_pos_func(
935 /*=========================*/
936  dict_index_t* index,
937  ulint latch_mode,
938  btr_cur_t* cursor,
939  const char* file,
940  ulint line,
941  mtr_t* mtr)
942 {
943  page_cur_t* page_cursor;
944  ulint page_no;
945  ulint space;
946  ulint zip_size;
947  ulint height;
948  rec_t* node_ptr;
949  mem_heap_t* heap = NULL;
950  ulint offsets_[REC_OFFS_NORMAL_SIZE];
951  ulint* offsets = offsets_;
952  rec_offs_init(offsets_);
953 
954  if (latch_mode == BTR_MODIFY_TREE) {
955  mtr_x_lock(dict_index_get_lock(index), mtr);
956  } else {
957  mtr_s_lock(dict_index_get_lock(index), mtr);
958  }
959 
960  page_cursor = btr_cur_get_page_cur(cursor);
961  cursor->index = index;
962 
963  space = dict_index_get_space(index);
964  zip_size = dict_table_zip_size(index->table);
965  page_no = dict_index_get_page(index);
966 
967  height = ULINT_UNDEFINED;
968 
969  for (;;) {
970  buf_block_t* block;
971  page_t* page;
972 
973  block = buf_page_get_gen(space, zip_size, page_no,
974  RW_NO_LATCH, NULL, BUF_GET,
975  file, line, mtr);
976  page = buf_block_get_frame(block);
977  ut_ad(index->id == btr_page_get_index_id(page));
978 
979  if (height == ULINT_UNDEFINED) {
980  /* We are in the root node */
981 
982  height = btr_page_get_level(page, mtr);
983  }
984 
985  if (height == 0) {
986  btr_cur_latch_leaves(page, space, zip_size, page_no,
987  latch_mode, cursor, mtr);
988  }
989 
990  page_cur_open_on_rnd_user_rec(block, page_cursor);
991 
992  if (height == 0) {
993 
994  break;
995  }
996 
997  ut_ad(height > 0);
998 
999  height--;
1000 
1001  node_ptr = page_cur_get_rec(page_cursor);
1002  offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
1003  ULINT_UNDEFINED, &heap);
1004  /* Go to the child node */
1005  page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
1006  }
1007 
1008  if (UNIV_LIKELY_NULL(heap)) {
1009  mem_heap_free(heap);
1010  }
1011 }
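
This function places the cursor on a pseudo-random user record and is used, for example, when sampling leaf pages for index statistics. A sketch via the conventional wrapper macro (assumed, as above):

mtr_t		mtr;
btr_cur_t	cursor;

mtr_start(&mtr);
btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
/* btr_cur_get_rec(&cursor) is now a random user record. */
mtr_commit(&mtr);
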
1012 
1013 /*==================== B-TREE INSERT =========================*/
1014 
1015 /*************************************************************/
1021 static
1022 rec_t*
1023 btr_cur_insert_if_possible(
1024 /*=======================*/
1025  btr_cur_t* cursor,
1027  const dtuple_t* tuple,
1029  ulint n_ext,
1030  mtr_t* mtr)
1031 {
1032  page_cur_t* page_cursor;
1033  buf_block_t* block;
1034  rec_t* rec;
1035 
1036  ut_ad(dtuple_check_typed(tuple));
1037 
1038  block = btr_cur_get_block(cursor);
1039 
1040  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1041  page_cursor = btr_cur_get_page_cur(cursor);
1042 
1043  /* Now, try the insert */
1044  rec = page_cur_tuple_insert(page_cursor, tuple,
1045  cursor->index, n_ext, mtr);
1046 
1047  if (UNIV_UNLIKELY(!rec)) {
1048  /* If record did not fit, reorganize */
1049 
1050  if (btr_page_reorganize(block, cursor->index, mtr)) {
1051 
1052  page_cur_search(block, cursor->index, tuple,
1053  PAGE_CUR_LE, page_cursor);
1054 
1055  rec = page_cur_tuple_insert(page_cursor, tuple,
1056  cursor->index, n_ext, mtr);
1057  }
1058  }
1059 
1060  return(rec);
1061 }
1062 
1063 /*************************************************************/
1066 UNIV_INLINE
1067 ulint
1068 btr_cur_ins_lock_and_undo(
1069 /*======================*/
1070  ulint flags,
1073  btr_cur_t* cursor,
1074  dtuple_t* entry,
1075  que_thr_t* thr,
1076  mtr_t* mtr,
1077  ibool* inherit)
1080 {
1081  dict_index_t* index;
1082  ulint err;
1083  rec_t* rec;
1084  roll_ptr_t roll_ptr;
1085 
1086  /* Check if we have to wait for a lock: enqueue an explicit lock
1087  request if yes */
1088 
1089  rec = btr_cur_get_rec(cursor);
1090  index = cursor->index;
1091 
1092  err = lock_rec_insert_check_and_lock(flags, rec,
1093  btr_cur_get_block(cursor),
1094  index, thr, mtr, inherit);
1095 
1096  if (err != DB_SUCCESS) {
1097 
1098  return(err);
1099  }
1100 
1101  if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
1102 
1103  err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
1104  thr, index, entry,
1105  NULL, 0, NULL,
1106  &roll_ptr);
1107  if (err != DB_SUCCESS) {
1108 
1109  return(err);
1110  }
1111 
1112  /* Now we can fill in the roll ptr field in entry */
1113 
1114  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1115 
1116  row_upd_index_entry_sys_field(entry, index,
1117  DATA_ROLL_PTR, roll_ptr);
1118  }
1119  }
1120 
1121  return(DB_SUCCESS);
1122 }
1123 
1124 #ifdef UNIV_DEBUG
1125 /*************************************************************/
1127 static
1128 void
1129 btr_cur_trx_report(
1130 /*===============*/
1131  trx_t* trx,
1132  const dict_index_t* index,
1133  const char* op)
1134 {
1135  fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
1136  (ullint) trx->id);
1137  fputs(op, stderr);
1138  dict_index_name_print(stderr, trx, index);
1139  putc('\n', stderr);
1140 }
1141 #endif /* UNIV_DEBUG */
1142 
1143 /*************************************************************/
1150 UNIV_INTERN
1151 ulint
1152 btr_cur_optimistic_insert(
1153 /*======================*/
1154  ulint flags,
1157  btr_cur_t* cursor,
1159  dtuple_t* entry,
1160  rec_t** rec,
1162  big_rec_t** big_rec,
1165  ulint n_ext,
1166  que_thr_t* thr,
1167  mtr_t* mtr)
1172 {
1173  big_rec_t* big_rec_vec = NULL;
1174  dict_index_t* index;
1175  page_cur_t* page_cursor;
1176  buf_block_t* block;
1177  page_t* page;
1178  ulint max_size;
1179  rec_t* dummy_rec;
1180  ibool leaf;
1181  ibool reorg;
1182  ibool inherit;
1183  ulint zip_size;
1184  ulint rec_size;
1185  ulint err;
1186 
1187  *big_rec = NULL;
1188 
1189  block = btr_cur_get_block(cursor);
1190  page = buf_block_get_frame(block);
1191  index = cursor->index;
1192  zip_size = buf_block_get_zip_size(block);
1193 #ifdef UNIV_DEBUG_VALGRIND
1194  if (zip_size) {
1195  UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
1196  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
1197  }
1198 #endif /* UNIV_DEBUG_VALGRIND */
1199 
1200  if (!dtuple_check_typed_no_assert(entry)) {
1201  fputs("InnoDB: Error in a tuple to insert into ", stderr);
1202  dict_index_name_print(stderr, thr_get_trx(thr), index);
1203  }
1204 #ifdef UNIV_DEBUG
1205  if (btr_cur_print_record_ops && thr) {
1206  btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
1207  dtuple_print(stderr, entry);
1208  }
1209 #endif /* UNIV_DEBUG */
1210 
1211  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1212  max_size = page_get_max_insert_size_after_reorganize(page, 1);
1213  leaf = page_is_leaf(page);
1214 
1215  /* Calculate the record size when entry is converted to a record */
1216  rec_size = rec_get_converted_size(index, entry, n_ext);
1217 
1218  if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
1219  dtuple_get_n_fields(entry), zip_size)) {
1220 
1221  /* The record is so big that we have to store some fields
1222  externally on separate database pages */
1223  big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1224 
1225  if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
1226 
1227  return(DB_TOO_BIG_RECORD);
1228  }
1229 
1230  rec_size = rec_get_converted_size(index, entry, n_ext);
1231  }
1232 
1233  if (UNIV_UNLIKELY(zip_size)) {
1234  /* Estimate the free space of an empty compressed page.
1235  Subtract one byte for the encoded heap_no in the
1236  modification log. */
1237  ulint free_space_zip = page_zip_empty_size(
1238  cursor->index->n_fields, zip_size) - 1;
1239  ulint n_uniq = dict_index_get_n_unique_in_tree(index);
1240 
1241  ut_ad(dict_table_is_comp(index->table));
1242 
1243  /* There should be enough room for two node pointer
1244  records on an empty non-leaf page. This prevents
1245  infinite page splits. */
1246 
1247  if (UNIV_LIKELY(entry->n_fields >= n_uniq)
1248  && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
1249  + rec_get_converted_size_comp_prefix(
1250  index, entry->fields, n_uniq,
1251  NULL)
1252  /* On a compressed page, there is
1253  a two-byte entry in the dense
1254  page directory for every record.
1255  But there is no record header. */
1256  - (REC_N_NEW_EXTRA_BYTES - 2)
1257  > free_space_zip / 2)) {
1258 
1259  if (big_rec_vec) {
1260  dtuple_convert_back_big_rec(
1261  index, entry, big_rec_vec);
1262  }
1263 
1264  return(DB_TOO_BIG_RECORD);
1265  }
1266  }
1267 
1268  /* If there have been many consecutive inserts, and we are on the leaf
1269  level, check if we have to split the page to reserve enough free space
1270  for future updates of records. */
1271 
1272  if (dict_index_is_clust(index)
1273  && (page_get_n_recs(page) >= 2)
1274  && UNIV_LIKELY(leaf)
1275  && (dict_index_get_space_reserve() + rec_size > max_size)
1276  && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
1277  || btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
1278 fail:
1279  err = DB_FAIL;
1280 fail_err:
1281 
1282  if (big_rec_vec) {
1283  dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1284  }
1285 
1286  return(err);
1287  }
1288 
1289  if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
1290  || max_size < rec_size)
1291  && UNIV_LIKELY(page_get_n_recs(page) > 1)
1292  && page_get_max_insert_size(page, 1) < rec_size) {
1293 
1294  goto fail;
1295  }
1296 
1297  /* Check locks and write to the undo log, if specified */
1298  err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
1299  thr, mtr, &inherit);
1300 
1301  if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1302 
1303  goto fail_err;
1304  }
1305 
1306  page_cursor = btr_cur_get_page_cur(cursor);
1307 
1308  /* Now, try the insert */
1309 
1310  {
1311  const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
1312  *rec = page_cur_tuple_insert(page_cursor, entry, index,
1313  n_ext, mtr);
1314  reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
1315 
1316  if (UNIV_UNLIKELY(reorg)) {
1317  ut_a(zip_size);
1318  ut_a(*rec);
1319  }
1320  }
1321 
1322  if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
1323  /* If the record did not fit, reorganize */
1324  if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
1325  ut_a(zip_size);
1326 
1327  goto fail;
1328  }
1329 
1330  ut_ad(zip_size
1331  || page_get_max_insert_size(page, 1) == max_size);
1332 
1333  reorg = TRUE;
1334 
1335  page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);
1336 
1337  *rec = page_cur_tuple_insert(page_cursor, entry, index,
1338  n_ext, mtr);
1339 
1340  if (UNIV_UNLIKELY(!*rec)) {
1341  if (UNIV_LIKELY(zip_size != 0)) {
1342 
1343  goto fail;
1344  }
1345 
1346  fputs("InnoDB: Error: cannot insert tuple ", stderr);
1347  dtuple_print(stderr, entry);
1348  fputs(" into ", stderr);
1349  dict_index_name_print(stderr, thr_get_trx(thr), index);
1350  fprintf(stderr, "\nInnoDB: max insert size %lu\n",
1351  (ulong) max_size);
1352  ut_error;
1353  }
1354  }
1355 
1356 #ifdef BTR_CUR_HASH_ADAPT
1357  if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
1358  btr_search_update_hash_node_on_insert(cursor);
1359  } else {
1360  btr_search_update_hash_on_insert(cursor);
1361  }
1362 #endif
1363 
1364  if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
1365 
1366  lock_update_insert(block, *rec);
1367  }
1368 
1369 #if 0
1370  fprintf(stderr, "Insert into page %lu, max ins size %lu,"
1371  " rec %lu ind type %lu\n",
1372  buf_block_get_page_no(block), max_size,
1373  rec_size + PAGE_DIR_SLOT_SIZE, index->type);
1374 #endif
1375  if (leaf && !dict_index_is_clust(index)) {
1376  /* Update the free bits of the B-tree page in the
1377  insert buffer bitmap. */
1378 
1379  /* The free bits in the insert buffer bitmap must
1380  never exceed the free space on a page. It is safe to
1381  decrement or reset the bits in the bitmap in a
1382  mini-transaction that is committed before the
1383  mini-transaction that affects the free space. */
1384 
1385  /* It is unsafe to increment the bits in a separately
1386  committed mini-transaction, because in crash recovery,
1387  the free bits could momentarily be set too high. */
1388 
1389  if (zip_size) {
1390  /* Update the bits in the same mini-transaction. */
1391  ibuf_update_free_bits_zip(block, mtr);
1392  } else {
1393  /* Decrement the bits in a separate
1394  mini-transaction. */
1395  ibuf_update_free_bits_if_full(
1396  block, max_size,
1397  rec_size + PAGE_DIR_SLOT_SIZE);
1398  }
1399  }
1400 
1401  *big_rec = big_rec_vec;
1402 
1403  return(DB_SUCCESS);
1404 }
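
The intended calling pattern is optimistic-first: try this function under leaf latches and fall back to the pessimistic variant on DB_FAIL, exactly as btr_cur_pessimistic_insert() below does internally. A sketch:

big_rec_t*	big_rec;
rec_t*		rec;
ulint		err;

err = btr_cur_optimistic_insert(0 /* no special flags */, cursor,
				entry, &rec, &big_rec,
				0 /* n_ext */, thr, &mtr);
if (err == DB_FAIL) {
	/* The page was too full: restart the operation with
	BTR_MODIFY_TREE latches and call
	btr_cur_pessimistic_insert(), which may split pages. */
}
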
1405 
1406 /*************************************************************/
1412 UNIV_INTERN
1413 ulint
1414 btr_cur_pessimistic_insert(
1415 /*=======================*/
1416  ulint flags,
1422  btr_cur_t* cursor,
1424  dtuple_t* entry,
1425  rec_t** rec,
1427  big_rec_t** big_rec,
1430  ulint n_ext,
1431  que_thr_t* thr,
1432  mtr_t* mtr)
1433 {
1434  dict_index_t* index = cursor->index;
1435  ulint zip_size = dict_table_zip_size(index->table);
1436  big_rec_t* big_rec_vec = NULL;
1437  mem_heap_t* heap = NULL;
1438  ulint err;
1439  ibool dummy_inh;
1440  ibool success;
1441  ulint n_extents = 0;
1442  ulint n_reserved;
1443 
1444  ut_ad(dtuple_check_typed(entry));
1445 
1446  *big_rec = NULL;
1447 
1448  ut_ad(mtr_memo_contains(mtr,
1449  dict_index_get_lock(index),
1450  MTR_MEMO_X_LOCK));
1451  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
1452  MTR_MEMO_PAGE_X_FIX));
1453 
1454  /* Try first an optimistic insert; reset the cursor flag: we do not
1455  assume anything of how it was positioned */
1456 
1457  cursor->flag = BTR_CUR_BINARY;
1458 
1459  err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
1460  big_rec, n_ext, thr, mtr);
1461  if (err != DB_FAIL) {
1462 
1463  return(err);
1464  }
1465 
1466  /* Retry with a pessimistic insert. Check locks and write to undo log,
1467  if specified */
1468 
1469  err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
1470  thr, mtr, &dummy_inh);
1471 
1472  if (err != DB_SUCCESS) {
1473 
1474  return(err);
1475  }
1476 
1477  if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
1478  /* First reserve enough free space for the file segments
1479  of the index tree, so that the insert will not fail because
1480  of lack of space */
1481 
1482  n_extents = cursor->tree_height / 16 + 3;
1483 
1484  success = fsp_reserve_free_extents(&n_reserved, index->space,
1485  n_extents, FSP_NORMAL, mtr);
1486  if (!success) {
1487  return(DB_OUT_OF_FILE_SPACE);
1488  }
1489  }
1490 
1491  if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
1492  dict_table_is_comp(index->table),
1493  dict_index_get_n_fields(index),
1494  zip_size)) {
1495  /* The record is so big that we have to store some fields
1496  externally on separate database pages */
1497 
1498  if (UNIV_LIKELY_NULL(big_rec_vec)) {
1499  /* This should never happen, but we handle
1500  the situation in a robust manner. */
1501  ut_ad(0);
1502  dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1503  }
1504 
1505  big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1506 
1507  if (big_rec_vec == NULL) {
1508 
1509  if (n_extents > 0) {
1510  fil_space_release_free_extents(index->space,
1511  n_reserved);
1512  }
1513  return(DB_TOO_BIG_RECORD);
1514  }
1515  }
1516 
1517  if (dict_index_get_page(index)
1518  == buf_block_get_page_no(btr_cur_get_block(cursor))) {
1519 
1520  /* The page is the root page */
1521  *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
1522  } else {
1523  *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
1524  }
1525 
1526  if (UNIV_LIKELY_NULL(heap)) {
1527  mem_heap_free(heap);
1528  }
1529 
1530  ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
1531 
1532 #ifdef BTR_CUR_ADAPT
1533  btr_search_update_hash_on_insert(cursor);
1534 #endif
1535  if (!(flags & BTR_NO_LOCKING_FLAG)) {
1536 
1537  lock_update_insert(btr_cur_get_block(cursor), *rec);
1538  }
1539 
1540  if (n_extents > 0) {
1541  fil_space_release_free_extents(index->space, n_reserved);
1542  }
1543 
1544  *big_rec = big_rec_vec;
1545 
1546  return(DB_SUCCESS);
1547 }
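
When either insert routine returns DB_SUCCESS with *big_rec != NULL, the record was inserted with placeholder field references and the caller must still write the long fields to external BLOB pages and free the vector. A sketch (function names and argument order assumed from this InnoDB version):

if (big_rec != NULL) {
	/* offsets must describe the just-inserted record. */
	err = btr_store_big_rec_extern_fields(
		index, btr_cur_get_block(cursor),
		btr_cur_get_rec(cursor), offsets, big_rec, &mtr);

	dtuple_big_rec_free(big_rec);
}
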
1548 
1549 /*==================== B-TREE UPDATE =========================*/
1550 
1551 /*************************************************************/
1554 UNIV_INLINE
1555 ulint
1556 btr_cur_upd_lock_and_undo(
1557 /*======================*/
1558  ulint flags,
1559  btr_cur_t* cursor,
1560  const upd_t* update,
1561  ulint cmpl_info,
1563  que_thr_t* thr,
1564  mtr_t* mtr,
1565  roll_ptr_t* roll_ptr)
1566 {
1567  dict_index_t* index;
1568  rec_t* rec;
1569  ulint err;
1570 
1571  ut_ad(cursor && update && thr && roll_ptr);
1572 
1573  rec = btr_cur_get_rec(cursor);
1574  index = cursor->index;
1575 
1576  if (!dict_index_is_clust(index)) {
1577  /* We do undo logging only when we update a clustered index
1578  record */
1579  return(lock_sec_rec_modify_check_and_lock(
1580  flags, btr_cur_get_block(cursor), rec,
1581  index, thr, mtr));
1582  }
1583 
1584  /* Check if we have to wait for a lock: enqueue an explicit lock
1585  request if yes */
1586 
1587  err = DB_SUCCESS;
1588 
1589  if (!(flags & BTR_NO_LOCKING_FLAG)) {
1590  mem_heap_t* heap = NULL;
1591  ulint offsets_[REC_OFFS_NORMAL_SIZE];
1592  rec_offs_init(offsets_);
1593 
1594  err = lock_clust_rec_modify_check_and_lock(
1595  flags, btr_cur_get_block(cursor), rec, index,
1596  rec_get_offsets(rec, index, offsets_,
1597  ULINT_UNDEFINED, &heap), thr);
1598  if (UNIV_LIKELY_NULL(heap)) {
1599  mem_heap_free(heap);
1600  }
1601  if (err != DB_SUCCESS) {
1602 
1603  return(err);
1604  }
1605  }
1606 
1607  /* Append the info about the update in the undo log */
1608 
1609  err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
1610  index, NULL, update,
1611  cmpl_info, rec, roll_ptr);
1612  return(err);
1613 }
1614 
1615 /***********************************************************/
1617 UNIV_INLINE
1618 void
1619 btr_cur_update_in_place_log(
1620 /*========================*/
1621  ulint flags,
1622  rec_t* rec,
1623  dict_index_t* index,
1624  const upd_t* update,
1625  trx_t* trx,
1626  roll_ptr_t roll_ptr,
1627  mtr_t* mtr)
1628 {
1629  byte* log_ptr;
1630  page_t* page = page_align(rec);
1631  ut_ad(flags < 256);
1632  ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
1633 
1634  log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
1635  ? MLOG_COMP_REC_UPDATE_IN_PLACE
1636  : MLOG_REC_UPDATE_IN_PLACE,
1637  1 + DATA_ROLL_PTR_LEN + 14 + 2
1638  + MLOG_BUF_MARGIN);
1639 
1640  if (!log_ptr) {
1641  /* Logging in mtr is switched off during crash recovery */
1642  return;
1643  }
1644 
1645  /* The code below assumes index is a clustered index: change index to
1646  the clustered index if we are updating a secondary index record (or we
1647  could as well skip writing the sys col values to the log in this case
1648  because they are not needed for a secondary index record update) */
1649 
1650  index = dict_table_get_first_index(index->table);
1651 
1652  mach_write_to_1(log_ptr, flags);
1653  log_ptr++;
1654 
1655  log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
1656  mtr);
1657  mach_write_to_2(log_ptr, page_offset(rec));
1658  log_ptr += 2;
1659 
1660  row_upd_index_write_log(update, log_ptr, mtr);
1661 }
1662 #endif /* UNIV_HOTBACKUP */
1663 
1664 /***********************************************************/
1667 UNIV_INTERN
1668 byte*
1669 btr_cur_parse_update_in_place(
1670 /*==========================*/
1671  byte* ptr,
1672  byte* end_ptr,
1673  page_t* page,
1674  page_zip_des_t* page_zip,
1675  dict_index_t* index)
1676 {
1677  ulint flags;
1678  rec_t* rec;
1679  upd_t* update;
1680  ulint pos;
1681  trx_id_t trx_id;
1682  roll_ptr_t roll_ptr;
1683  ulint rec_offset;
1684  mem_heap_t* heap;
1685  ulint* offsets;
1686 
1687  if (end_ptr < ptr + 1) {
1688 
1689  return(NULL);
1690  }
1691 
1692  flags = mach_read_from_1(ptr);
1693  ptr++;
1694 
1695  ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
1696 
1697  if (ptr == NULL) {
1698 
1699  return(NULL);
1700  }
1701 
1702  if (end_ptr < ptr + 2) {
1703 
1704  return(NULL);
1705  }
1706 
1707  rec_offset = mach_read_from_2(ptr);
1708  ptr += 2;
1709 
1710  ut_a(rec_offset <= UNIV_PAGE_SIZE);
1711 
1712  heap = mem_heap_create(256);
1713 
1714  ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
1715 
1716  if (!ptr || !page) {
1717 
1718  goto func_exit;
1719  }
1720 
1721  ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
1722  rec = page + rec_offset;
1723 
1724  /* We do not need to reserve btr_search_latch, as the page is only
1725  being recovered, and there cannot be a hash index to it. */
1726 
1727  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
1728 
1729  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1730  row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
1731  pos, trx_id, roll_ptr);
1732  }
1733 
1734  row_upd_rec_in_place(rec, index, offsets, update, page_zip);
1735 
1736 func_exit:
1737  mem_heap_free(heap);
1738 
1739  return(ptr);
1740 }
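
The redo record body consumed above is, in order: one flags byte, the system-column values (position of DB_TRX_ID, trx id, and roll ptr, parsed by row_upd_parse_sys_vals()), a two-byte record offset within the page, and the update vector (row_upd_index_parse()). A recovery-side sketch of applying it:

/* Documentation sketch: apply one update-in-place log record body. */
ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page, page_zip,
				    index);
if (ptr == NULL) {
	/* The log record is incomplete; more log must be read
	before it can be parsed. */
}
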
1741 
1742 #ifndef UNIV_HOTBACKUP
1743 /*************************************************************/
1747 UNIV_INTERN
1748 ibool
1749 btr_cur_update_alloc_zip(
1750 /*=====================*/
1751  page_zip_des_t* page_zip,
1752  buf_block_t* block,
1753  dict_index_t* index,
1754  ulint length,
1755  ibool create,
1757  mtr_t* mtr)
1758 {
1759  ut_a(page_zip == buf_block_get_page_zip(block));
1760  ut_ad(page_zip);
1761  ut_ad(!dict_index_is_ibuf(index));
1762 
1763  if (page_zip_available(page_zip, dict_index_is_clust(index),
1764  length, create)) {
1765  return(TRUE);
1766  }
1767 
1768  if (!page_zip->m_nonempty) {
1769  /* The page has been freshly compressed, so
1770  recompressing it will not help. */
1771  return(FALSE);
1772  }
1773 
1774  if (!page_zip_compress(page_zip, buf_block_get_frame(block),
1775  index, mtr)) {
1776  /* Unable to compress the page */
1777  return(FALSE);
1778  }
1779 
1780  /* After recompressing a page, we must make sure that the free
1781  bits in the insert buffer bitmap will not exceed the free
1782  space on the page. Because this function will not attempt
1783  recompression unless page_zip_available() fails above, it is
1784  safe to reset the free bits if page_zip_available() fails
1785  again, below. The free bits can safely be reset in a separate
1786  mini-transaction. If page_zip_available() succeeds below, we
1787  can be sure that the page_zip_compress() above did not reduce
1788  the free space available on the page. */
1789 
1790  if (!page_zip_available(page_zip, dict_index_is_clust(index),
1791  length, create)) {
1792  /* Out of space: reset the free bits. */
1793  if (!dict_index_is_clust(index)
1794  && page_is_leaf(buf_block_get_frame(block))) {
1795  ibuf_reset_free_bits(block);
1796  }
1797  return(FALSE);
1798  }
1799 
1800  return(TRUE);
1801 }
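
Both update paths below call this before modifying a record on a compressed page; when it returns FALSE the optimistic path gives up with DB_ZIP_OVERFLOW so that a pessimistic retry can split the page. The guard looks like this sketch:

if (page_zip
    && !btr_cur_update_alloc_zip(page_zip, block, index,
				 rec_size, FALSE, mtr)) {

	return(DB_ZIP_OVERFLOW);
}
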
1802 
1803 /*************************************************************/
1807 UNIV_INTERN
1808 ulint
1809 btr_cur_update_in_place(
1810 /*====================*/
1811  ulint flags,
1812  btr_cur_t* cursor,
1815  const upd_t* update,
1816  ulint cmpl_info,
1818  que_thr_t* thr,
1819  mtr_t* mtr)
1821 {
1822  dict_index_t* index;
1823  buf_block_t* block;
1824  page_zip_des_t* page_zip;
1825  ulint err;
1826  rec_t* rec;
1827  roll_ptr_t roll_ptr = 0;
1828  trx_t* trx;
1829  ulint was_delete_marked;
1830  mem_heap_t* heap = NULL;
1831  ulint offsets_[REC_OFFS_NORMAL_SIZE];
1832  ulint* offsets = offsets_;
1833  rec_offs_init(offsets_);
1834 
1835  rec = btr_cur_get_rec(cursor);
1836  index = cursor->index;
1837  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
1838  /* The insert buffer tree should never be updated in place. */
1839  ut_ad(!dict_index_is_ibuf(index));
1840 
1841  trx = thr_get_trx(thr);
1842  offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1843 #ifdef UNIV_DEBUG
1844  if (btr_cur_print_record_ops && thr) {
1845  btr_cur_trx_report(trx, index, "update ");
1846  rec_print_new(stderr, rec, offsets);
1847  }
1848 #endif /* UNIV_DEBUG */
1849 
1850  block = btr_cur_get_block(cursor);
1851  page_zip = buf_block_get_page_zip(block);
1852 
1853  /* Check that enough space is available on the compressed page. */
1854  if (UNIV_LIKELY_NULL(page_zip)
1855  && !btr_cur_update_alloc_zip(page_zip, block, index,
1856  rec_offs_size(offsets), FALSE, mtr)) {
1857  return(DB_ZIP_OVERFLOW);
1858  }
1859 
1860  /* Do lock checking and undo logging */
1861  err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
1862  thr, mtr, &roll_ptr);
1863  if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1864 
1865  if (UNIV_LIKELY_NULL(heap)) {
1866  mem_heap_free(heap);
1867  }
1868  return(err);
1869  }
1870 
1871  if (block->is_hashed) {
1872  /* The function row_upd_changes_ord_field_binary works only
1873  if the update vector was built for a clustered index, we must
1874  NOT call it if index is secondary */
1875 
1876  if (!dict_index_is_clust(index)
1877  || row_upd_changes_ord_field_binary(NULL, index, update)) {
1878 
1879  /* Remove possible hash index pointer to this record */
1880  btr_search_update_hash_on_delete(cursor);
1881  }
1882 
1883  rw_lock_x_lock(&btr_search_latch);
1884  }
1885 
1886  if (!(flags & BTR_KEEP_SYS_FLAG)) {
1887  row_upd_rec_sys_fields(rec, NULL,
1888  index, offsets, trx, roll_ptr);
1889  }
1890 
1891  was_delete_marked = rec_get_deleted_flag(
1892  rec, page_is_comp(buf_block_get_frame(block)));
1893 
1894  row_upd_rec_in_place(rec, index, offsets, update, page_zip);
1895 
1896  if (block->is_hashed) {
1897  rw_lock_x_unlock(&btr_search_latch);
1898  }
1899 
1900  if (page_zip && !dict_index_is_clust(index)
1901  && page_is_leaf(buf_block_get_frame(block))) {
1902  /* Update the free bits in the insert buffer. */
1903  ibuf_update_free_bits_zip(block, mtr);
1904  }
1905 
1906  btr_cur_update_in_place_log(flags, rec, index, update,
1907  trx, roll_ptr, mtr);
1908 
1909  if (was_delete_marked
1910  && !rec_get_deleted_flag(rec, page_is_comp(
1911  buf_block_get_frame(block)))) {
1912  /* The new updated record owns its possible externally
1913  stored fields */
1914 
1915  btr_cur_unmark_extern_fields(page_zip,
1916  rec, index, offsets, mtr);
1917  }
1918 
1919  if (UNIV_LIKELY_NULL(heap)) {
1920  mem_heap_free(heap);
1921  }
1922  return(DB_SUCCESS);
1923 }
1924 
1925 /*************************************************************/
1934 UNIV_INTERN
1935 ulint
1936 btr_cur_optimistic_update(
1937 /*======================*/
1938  ulint flags,
1939  btr_cur_t* cursor,
1942  const upd_t* update,
1944  ulint cmpl_info,
1946  que_thr_t* thr,
1947  mtr_t* mtr)
1949 {
1950  dict_index_t* index;
1951  page_cur_t* page_cursor;
1952  ulint err;
1953  buf_block_t* block;
1954  page_t* page;
1955  page_zip_des_t* page_zip;
1956  rec_t* rec;
1957  ulint max_size;
1958  ulint new_rec_size;
1959  ulint old_rec_size;
1960  dtuple_t* new_entry;
1961  roll_ptr_t roll_ptr;
1962  trx_t* trx;
1963  mem_heap_t* heap;
1964  ulint i;
1965  ulint n_ext;
1966  ulint* offsets;
1967 
1968  block = btr_cur_get_block(cursor);
1969  page = buf_block_get_frame(block);
1970  rec = btr_cur_get_rec(cursor);
1971  index = cursor->index;
1972  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
1973  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1974  /* The insert buffer tree should never be updated in place. */
1975  ut_ad(!dict_index_is_ibuf(index));
1976 
1977  heap = mem_heap_create(1024);
1978  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
1979 
1980 #ifdef UNIV_DEBUG
1981  if (btr_cur_print_record_ops && thr) {
1982  btr_cur_trx_report(thr_get_trx(thr), index, "update ");
1983  rec_print_new(stderr, rec, offsets);
1984  }
1985 #endif /* UNIV_DEBUG */
1986 
1987  if (!row_upd_changes_field_size_or_external(index, offsets, update)) {
1988 
1989  /* The simplest and the most common case: the update does not
1990  change the size of any field and none of the updated fields is
1991  externally stored in rec or update, and there is enough space
1992  on the compressed page to log the update. */
1993 
1994  mem_heap_free(heap);
1995  return(btr_cur_update_in_place(flags, cursor, update,
1996  cmpl_info, thr, mtr));
1997  }
1998 
1999  if (rec_offs_any_extern(offsets)) {
2000 any_extern:
2001  /* Externally stored fields are treated in pessimistic
2002  update */
2003 
2004  mem_heap_free(heap);
2005  return(DB_OVERFLOW);
2006  }
2007 
2008  for (i = 0; i < upd_get_n_fields(update); i++) {
2009  if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
2010 
2011  goto any_extern;
2012  }
2013  }
2014 
2015  page_cursor = btr_cur_get_page_cur(cursor);
2016 
2017  new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
2018  &n_ext, heap);
2019  /* We checked above that there are no externally stored fields. */
2020  ut_a(!n_ext);
2021 
2022  /* The page containing the clustered index record
2023  corresponding to new_entry is latched in mtr.
2024  Thus the following call is safe. */
2025  row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
2026  FALSE, heap);
2027  old_rec_size = rec_offs_size(offsets);
2028  new_rec_size = rec_get_converted_size(index, new_entry, 0);
2029 
2030  page_zip = buf_block_get_page_zip(block);
2031 #ifdef UNIV_ZIP_DEBUG
2032  ut_a(!page_zip || page_zip_validate(page_zip, page));
2033 #endif /* UNIV_ZIP_DEBUG */
2034 
2035  if (UNIV_LIKELY_NULL(page_zip)
2036  && !btr_cur_update_alloc_zip(page_zip, block, index,
2037  new_rec_size, TRUE, mtr)) {
2038  err = DB_ZIP_OVERFLOW;
2039  goto err_exit;
2040  }
2041 
2042  if (UNIV_UNLIKELY(new_rec_size
2043  >= (page_get_free_space_of_empty(page_is_comp(page))
2044  / 2))) {
2045 
2046  err = DB_OVERFLOW;
2047  goto err_exit;
2048  }
2049 
2050  if (UNIV_UNLIKELY(page_get_data_size(page)
2051  - old_rec_size + new_rec_size
2052  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
2053 
2054  /* The page would become too empty */
2055 
2056  err = DB_UNDERFLOW;
2057  goto err_exit;
2058  }
2059 
2060  max_size = old_rec_size
2061  + page_get_max_insert_size_after_reorganize(page, 1);
2062 
2063  if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
2064  && (max_size >= new_rec_size))
2065  || (page_get_n_recs(page) <= 1))) {
2066 
2067  /* There was not enough space, or it did not pay to
2068  reorganize: for simplicity, we decide what to do assuming a
2069  reorganization is needed, though it might not be necessary */
2070 
2071  err = DB_OVERFLOW;
2072  goto err_exit;
2073  }
2074 
2075  /* Do lock checking and undo logging */
2076  err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
2077  thr, mtr, &roll_ptr);
2078  if (err != DB_SUCCESS) {
2079 
2080  goto err_exit;
2081  }
2082 
2083  /* Ok, we may do the replacement. Store on the page infimum the
2084  explicit locks on rec, before deleting rec (see the comment in
2085  btr_cur_pessimistic_update). */
2086 
2087  lock_rec_store_on_page_infimum(block, rec);
2088 
2089  btr_search_update_hash_on_delete(cursor);
2090 
2091  /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
2092  invokes rec_offs_make_valid() to point to the copied record that
2093  the fields of new_entry point to. We have to undo it here. */
2094  ut_ad(rec_offs_validate(NULL, index, offsets));
2095  rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);
2096 
2097  page_cur_delete_rec(page_cursor, index, offsets, mtr);
2098 
2099  page_cur_move_to_prev(page_cursor);
2100 
2101  trx = thr_get_trx(thr);
2102 
2103  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2104  row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
2105  roll_ptr);
2106  row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
2107  trx->id);
2108  }
2109 
2110  /* There are no externally stored columns in new_entry */
2111  rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
2112  ut_a(rec); /* <- We calculated above the insert would fit */
2113 
2114  if (page_zip && !dict_index_is_clust(index)
2115  && page_is_leaf(page)) {
2116  /* Update the free bits in the insert buffer. */
2117  ibuf_update_free_bits_zip(block, mtr);
2118  }
2119 
2120  /* Restore the old explicit lock state on the record */
2121 
2122  lock_rec_restore_from_page_infimum(block, rec, block);
2123 
2124  page_cur_move_to_next(page_cursor);
2125 
2126  err = DB_SUCCESS;
2127 err_exit:
2128  mem_heap_free(heap);
2129  return(err);
2130 }
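
A caller of the optimistic update treats the three overflow/underflow codes as "retry pessimistically", mirroring the switch at the top of btr_cur_pessimistic_update() below. A sketch:

err = btr_cur_optimistic_update(flags, cursor, update,
				cmpl_info, thr, &mtr);
switch (err) {
case DB_UNDERFLOW:
case DB_OVERFLOW:
case DB_ZIP_OVERFLOW:
	/* Restart with BTR_MODIFY_TREE latches and call
	btr_cur_pessimistic_update(). */
	break;
default:
	break;
}
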
2131 
2132 /*************************************************************/
2138 static
2139 void
2140 btr_cur_pess_upd_restore_supremum(
2141 /*==============================*/
2142  buf_block_t* block,
2143  const rec_t* rec,
2144  mtr_t* mtr)
2145 {
2146  page_t* page;
2147  buf_block_t* prev_block;
2148  ulint space;
2149  ulint zip_size;
2150  ulint prev_page_no;
2151 
2152  page = buf_block_get_frame(block);
2153 
2154  if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
2155  /* Updated record is not the first user record on its page */
2156 
2157  return;
2158  }
2159 
2160  space = buf_block_get_space(block);
2161  zip_size = buf_block_get_zip_size(block);
2162  prev_page_no = btr_page_get_prev(page, mtr);
2163 
2164  ut_ad(prev_page_no != FIL_NULL);
2165  prev_block = buf_page_get_with_no_latch(space, zip_size,
2166  prev_page_no, mtr);
2167 #ifdef UNIV_BTR_DEBUG
2168  ut_a(btr_page_get_next(prev_block->frame, mtr)
2169  == page_get_page_no(page));
2170 #endif /* UNIV_BTR_DEBUG */
2171 
2172  /* We must already have an x-latch on prev_block! */
2173  ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
2174 
2175  lock_rec_reset_and_inherit_gap_locks(prev_block, block,
2176  PAGE_HEAP_NO_SUPREMUM,
2177  page_rec_get_heap_no(rec));
2178 }
2179 
2180 /*************************************************************/
2187 UNIV_INTERN
2188 ulint
2189 btr_cur_pessimistic_update(
2190 /*=======================*/
2191  ulint flags,
2193  btr_cur_t* cursor,
2194  mem_heap_t** heap,
2195  big_rec_t** big_rec,
2197  const upd_t* update,
2200  ulint cmpl_info,
2202  que_thr_t* thr,
2203  mtr_t* mtr)
2205 {
2206  big_rec_t* big_rec_vec = NULL;
2207  big_rec_t* dummy_big_rec;
2208  dict_index_t* index;
2209  buf_block_t* block;
2210  page_t* page;
2211  page_zip_des_t* page_zip;
2212  rec_t* rec;
2213  page_cur_t* page_cursor;
2214  dtuple_t* new_entry;
2215  ulint err;
2216  ulint optim_err;
2217  roll_ptr_t roll_ptr;
2218  trx_t* trx;
2219  ibool was_first;
2220  ulint n_extents = 0;
2221  ulint n_reserved;
2222  ulint n_ext;
2223  ulint* offsets = NULL;
2224 
2225  *big_rec = NULL;
2226 
2227  block = btr_cur_get_block(cursor);
2228  page = buf_block_get_frame(block);
2229  page_zip = buf_block_get_page_zip(block);
2230  rec = btr_cur_get_rec(cursor);
2231  index = cursor->index;
2232 
2233  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2234  MTR_MEMO_X_LOCK));
2235  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2236 #ifdef UNIV_ZIP_DEBUG
2237  ut_a(!page_zip || page_zip_validate(page_zip, page));
2238 #endif /* UNIV_ZIP_DEBUG */
2239  /* The insert buffer tree should never be updated in place. */
2240  ut_ad(!dict_index_is_ibuf(index));
2241 
2242  optim_err = btr_cur_optimistic_update(flags, cursor, update,
2243  cmpl_info, thr, mtr);
2244 
2245  switch (optim_err) {
2246  case DB_UNDERFLOW:
2247  case DB_OVERFLOW:
2248  case DB_ZIP_OVERFLOW:
2249  break;
2250  default:
2251  return(optim_err);
2252  }
2253 
2254  /* Do lock checking and undo logging */
2255  err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
2256  thr, mtr, &roll_ptr);
2257  if (err != DB_SUCCESS) {
2258 
2259  return(err);
2260  }
2261 
2262  if (optim_err == DB_OVERFLOW) {
2263  ulint reserve_flag;
2264 
2265  /* First reserve enough free space for the file segments
2266  of the index tree, so that the update will not fail because
2267  of lack of space */
2268 
2269  n_extents = cursor->tree_height / 16 + 3;
2270 
2271  if (flags & BTR_NO_UNDO_LOG_FLAG) {
2272  reserve_flag = FSP_CLEANING;
2273  } else {
2274  reserve_flag = FSP_NORMAL;
2275  }
2276 
2277  if (!fsp_reserve_free_extents(&n_reserved, index->space,
2278  n_extents, reserve_flag, mtr)) {
2279  return(DB_OUT_OF_FILE_SPACE);
2280  }
2281  }
2282 
2283  if (!*heap) {
2284  *heap = mem_heap_create(1024);
2285  }
2286  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
2287 
2288  trx = thr_get_trx(thr);
2289 
2290  new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
2291  &n_ext, *heap);
2292  /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
2293  invokes rec_offs_make_valid() to point to the copied record that
2294  the fields of new_entry point to. We have to undo it here. */
2295  ut_ad(rec_offs_validate(NULL, index, offsets));
2296  rec_offs_make_valid(rec, index, offsets);
2297 
2298  /* The page containing the clustered index record
2299  corresponding to new_entry is latched in mtr. If the
2300  clustered index record is delete-marked, then its externally
2301  stored fields cannot have been purged yet, because then the
2302  purge would also have removed the clustered index record
2303  itself. Thus the following call is safe. */
2304  row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
2305  FALSE, *heap);
2306  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2307  row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
2308  roll_ptr);
2309  row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
2310  trx->id);
2311  }
2312 
2313  if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
2314  /* We are in a transaction rollback undoing a row
2315  update: we must free possible externally stored fields
2316  which got new values in the update, if they are not
2317  inherited values. They can be inherited if we have
2318  updated the primary key to another value, and then
2319  update it back again. */
2320 
2321  ut_ad(big_rec_vec == NULL);
2322 
2323  btr_rec_free_updated_extern_fields(
2324  index, rec, page_zip, offsets, update,
2325  trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
2326  }
2327 
2328  /* We have to set appropriate extern storage bits in the new
2329  record to be inserted: we have to remember which fields were such */
2330 
2331  ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
2332  offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
2333  n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
2334 
2335  if (UNIV_LIKELY_NULL(page_zip)) {
2336  ut_ad(page_is_comp(page));
2337  if (page_zip_rec_needs_ext(
2338  rec_get_converted_size(index, new_entry, n_ext),
2339  TRUE,
2340  dict_index_get_n_fields(index),
2341  page_zip_get_size(page_zip))) {
2342 
2343  goto make_external;
2344  }
2345  } else if (page_zip_rec_needs_ext(
2346  rec_get_converted_size(index, new_entry, n_ext),
2347  page_is_comp(page), 0, 0)) {
2348 make_external:
2349  big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
2350  if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
2351 
2352  err = DB_TOO_BIG_RECORD;
2353  goto return_after_reservations;
2354  }
2355  }
2356 
2357  /* Store state of explicit locks on rec on the page infimum record,
2358  before deleting rec. The page infimum acts as a dummy carrier of the
2359  locks, taking care also of lock releases, before we can move the locks
2360  back on the actual record. There is a special case: if we are
2361  inserting on the root page and the insert causes a call of
2362  btr_root_raise_and_insert. Therefore we cannot in the lock system
2363  delete the lock structs set on the root page even if the root
2364  page carries just node pointers. */
2365 
2366  lock_rec_store_on_page_infimum(block, rec);
2367 
2368  btr_search_update_hash_on_delete(cursor);
2369 
2370 #ifdef UNIV_ZIP_DEBUG
2371  ut_a(!page_zip || page_zip_validate(page_zip, page));
2372 #endif /* UNIV_ZIP_DEBUG */
2373  page_cursor = btr_cur_get_page_cur(cursor);
2374 
2375  page_cur_delete_rec(page_cursor, index, offsets, mtr);
2376 
2377  page_cur_move_to_prev(page_cursor);
2378 
2379  rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
2380 
2381  if (rec) {
2382  lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2383  rec, block);
2384 
2385  offsets = rec_get_offsets(rec, index, offsets,
2386  ULINT_UNDEFINED, heap);
2387 
2388  if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
2389  /* The new inserted record owns its possible externally
2390  stored fields */
2391  btr_cur_unmark_extern_fields(page_zip,
2392  rec, index, offsets, mtr);
2393  }
2394 
2395  btr_cur_compress_if_useful(cursor, mtr);
2396 
2397  if (page_zip && !dict_index_is_clust(index)
2398  && page_is_leaf(page)) {
2399  /* Update the free bits in the insert buffer. */
2400  ibuf_update_free_bits_zip(block, mtr);
2401  }
2402 
2403  err = DB_SUCCESS;
2404  goto return_after_reservations;
2405  } else {
2406  ut_a(optim_err != DB_UNDERFLOW);
2407 
2408  /* Out of space: reset the free bits. */
2409  if (!dict_index_is_clust(index)
2410  && page_is_leaf(page)) {
2411  ibuf_reset_free_bits(block);
2412  }
2413  }
2414 
2415  /* Was the record to be updated positioned as the first user
2416  record on its page? */
2417  was_first = page_cur_is_before_first(page_cursor);
2418 
2419  /* The first parameter means that no lock checking and undo logging
2420  is made in the insert */
2421 
2422  err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
2423  | BTR_NO_LOCKING_FLAG
2424  | BTR_KEEP_SYS_FLAG,
2425  cursor, new_entry, &rec,
2426  &dummy_big_rec, n_ext, NULL, mtr);
2427  ut_a(rec);
2428  ut_a(err == DB_SUCCESS);
2429  ut_a(dummy_big_rec == NULL);
2430 
2431  if (dict_index_is_sec_or_ibuf(index)) {
2432  /* Update PAGE_MAX_TRX_ID in the index page header.
2433  It was not updated by btr_cur_pessimistic_insert()
2434  because of BTR_NO_LOCKING_FLAG. */
2435  buf_block_t* rec_block;
2436 
2437  rec_block = btr_cur_get_block(cursor);
2438 
2439  page_update_max_trx_id(rec_block,
2440  buf_block_get_page_zip(rec_block),
2441  trx->id, mtr);
2442  }
2443 
2444  if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
2445  /* The new inserted record owns its possible externally
2446  stored fields */
2447  buf_block_t* rec_block = btr_cur_get_block(cursor);
2448 
2449 #ifdef UNIV_ZIP_DEBUG
2450  ut_a(!page_zip || page_zip_validate(page_zip, page));
2451  page = buf_block_get_frame(rec_block);
2452 #endif /* UNIV_ZIP_DEBUG */
2453  page_zip = buf_block_get_page_zip(rec_block);
2454 
2455  offsets = rec_get_offsets(rec, index, offsets,
2456  ULINT_UNDEFINED, heap);
2457  btr_cur_unmark_extern_fields(page_zip,
2458  rec, index, offsets, mtr);
2459  }
2460 
2461  lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2462  rec, block);
2463 
2464  /* If necessary, restore also the correct lock state for a new,
2465  preceding supremum record created in a page split. While the old
2466  record was nonexistent, the supremum might have inherited its locks
2467  from a wrong record. */
2468 
2469  if (!was_first) {
2470  btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
2471  rec, mtr);
2472  }
2473 
2474 return_after_reservations:
2475 #ifdef UNIV_ZIP_DEBUG
2476  ut_a(!page_zip || page_zip_validate(page_zip, page));
2477 #endif /* UNIV_ZIP_DEBUG */
2478 
2479  if (n_extents > 0) {
2480  fil_space_release_free_extents(index->space, n_reserved);
2481  }
2482 
2483  *big_rec = big_rec_vec;
2484 
2485  return(err);
2486 }
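The reservation above follows a simple heuristic: one extent per 16 levels of tree height plus a fixed margin of three, so that the page splits triggered by re-inserting the enlarged record cannot run out of file space midway; btr_cur_pessimistic_delete() further below uses the milder tree_height / 32 + 1. A standalone sketch of the arithmetic (illustrative C, not part of btr0cur.cc):

#include <stdio.h>

/* Reservation heuristics as used in this file:
btr_cur_pessimistic_update() reserves tree_height / 16 + 3 extents,
btr_cur_pessimistic_delete() reserves tree_height / 32 + 1. */
static unsigned long
extents_for_update(unsigned long tree_height)
{
	return(tree_height / 16 + 3);
}

static unsigned long
extents_for_delete(unsigned long tree_height)
{
	return(tree_height / 32 + 1);
}

int
main(void)
{
	/* any tree shallower than 16 levels reserves 3 extents
	for an update and 1 for a delete */
	printf("update: %lu, delete: %lu\n",
	       extents_for_update(3), extents_for_delete(3));
	return(0);
}

Note also that when the function returns DB_SUCCESS with a non-NULL *big_rec, the caller is expected to write those columns out with btr_store_big_rec_extern_fields() (see below) while the record is still latched.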
2487 
2488 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
2489 
2490 /****************************************************************/
2493 UNIV_INLINE
2494 void
2495 btr_cur_del_mark_set_clust_rec_log(
2496 /*===============================*/
2497  ulint flags,
2498  rec_t* rec,
2499  dict_index_t* index,
2500  ibool val,
2501  trx_t* trx,
2502  roll_ptr_t roll_ptr,
2503  mtr_t* mtr)
2504 {
2505  byte* log_ptr;
2506  ut_ad(flags < 256);
2507  ut_ad(val <= 1);
2508 
2509  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2510 
2511  log_ptr = mlog_open_and_write_index(mtr, rec, index,
2512  page_rec_is_comp(rec)
2513  ? MLOG_COMP_REC_CLUST_DELETE_MARK
2514  : MLOG_REC_CLUST_DELETE_MARK,
2515  1 + 1 + DATA_ROLL_PTR_LEN
2516  + 14 + 2);
2517 
2518  if (!log_ptr) {
2519  /* Logging in mtr is switched off during crash recovery */
2520  return;
2521  }
2522 
2523  mach_write_to_1(log_ptr, flags);
2524  log_ptr++;
2525  mach_write_to_1(log_ptr, val);
2526  log_ptr++;
2527 
2528  log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
2529  mtr);
2530  mach_write_to_2(log_ptr, page_offset(rec));
2531  log_ptr += 2;
2532 
2533  mlog_close(mtr, log_ptr);
2534 }
2535 #endif /* !UNIV_HOTBACKUP */
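The redo body assembled above is: one byte of flags, one byte of delete-mark value, the system-column values emitted by row_upd_write_sys_vals_to_log() (the field position of DATA_TRX_ID, the transaction id and the roll pointer), and finally a big-endian two-byte page offset of the record. A standalone sketch of that framing, treating the sys-vals part as an opaque buffer (the helper name and types are illustrative, not InnoDB API):

#include <stdint.h>
#include <string.h>

static size_t
write_clust_del_mark_body(uint8_t* buf, uint8_t flags, uint8_t val,
			  const uint8_t* sys_vals, size_t sys_len,
			  uint16_t rec_offset)
{
	size_t	n = 0;

	buf[n++] = flags;			/* 1 byte */
	buf[n++] = val;				/* 1 byte */
	memcpy(buf + n, sys_vals, sys_len);	/* pos, trx id, roll ptr */
	n += sys_len;
	buf[n++] = (uint8_t) (rec_offset >> 8);	/* big-endian, as in */
	buf[n++] = (uint8_t) rec_offset;	/* mach_write_to_2() */
	return(n);
}

The parse function that follows, btr_cur_parse_del_mark_set_clust_rec(), reads these fields back in the same order.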
2536 
2537 /****************************************************************/
2541 UNIV_INTERN
2542 byte*
2543 btr_cur_parse_del_mark_set_clust_rec(
2544 /*=================================*/
2545  byte* ptr,
2546  byte* end_ptr,
2547  page_t* page,
2548  page_zip_des_t* page_zip,
2549  dict_index_t* index)
2550 {
2551  ulint flags;
2552  ulint val;
2553  ulint pos;
2554  trx_id_t trx_id;
2555  roll_ptr_t roll_ptr;
2556  ulint offset;
2557  rec_t* rec;
2558 
2559  ut_ad(!page
2560  || !!page_is_comp(page) == dict_table_is_comp(index->table));
2561 
2562  if (end_ptr < ptr + 2) {
2563 
2564  return(NULL);
2565  }
2566 
2567  flags = mach_read_from_1(ptr);
2568  ptr++;
2569  val = mach_read_from_1(ptr);
2570  ptr++;
2571 
2572  ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
2573 
2574  if (ptr == NULL) {
2575 
2576  return(NULL);
2577  }
2578 
2579  if (end_ptr < ptr + 2) {
2580 
2581  return(NULL);
2582  }
2583 
2584  offset = mach_read_from_2(ptr);
2585  ptr += 2;
2586 
2587  ut_a(offset <= UNIV_PAGE_SIZE);
2588 
2589  if (page) {
2590  rec = page + offset;
2591 
2592  /* We do not need to reserve btr_search_latch, as the page
2593  is only being recovered, and there cannot be a hash index to
2594  it. */
2595 
2596  btr_rec_set_deleted_flag(rec, page_zip, val);
2597 
2598  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2599  mem_heap_t* heap = NULL;
2600  ulint offsets_[REC_OFFS_NORMAL_SIZE];
2601  rec_offs_init(offsets_);
2602 
2603  row_upd_rec_sys_fields_in_recovery(
2604  rec, page_zip,
2605  rec_get_offsets(rec, index, offsets_,
2606  ULINT_UNDEFINED, &heap),
2607  pos, trx_id, roll_ptr);
2608  if (UNIV_LIKELY_NULL(heap)) {
2609  mem_heap_free(heap);
2610  }
2611  }
2612  }
2613 
2614  return(ptr);
2615 }
2616 
2617 #ifndef UNIV_HOTBACKUP
2618 /***********************************************************/
2624 UNIV_INTERN
2625 ulint
2626 btr_cur_del_mark_set_clust_rec(
2627 /*===========================*/
2628  ulint flags,
2629  btr_cur_t* cursor,
2630  ibool val,
2631  que_thr_t* thr,
2632  mtr_t* mtr)
2633 {
2634  dict_index_t* index;
2635  buf_block_t* block;
2636  roll_ptr_t roll_ptr;
2637  ulint err;
2638  rec_t* rec;
2639  page_zip_des_t* page_zip;
2640  trx_t* trx;
2641  mem_heap_t* heap = NULL;
2642  ulint offsets_[REC_OFFS_NORMAL_SIZE];
2643  ulint* offsets = offsets_;
2644  rec_offs_init(offsets_);
2645 
2646  rec = btr_cur_get_rec(cursor);
2647  index = cursor->index;
2648  ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2649  offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
2650 
2651 #ifdef UNIV_DEBUG
2652  if (btr_cur_print_record_ops && thr) {
2653  btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
2654  rec_print_new(stderr, rec, offsets);
2655  }
2656 #endif /* UNIV_DEBUG */
2657 
2658  ut_ad(dict_index_is_clust(index));
2659  ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
2660 
2661  err = lock_clust_rec_modify_check_and_lock(flags,
2662  btr_cur_get_block(cursor),
2663  rec, index, offsets, thr);
2664 
2665  if (err != DB_SUCCESS) {
2666 
2667  goto func_exit;
2668  }
2669 
2670  err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
2671  index, NULL, NULL, 0, rec,
2672  &roll_ptr);
2673  if (err != DB_SUCCESS) {
2674 
2675  goto func_exit;
2676  }
2677 
2678  block = btr_cur_get_block(cursor);
2679 
2680  if (block->is_hashed) {
2681  rw_lock_x_lock(&btr_search_latch);
2682  }
2683 
2684  page_zip = buf_block_get_page_zip(block);
2685 
2686  btr_rec_set_deleted_flag(rec, page_zip, val);
2687 
2688  trx = thr_get_trx(thr);
2689 
2690  if (!(flags & BTR_KEEP_SYS_FLAG)) {
2691  row_upd_rec_sys_fields(rec, page_zip,
2692  index, offsets, trx, roll_ptr);
2693  }
2694 
2695  if (block->is_hashed) {
2696  rw_lock_x_unlock(&btr_search_latch);
2697  }
2698 
2699  btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
2700  roll_ptr, mtr);
2701 
2702 func_exit:
2703  if (UNIV_LIKELY_NULL(heap)) {
2704  mem_heap_free(heap);
2705  }
2706  return(err);
2707 }
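Delete marking leaves the record physically in place: only an info bit in the record header changes, and DB_TRX_ID/DB_ROLL_PTR are restamped so that consistent reads can still reconstruct the old version via the roll pointer; the purge thread removes the record later. A minimal standalone sketch of the bit flip performed on the info-bits byte (the 0x20 value matches REC_INFO_DELETED_FLAG; the helper itself is illustrative):

#include <stdint.h>

#define DELETED_FLAG	0x20u	/* value of REC_INFO_DELETED_FLAG */

static uint8_t
set_del_mark(uint8_t info_bits, int val)
{
	return(val ? (uint8_t) (info_bits | DELETED_FLAG)
	           : (uint8_t) (info_bits & ~DELETED_FLAG));
}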
2708 
2709 /****************************************************************/
2712 UNIV_INLINE
2713 void
2714 btr_cur_del_mark_set_sec_rec_log(
2715 /*=============================*/
2716  rec_t* rec,
2717  ibool val,
2718  mtr_t* mtr)
2719 {
2720  byte* log_ptr;
2721  ut_ad(val <= 1);
2722 
2723  log_ptr = mlog_open(mtr, 11 + 1 + 2);
2724 
2725  if (!log_ptr) {
2726  /* Logging in mtr is switched off during crash recovery:
2727  in that case mlog_open returns NULL */
2728  return;
2729  }
2730 
2731  log_ptr = mlog_write_initial_log_record_fast(
2732  rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
2733  mach_write_to_1(log_ptr, val);
2734  log_ptr++;
2735 
2736  mach_write_to_2(log_ptr, page_offset(rec));
2737  log_ptr += 2;
2738 
2739  mlog_close(mtr, log_ptr);
2740 }
2741 #endif /* !UNIV_HOTBACKUP */
2742 
2743 /****************************************************************/
2747 UNIV_INTERN
2748 byte*
2749 btr_cur_parse_del_mark_set_sec_rec(
2750 /*===============================*/
2751  byte* ptr,
2752  byte* end_ptr,
2753  page_t* page,
2754  page_zip_des_t* page_zip)
2755 {
2756  ulint val;
2757  ulint offset;
2758  rec_t* rec;
2759 
2760  if (end_ptr < ptr + 3) {
2761 
2762  return(NULL);
2763  }
2764 
2765  val = mach_read_from_1(ptr);
2766  ptr++;
2767 
2768  offset = mach_read_from_2(ptr);
2769  ptr += 2;
2770 
2771  ut_a(offset <= UNIV_PAGE_SIZE);
2772 
2773  if (page) {
2774  rec = page + offset;
2775 
2776  /* We do not need to reserve btr_search_latch, as the page
2777  is only being recovered, and there cannot be a hash index to
2778  it. */
2779 
2780  btr_rec_set_deleted_flag(rec, page_zip, val);
2781  }
2782 
2783  return(ptr);
2784 }
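The parser above illustrates the defensive pattern used by all redo-body parsers in this file: verify that the buffer holds the fixed-size body (here 1 + 2 bytes) before reading anything, and return NULL so that recovery waits for more log to arrive. A standalone equivalent (illustrative helper, with big-endian reads as in mach_read_from_2()):

#include <stddef.h>
#include <stdint.h>

static const uint8_t*
parse_sec_del_mark(const uint8_t* ptr, const uint8_t* end_ptr,
		   unsigned* val, unsigned* offset)
{
	if (end_ptr < ptr + 3) {
		return(NULL);		/* body incomplete: 1 + 2 bytes */
	}
	*val = ptr[0];
	*offset = ((unsigned) ptr[1] << 8) | ptr[2];
	return(ptr + 3);
}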
2785 
2786 #ifndef UNIV_HOTBACKUP
2787 /***********************************************************/
2790 UNIV_INTERN
2791 ulint
2792 btr_cur_del_mark_set_sec_rec(
2793 /*=========================*/
2794  ulint flags,
2795  btr_cur_t* cursor,
2796  ibool val,
2797  que_thr_t* thr,
2798  mtr_t* mtr)
2799 {
2800  buf_block_t* block;
2801  rec_t* rec;
2802  ulint err;
2803 
2804  block = btr_cur_get_block(cursor);
2805  rec = btr_cur_get_rec(cursor);
2806 
2807 #ifdef UNIV_DEBUG
2808  if (btr_cur_print_record_ops && thr) {
2809  btr_cur_trx_report(thr_get_trx(thr), cursor->index,
2810  "del mark ");
2811  rec_print(stderr, rec, cursor->index);
2812  }
2813 #endif /* UNIV_DEBUG */
2814 
2815  err = lock_sec_rec_modify_check_and_lock(flags,
2816  btr_cur_get_block(cursor),
2817  rec, cursor->index, thr, mtr);
2818  if (err != DB_SUCCESS) {
2819 
2820  return(err);
2821  }
2822 
2823  ut_ad(!!page_rec_is_comp(rec)
2824  == dict_table_is_comp(cursor->index->table));
2825 
2826  if (block->is_hashed) {
2827  rw_lock_x_lock(&btr_search_latch);
2828  }
2829 
2830  btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
2831 
2832  if (block->is_hashed) {
2833  rw_lock_x_unlock(&btr_search_latch);
2834  }
2835 
2836  btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
2837 
2838  return(DB_SUCCESS);
2839 }
2840 
2841 /***********************************************************/
2844 UNIV_INTERN
2845 void
2846 btr_cur_set_deleted_flag_for_ibuf(
2847 /*==============================*/
2848  rec_t* rec,
2849  page_zip_des_t* page_zip,
2853  ibool val,
2854  mtr_t* mtr)
2855 {
2856  /* We do not need to reserve btr_search_latch, as the page has just
2857  been read to the buffer pool and there cannot be a hash index to it. */
2858 
2859  btr_rec_set_deleted_flag(rec, page_zip, val);
2860 
2861  btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
2862 }
2863 
2864 /*==================== B-TREE RECORD REMOVE =========================*/
2865 
2866 /*************************************************************/
2873 UNIV_INTERN
2874 ibool
2875 btr_cur_compress_if_useful(
2876 /*=======================*/
2877  btr_cur_t* cursor,
2880  mtr_t* mtr)
2881 {
2882  ut_ad(mtr_memo_contains(mtr,
2883  dict_index_get_lock(btr_cur_get_index(cursor)),
2884  MTR_MEMO_X_LOCK));
2885  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
2886  MTR_MEMO_PAGE_X_FIX));
2887 
2888  return(btr_cur_compress_recommendation(cursor, mtr)
2889  && btr_compress(cursor, mtr));
2890 }
2891 
2892 /*******************************************************/
2897 UNIV_INTERN
2898 ibool
2899 btr_cur_optimistic_delete(
2900 /*======================*/
2901  btr_cur_t* cursor,
2905  mtr_t* mtr)
2909 {
2910  buf_block_t* block;
2911  rec_t* rec;
2912  mem_heap_t* heap = NULL;
2913  ulint offsets_[REC_OFFS_NORMAL_SIZE];
2914  ulint* offsets = offsets_;
2915  ibool no_compress_needed;
2916  rec_offs_init(offsets_);
2917 
2918  ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
2919  MTR_MEMO_PAGE_X_FIX));
2920  /* This is intended only for leaf page deletions */
2921 
2922  block = btr_cur_get_block(cursor);
2923 
2924  ut_ad(page_is_leaf(buf_block_get_frame(block)));
2925 
2926  rec = btr_cur_get_rec(cursor);
2927  offsets = rec_get_offsets(rec, cursor->index, offsets,
2928  ULINT_UNDEFINED, &heap);
2929 
2930  no_compress_needed = !rec_offs_any_extern(offsets)
2931  && btr_cur_can_delete_without_compress(
2932  cursor, rec_offs_size(offsets), mtr);
2933 
2934  if (no_compress_needed) {
2935 
2936  page_t* page = buf_block_get_frame(block);
2937  page_zip_des_t* page_zip= buf_block_get_page_zip(block);
2938  ulint max_ins = 0;
2939 
2940  lock_update_delete(block, rec);
2941 
2942  btr_search_update_hash_on_delete(cursor);
2943 
2944  if (!page_zip) {
2945  max_ins = page_get_max_insert_size_after_reorganize(
2946  page, 1);
2947  }
2948 #ifdef UNIV_ZIP_DEBUG
2949  ut_a(!page_zip || page_zip_validate(page_zip, page));
2950 #endif /* UNIV_ZIP_DEBUG */
2951  page_cur_delete_rec(btr_cur_get_page_cur(cursor),
2952  cursor->index, offsets, mtr);
2953 #ifdef UNIV_ZIP_DEBUG
2954  ut_a(!page_zip || page_zip_validate(page_zip, page));
2955 #endif /* UNIV_ZIP_DEBUG */
2956 
2957  if (dict_index_is_clust(cursor->index)
2958  || dict_index_is_ibuf(cursor->index)
2959  || !page_is_leaf(page)) {
2960  /* The insert buffer does not handle
2961  inserts to clustered indexes, to
2962  non-leaf pages of secondary index B-trees,
2963  or to the insert buffer. */
2964  } else if (page_zip) {
2965  ibuf_update_free_bits_zip(block, mtr);
2966  } else {
2967  ibuf_update_free_bits_low(block, max_ins, mtr);
2968  }
2969  }
2970 
2971  if (UNIV_LIKELY_NULL(heap)) {
2972  mem_heap_free(heap);
2973  }
2974 
2975  return(no_compress_needed);
2976 }
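Callers normally attempt this optimistic, single-page path first and fall back to btr_cur_pessimistic_delete() only when it returns FALSE (compare the purge and rollback code). A hedged sketch of that calling pattern, not a compilable fragment: the cursor repositioning and latch upgrade between the two attempts are elided, and the rollback-context argument (RB_NONE here) depends on the caller:

	if (!btr_cur_optimistic_delete(cursor, mtr)) {
		ulint	err;

		mtr_commit(mtr);
		/* ... restore the cursor position under BTR_MODIFY_TREE
		latching ... */
		mtr_start(mtr);

		btr_cur_pessimistic_delete(&err, FALSE, cursor,
					   RB_NONE, mtr);
		ut_a(err == DB_SUCCESS);
	}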
2977 
2978 /*************************************************************/
2986 UNIV_INTERN
2987 ibool
2988 btr_cur_pessimistic_delete(
2989 /*=======================*/
2990  ulint* err,
2995  ibool has_reserved_extents,
2999  btr_cur_t* cursor,
3003  enum trx_rb_ctx rb_ctx,
3004  mtr_t* mtr)
3005 {
3006  buf_block_t* block;
3007  page_t* page;
3008  page_zip_des_t* page_zip;
3009  dict_index_t* index;
3010  rec_t* rec;
3011  dtuple_t* node_ptr;
3012  ulint n_extents = 0;
3013  ulint n_reserved;
3014  ibool success;
3015  ibool ret = FALSE;
3016  ulint level;
3017  mem_heap_t* heap;
3018  ulint* offsets;
3019 
3020  block = btr_cur_get_block(cursor);
3021  page = buf_block_get_frame(block);
3022  index = btr_cur_get_index(cursor);
3023 
3024  ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
3025  MTR_MEMO_X_LOCK));
3026  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3027  if (!has_reserved_extents) {
3028  /* First reserve enough free space for the file segments
3029  of the index tree, so that the node pointer updates will
3030  not fail because of lack of space */
3031 
3032  n_extents = cursor->tree_height / 32 + 1;
3033 
3034  success = fsp_reserve_free_extents(&n_reserved,
3035  index->space,
3036  n_extents,
3037  FSP_CLEANING, mtr);
3038  if (!success) {
3039  *err = DB_OUT_OF_FILE_SPACE;
3040 
3041  return(FALSE);
3042  }
3043  }
3044 
3045  heap = mem_heap_create(1024);
3046  rec = btr_cur_get_rec(cursor);
3047  page_zip = buf_block_get_page_zip(block);
3048 #ifdef UNIV_ZIP_DEBUG
3049  ut_a(!page_zip || page_zip_validate(page_zip, page));
3050 #endif /* UNIV_ZIP_DEBUG */
3051 
3052  offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3053 
3054  if (rec_offs_any_extern(offsets)) {
3055  btr_rec_free_externally_stored_fields(index,
3056  rec, offsets, page_zip,
3057  rb_ctx, mtr);
3058 #ifdef UNIV_ZIP_DEBUG
3059  ut_a(!page_zip || page_zip_validate(page_zip, page));
3060 #endif /* UNIV_ZIP_DEBUG */
3061  }
3062 
3063  if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
3064  && UNIV_UNLIKELY(dict_index_get_page(index)
3065  != buf_block_get_page_no(block))) {
3066 
3067  /* If there is only one record, drop the whole page in
3068  btr_discard_page, if this is not the root page */
3069 
3070  btr_discard_page(cursor, mtr);
3071 
3072  *err = DB_SUCCESS;
3073  ret = TRUE;
3074 
3075  goto return_after_reservations;
3076  }
3077 
3078  lock_update_delete(block, rec);
3079  level = btr_page_get_level(page, mtr);
3080 
3081  if (level > 0
3082  && UNIV_UNLIKELY(rec == page_rec_get_next(
3083  page_get_infimum_rec(page)))) {
3084 
3085  rec_t* next_rec = page_rec_get_next(rec);
3086 
3087  if (btr_page_get_prev(page, mtr) == FIL_NULL) {
3088 
3089  /* If we delete the leftmost node pointer on a
3090  non-leaf level, we must mark the new leftmost node
3091  pointer as the predefined minimum record */
3092 
3093  /* This will make page_zip_validate() fail until
3094  page_cur_delete_rec() completes. This is harmless,
3095  because everything will take place within a single
3096  mini-transaction and because writing to the redo log
3097  is an atomic operation (performed by mtr_commit()). */
3098  btr_set_min_rec_mark(next_rec, mtr);
3099  } else {
3100  /* Otherwise, if we delete the leftmost node pointer
3101  on a page, we have to change the father node pointer
3102  so that it is equal to the new leftmost node pointer
3103  on the page */
3104 
3105  btr_node_ptr_delete(index, block, mtr);
3106 
3107  node_ptr = dict_index_build_node_ptr(
3108  index, next_rec, buf_block_get_page_no(block),
3109  heap, level);
3110 
3111  btr_insert_on_non_leaf_level(index,
3112  level + 1, node_ptr, mtr);
3113  }
3114  }
3115 
3116  btr_search_update_hash_on_delete(cursor);
3117 
3118  page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
3119 #ifdef UNIV_ZIP_DEBUG
3120  ut_a(!page_zip || page_zip_validate(page_zip, page));
3121 #endif /* UNIV_ZIP_DEBUG */
3122 
3123  ut_ad(btr_check_node_ptr(index, block, mtr));
3124 
3125  *err = DB_SUCCESS;
3126 
3127 return_after_reservations:
3128  mem_heap_free(heap);
3129 
3130  if (ret == FALSE) {
3131  ret = btr_cur_compress_if_useful(cursor, mtr);
3132  }
3133 
3134  if (n_extents > 0) {
3135  fil_space_release_free_extents(index->space, n_reserved);
3136  }
3137 
3138  return(ret);
3139 }
3140 
3141 /*******************************************************************/
3144 static
3145 void
3146 btr_cur_add_path_info(
3147 /*==================*/
3148  btr_cur_t* cursor,
3149  ulint height,
3151  ulint root_height)
3152 {
3153  btr_path_t* slot;
3154  rec_t* rec;
3155  page_t* page;
3156 
3157  ut_a(cursor->path_arr);
3158 
3159  if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
3160  /* Do nothing; return empty path */
3161 
3162  slot = cursor->path_arr;
3163  slot->nth_rec = ULINT_UNDEFINED;
3164 
3165  return;
3166  }
3167 
3168  if (height == 0) {
3169  /* Mark end of slots for path */
3170  slot = cursor->path_arr + root_height + 1;
3171  slot->nth_rec = ULINT_UNDEFINED;
3172  }
3173 
3174  rec = btr_cur_get_rec(cursor);
3175 
3176  slot = cursor->path_arr + (root_height - height);
3177 
3178  page = page_align(rec);
3179 
3180  slot->nth_rec = page_rec_get_n_recs_before(rec);
3181  slot->n_recs = page_get_n_recs(page);
3182  slot->page_no = page_get_page_no(page);
3183  slot->page_level = btr_page_get_level_low(page);
3184 }
3185 
3186 /*******************************************************************/
3198 static
3199 ib_int64_t
3200 btr_estimate_n_rows_in_range_on_level(
3201 /*==================================*/
3202  dict_index_t* index,
3203  btr_path_t* slot1,
3204  btr_path_t* slot2,
3205  ib_int64_t n_rows_on_prev_level,
3210  ibool* is_n_rows_exact)
3213 {
3214  ulint space;
3215  ib_int64_t n_rows;
3216  ulint n_pages_read;
3217  ulint page_no;
3218  ulint zip_size;
3219  ulint level;
3220 
3221  space = dict_index_get_space(index);
3222 
3223  n_rows = 0;
3224  n_pages_read = 0;
3225 
3226  /* Assume by default that we will scan all pages between
3227  slot1->page_no and slot2->page_no */
3228  *is_n_rows_exact = TRUE;
3229 
3230  /* add records from slot1->page_no which are to the right of
3231  the record which serves as a left border of the range, if any */
3232  if (slot1->nth_rec < slot1->n_recs) {
3233  n_rows += slot1->n_recs - slot1->nth_rec;
3234  }
3235 
3236  /* add records from slot2->page_no which are to the left of
3237  the record which serves as a right border of the range, if any */
3238  if (slot2->nth_rec > 1) {
3239  n_rows += slot2->nth_rec - 1;
3240  }
3241 
3242  /* count the records in the pages between slot1->page_no and
3243  slot2->page_no (non-inclusive), if any */
3244 
3245  zip_size = fil_space_get_zip_size(space);
3246 
3247  /* Do not read more than this number of pages in order not to hurt
3248  performance with this code which is just an estimation. If we read
3249  this many pages before reaching slot2->page_no then we estimate the
3250  average from the pages scanned so far */
3251 # define N_PAGES_READ_LIMIT 10
3252 
3253  page_no = slot1->page_no;
3254  level = slot1->page_level;
3255 
3256  do {
3257  mtr_t mtr;
3258  page_t* page;
3259  buf_block_t* block;
3260 
3261  mtr_start(&mtr);
3262 
3263  /* fetch the page */
3264  block = buf_page_get(space, zip_size, page_no, RW_S_LATCH,
3265  &mtr);
3266 
3267  page = buf_block_get_frame(block);
3268 
3269  /* It is possible that the tree has been reorganized in the
3270  meantime and this is a different page. If this happens the
3271  calculated estimate will be bogus, which is not fatal as
3272  this is only an estimate. We are sure that a page with
3273  page_no exists because InnoDB never frees pages, only
3274  reuses them. */
3275  if (fil_page_get_type(page) != FIL_PAGE_INDEX
3276  || btr_page_get_index_id(page) != index->id
3277  || btr_page_get_level_low(page) != level) {
3278 
3279  /* The page got reused for something else */
3280  mtr_commit(&mtr);
3281  goto inexact;
3282  }
3283 
3284  n_pages_read++;
3285 
3286  if (page_no != slot1->page_no) {
3287  /* Do not count the records on slot1->page_no,
3288  we already counted them before this loop. */
3289  n_rows += page_get_n_recs(page);
3290  }
3291 
3292  page_no = btr_page_get_next(page, &mtr);
3293 
3294  mtr_commit(&mtr);
3295 
3296  if (n_pages_read == N_PAGES_READ_LIMIT
3297  || page_no == FIL_NULL) {
3298  /* Either we read too many pages or
3299  we reached the end of the level without passing
3300  through slot2->page_no, the tree must have changed
3301  in the meantime */
3302  goto inexact;
3303  }
3304 
3305  } while (page_no != slot2->page_no);
3306 
3307  return(n_rows);
3308 
3309 inexact:
3310 
3311  *is_n_rows_exact = FALSE;
3312 
3313  /* We did interrupt before reaching slot2->page */
3314 
3315  if (n_pages_read > 0) {
3316  /* The number of pages on this level is
3317  n_rows_on_prev_level, multiply it by the
3318  average number of recs per page so far */
3319  n_rows = n_rows_on_prev_level
3320  * n_rows / n_pages_read;
3321  } else {
3322  /* The tree changed before we could even
3323  start with slot1->page_no */
3324  n_rows = 10;
3325  }
3326 
3327  return(n_rows);
3328 }
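When the scan is cut short at the inexact label, the estimate is scaled: n_rows_on_prev_level equals the page count of this level (each parent record points to one child page), so multiplying it by the average number of records per scanned page extrapolates to the whole level. A standalone restatement of that step (illustrative only):

#include <stdint.h>

static int64_t
extrapolate_level_rows(int64_t rows_counted, uint64_t pages_read,
		       int64_t pages_on_level)
{
	if (pages_read == 0) {
		/* the tree changed before any page was scanned;
		fall back to a token guess, as the function above does */
		return(10);
	}
	return(pages_on_level * rows_counted / (int64_t) pages_read);
}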
3329 
3330 /*******************************************************************/
3333 UNIV_INTERN
3334 ib_int64_t
3335 btr_estimate_n_rows_in_range(
3336 /*=========================*/
3337  dict_index_t* index,
3338  const dtuple_t* tuple1,
3339  ulint mode1,
3340  const dtuple_t* tuple2,
3341  ulint mode2)
3342 {
3343  btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
3344  btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
3345  btr_cur_t cursor;
3346  btr_path_t* slot1;
3347  btr_path_t* slot2;
3348  ibool diverged;
3349  ibool diverged_lot;
3350  ulint divergence_level;
3351  ib_int64_t n_rows;
3352  ibool is_n_rows_exact;
3353  ulint i;
3354  mtr_t mtr;
3355 
3356  mtr_start(&mtr);
3357 
3358  cursor.path_arr = path1;
3359 
3360  if (dtuple_get_n_fields(tuple1) > 0) {
3361 
3362  btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
3363  BTR_SEARCH_LEAF | BTR_ESTIMATE,
3364  &cursor, 0,
3365  __FILE__, __LINE__, &mtr);
3366  } else {
3367  btr_cur_open_at_index_side(TRUE, index,
3368  BTR_SEARCH_LEAF | BTR_ESTIMATE,
3369  &cursor, &mtr);
3370  }
3371 
3372  mtr_commit(&mtr);
3373 
3374  mtr_start(&mtr);
3375 
3376  cursor.path_arr = path2;
3377 
3378  if (dtuple_get_n_fields(tuple2) > 0) {
3379 
3380  btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
3381  BTR_SEARCH_LEAF | BTR_ESTIMATE,
3382  &cursor, 0,
3383  __FILE__, __LINE__, &mtr);
3384  } else {
3385  btr_cur_open_at_index_side(FALSE, index,
3386  BTR_SEARCH_LEAF | BTR_ESTIMATE,
3387  &cursor, &mtr);
3388  }
3389 
3390  mtr_commit(&mtr);
3391 
3392  /* We have the path information for the range in path1 and path2 */
3393 
3394  n_rows = 1;
3395  is_n_rows_exact = TRUE;
3396  diverged = FALSE; /* This becomes true when the path is not
3397  the same any more */
3398  diverged_lot = FALSE; /* This becomes true when the paths are
3399  not the same or adjacent any more */
3400  divergence_level = 1000000; /* This is the level where paths diverged
3401  a lot */
3402  for (i = 0; ; i++) {
3403  ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
3404 
3405  slot1 = path1 + i;
3406  slot2 = path2 + i;
3407 
3408  if (slot1->nth_rec == ULINT_UNDEFINED
3409  || slot2->nth_rec == ULINT_UNDEFINED) {
3410 
3411  if (i > divergence_level + 1 && !is_n_rows_exact) {
3412  /* In trees whose height is > 1 our algorithm
3413  tends to underestimate: multiply the estimate
3414  by 2: */
3415 
3416  n_rows = n_rows * 2;
3417  }
3418 
3419  /* Do not estimate the number of rows in the range
3420  to over 1 / 2 of the estimated rows in the whole
3421  table */
3422 
3423  if (n_rows > index->table->stat_n_rows / 2
3424  && !is_n_rows_exact) {
3425 
3426  n_rows = index->table->stat_n_rows / 2;
3427 
3428  /* If there are just 0 or 1 rows in the table,
3429  then we estimate all rows are in the range */
3430 
3431  if (n_rows == 0) {
3432  n_rows = index->table->stat_n_rows;
3433  }
3434  }
3435 
3436  return(n_rows);
3437  }
3438 
3439  if (!diverged && slot1->nth_rec != slot2->nth_rec) {
3440 
3441  diverged = TRUE;
3442 
3443  if (slot1->nth_rec < slot2->nth_rec) {
3444  n_rows = slot2->nth_rec - slot1->nth_rec;
3445 
3446  if (n_rows > 1) {
3447  diverged_lot = TRUE;
3448  divergence_level = i;
3449  }
3450  } else {
3451  /* It is possible that
3452  slot1->nth_rec >= slot2->nth_rec
3453  if, for example, we have a single page
3454  tree which contains (inf, 5, 6, supr)
3455  and we select where x > 20 and x < 30;
3456  in this case slot1->nth_rec will point
3457  to the supr record and slot2->nth_rec
3458  will point to 6 */
3459  n_rows = 0;
3460  }
3461 
3462  } else if (diverged && !diverged_lot) {
3463 
3464  if (slot1->nth_rec < slot1->n_recs
3465  || slot2->nth_rec > 1) {
3466 
3467  diverged_lot = TRUE;
3468  divergence_level = i;
3469 
3470  n_rows = 0;
3471 
3472  if (slot1->nth_rec < slot1->n_recs) {
3473  n_rows += slot1->n_recs
3474  - slot1->nth_rec;
3475  }
3476 
3477  if (slot2->nth_rec > 1) {
3478  n_rows += slot2->nth_rec - 1;
3479  }
3480  }
3481  } else if (diverged_lot) {
3482 
3483  n_rows = btr_estimate_n_rows_in_range_on_level(
3484  index, slot1, slot2, n_rows,
3485  &is_n_rows_exact);
3486  }
3487  }
3488 }
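The core of the estimate is the first level at which the two search paths diverge: while they pass through the same record slot the range is assumed to hold at most one row, and at the first differing slot the slot distance becomes the initial row count, which is then refined (or counted exactly) on the levels below. A standalone restatement of that first-divergence rule (illustrative types, not the btr_path_t layout):

#include <stdint.h>

typedef struct {
	uint64_t	nth_rec;	/* slot of the path on the page */
	uint64_t	n_recs;		/* records on the page */
} path_slot_t;

static int64_t
first_divergence_estimate(const path_slot_t* s1, const path_slot_t* s2)
{
	if (s1->nth_rec < s2->nth_rec) {
		return((int64_t) (s2->nth_rec - s1->nth_rec));
	}
	/* the left border can land at or beyond the right one, e.g.
	an empty range on a one-page tree: estimate zero rows */
	return(0);
}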
3489 
3490 /*******************************************************************/
3494 UNIV_INTERN
3495 void
3496 btr_estimate_number_of_different_key_vals(
3497 /*======================================*/
3498  dict_index_t* index)
3499 {
3500  btr_cur_t cursor;
3501  page_t* page;
3502  rec_t* rec;
3503  ulint n_cols;
3504  ulint matched_fields;
3505  ulint matched_bytes;
3506  ib_int64_t* n_diff;
3507  ullint n_sample_pages; /* number of pages to sample */
3508  ulint not_empty_flag = 0;
3509  ulint total_external_size = 0;
3510  ulint i;
3511  ulint j;
3512  ullint add_on;
3513  mtr_t mtr;
3514  mem_heap_t* heap = NULL;
3515  ulint offsets_rec_[REC_OFFS_NORMAL_SIZE];
3516  ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
3517  ulint* offsets_rec = offsets_rec_;
3518  ulint* offsets_next_rec= offsets_next_rec_;
3519  rec_offs_init(offsets_rec_);
3520  rec_offs_init(offsets_next_rec_);
3521 
3522  n_cols = dict_index_get_n_unique(index);
3523 
3524  n_diff = (ib_int64_t *)mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));
3525 
3526  /* It makes no sense to test more pages than are contained
3527  in the index, thus we lower the number if it is too high */
3528  if (srv_stats_sample_pages > index->stat_index_size) {
3529  if (index->stat_index_size > 0) {
3530  n_sample_pages = index->stat_index_size;
3531  } else {
3532  n_sample_pages = 1;
3533  }
3534  } else {
3535  n_sample_pages = srv_stats_sample_pages;
3536  }
3537 
3538  /* We sample some pages in the index to get an estimate */
3539 
3540  for (i = 0; i < n_sample_pages; i++) {
3541  rec_t* supremum;
3542  mtr_start(&mtr);
3543 
3544  btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
3545 
3546  /* Count the number of different key values for each prefix of
3547  the key on this index page. If the prefix does not determine
3548  the index record uniquely in the B-tree, then we subtract one
3549  because otherwise our algorithm would give a wrong estimate
3550  for an index where there is just one key value. */
3551 
3552  page = btr_cur_get_page(&cursor);
3553 
3554  supremum = page_get_supremum_rec(page);
3555  rec = page_rec_get_next(page_get_infimum_rec(page));
3556 
3557  if (rec != supremum) {
3558  not_empty_flag = 1;
3559  offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3560  ULINT_UNDEFINED, &heap);
3561  }
3562 
3563  while (rec != supremum) {
3564  rec_t* next_rec = page_rec_get_next(rec);
3565  if (next_rec == supremum) {
3566  break;
3567  }
3568 
3569  matched_fields = 0;
3570  matched_bytes = 0;
3571  offsets_next_rec = rec_get_offsets(next_rec, index,
3572  offsets_next_rec,
3573  n_cols, &heap);
3574 
3575  cmp_rec_rec_with_match(rec, next_rec,
3576  offsets_rec, offsets_next_rec,
3577  index, &matched_fields,
3578  &matched_bytes);
3579 
3580  for (j = matched_fields + 1; j <= n_cols; j++) {
3581  /* We add one if this index record has
3582  a different prefix from the previous */
3583 
3584  n_diff[j]++;
3585  }
3586 
3587  total_external_size
3588  += btr_rec_get_externally_stored_len(
3589  rec, offsets_rec);
3590 
3591  rec = next_rec;
3592  /* Initialize offsets_rec for the next round
3593  and assign the old offsets_rec buffer to
3594  offsets_next_rec. */
3595  {
3596  ulint* offsets_tmp = offsets_rec;
3597  offsets_rec = offsets_next_rec;
3598  offsets_next_rec = offsets_tmp;
3599  }
3600  }
3601 
3602 
3603  if (n_cols == dict_index_get_n_unique_in_tree(index)) {
3604 
3605  /* If there is more than one leaf page in the tree,
3606  we add one because we know that the first record
3607  on the page certainly had a different prefix than the
3608  last record on the previous index page in the
3609  alphabetical order. Before this fix, if there was
3610  just one big record on each clustered index page, the
3611  algorithm grossly underestimated the number of rows
3612  in the table. */
3613 
3614  if (btr_page_get_prev(page, &mtr) != FIL_NULL
3615  || btr_page_get_next(page, &mtr) != FIL_NULL) {
3616 
3617  n_diff[n_cols]++;
3618  }
3619  }
3620 
3621  offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3622  ULINT_UNDEFINED, &heap);
3623  total_external_size += btr_rec_get_externally_stored_len(
3624  rec, offsets_rec);
3625  mtr_commit(&mtr);
3626  }
3627 
3628  /* If we saw k borders between different key values on
3629  n_sample_pages leaf pages, we can estimate how many
3630  there will be in index->stat_n_leaf_pages */
3631 
3632  /* We must take into account that our sample actually represents
3633  also the pages used for external storage of fields (those pages are
3634  included in index->stat_n_leaf_pages) */
3635 
3636  for (j = 0; j <= n_cols; j++) {
3637  index->stat_n_diff_key_vals[j]
3638  = ((n_diff[j]
3639  * (ib_int64_t)index->stat_n_leaf_pages
3640  + n_sample_pages - 1
3641  + total_external_size
3642  + not_empty_flag)
3643  / (n_sample_pages
3644  + total_external_size));
3645 
3646  /* If the tree is small, smaller than
3647  10 * n_sample_pages + total_external_size, then
3648  the above estimate is ok. For bigger trees it is common that we
3649  do not see any borders between key values in the few pages
3650  we pick. But still there may be n_sample_pages
3651  different key values, or even more. Let us try to approximate
3652  that: */
3653 
3654  add_on = index->stat_n_leaf_pages
3655  / (10 * (n_sample_pages
3656  + total_external_size));
3657 
3658  if (add_on > n_sample_pages) {
3659  add_on = n_sample_pages;
3660  }
3661 
3662  index->stat_n_diff_key_vals[j] += add_on;
3663  }
3664 
3665  mem_free(n_diff);
3666  if (UNIV_LIKELY_NULL(heap)) {
3667  mem_heap_free(heap);
3668  }
3669 }
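The scaling step above can be read as: borders between distinct key prefixes seen on the sampled pages, extrapolated to stat_n_leaf_pages, with the externally stored pages counted into the effective sample size, plus the add_on correction for large trees where the sample may see no borders at all. A standalone restatement of the formula (illustrative only):

#include <stdint.h>

static int64_t
scale_n_diff(int64_t n_diff_sampled, int64_t n_leaf_pages,
	     int64_t n_sample_pages, int64_t total_external_size,
	     int64_t not_empty_flag)
{
	int64_t	est;
	int64_t	add_on;

	est = (n_diff_sampled * n_leaf_pages
	       + n_sample_pages - 1
	       + total_external_size
	       + not_empty_flag)
	      / (n_sample_pages + total_external_size);

	add_on = n_leaf_pages
		/ (10 * (n_sample_pages + total_external_size));
	if (add_on > n_sample_pages) {
		add_on = n_sample_pages;
	}
	return(est + add_on);
}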
3670 
3671 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
3672 
3673 /***********************************************************/
3676 static
3677 ulint
3678 btr_rec_get_externally_stored_len(
3679 /*==============================*/
3680  rec_t* rec,
3681  const ulint* offsets)
3682 {
3683  ulint n_fields;
3684  byte* data;
3685  ulint local_len;
3686  ulint extern_len;
3687  ulint total_extern_len = 0;
3688  ulint i;
3689 
3690  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3691  n_fields = rec_offs_n_fields(offsets);
3692 
3693  for (i = 0; i < n_fields; i++) {
3694  if (rec_offs_nth_extern(offsets, i)) {
3695 
3696  data = rec_get_nth_field(rec, offsets, i, &local_len);
3697 
3698  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
3699 
3700  extern_len = mach_read_from_4(data + local_len
3701  + BTR_EXTERN_LEN + 4);
3702 
3703  total_extern_len += ut_calc_align(extern_len,
3704  UNIV_PAGE_SIZE);
3705  }
3706  }
3707 
3708  return(total_extern_len / UNIV_PAGE_SIZE);
3709 }
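Each externally stored field thus contributes its stored length rounded up to whole pages, and the function returns the grand total expressed in pages. A standalone restatement (PAGE_SIZE stands in for UNIV_PAGE_SIZE):

#include <stdint.h>

#define PAGE_SIZE	16384u	/* stand-in for UNIV_PAGE_SIZE */

static uint64_t
extern_len_in_pages(const uint64_t* extern_lens, int n)
{
	uint64_t	total = 0;
	int		i;

	for (i = 0; i < n; i++) {
		/* round each field up to whole pages, as
		ut_calc_align() does above */
		total += (extern_lens[i] + PAGE_SIZE - 1)
			/ PAGE_SIZE * PAGE_SIZE;
	}
	return(total / PAGE_SIZE);
}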
3710 
3711 /*******************************************************************/
3713 static
3714 void
3715 btr_cur_set_ownership_of_extern_field(
3716 /*==================================*/
3717  page_zip_des_t* page_zip,
3719  rec_t* rec,
3720  dict_index_t* index,
3721  const ulint* offsets,
3722  ulint i,
3723  ibool val,
3724  mtr_t* mtr)
3725 {
3726  byte* data;
3727  ulint local_len;
3728  ulint byte_val;
3729 
3730  data = rec_get_nth_field(rec, offsets, i, &local_len);
3731 
3732  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
3733 
3734  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
3735 
3736  byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
3737 
3738  if (val) {
3739  byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
3740  } else {
3741  byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
3742  }
3743 
3744  if (UNIV_LIKELY_NULL(page_zip)) {
3745  mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
3746  page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
3747  } else if (UNIV_LIKELY(mtr != NULL)) {
3748 
3749  mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
3750  MLOG_1BYTE, mtr);
3751  } else {
3752  mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
3753  }
3754 }
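Note the inverted sense of the flag: the bit is set when the record does not own the externally stored column (for instance after the column has been marked inherited during an update), and cleared when it does. A one-line restatement (0x80 matches BTR_EXTERN_OWNER_FLAG; the helper itself is illustrative):

#include <stdint.h>

#define OWNER_FLAG	0x80u	/* BTR_EXTERN_OWNER_FLAG */

static uint8_t
set_extern_ownership(uint8_t len_byte, int owned)
{
	/* the flag is set when the record does NOT own the BLOB */
	return(owned ? (uint8_t) (len_byte & ~OWNER_FLAG)
	             : (uint8_t) (len_byte | OWNER_FLAG));
}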
3755 
3756 /*******************************************************************/
3762 UNIV_INTERN
3763 ibool
3764 btr_cur_mark_extern_inherited_fields(
3765 /*=================================*/
3766  page_zip_des_t* page_zip,
3768  rec_t* rec,
3769  dict_index_t* index,
3770  const ulint* offsets,
3771  const upd_t* update,
3772  mtr_t* mtr)
3773 {
3774  ulint n;
3775  ulint j;
3776  ulint i;
3777  ibool change_ownership = FALSE;
3778 
3779  ut_ad(rec_offs_validate(rec, NULL, offsets));
3780  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3781 
3782  if (!rec_offs_any_extern(offsets)) {
3783 
3784  return(FALSE);
3785  }
3786 
3787  n = rec_offs_n_fields(offsets);
3788 
3789  for (i = 0; i < n; i++) {
3790  if (rec_offs_nth_extern(offsets, i)) {
3791 
3792  /* Check it is not in updated fields */
3793 
3794  if (update) {
3795  for (j = 0; j < upd_get_n_fields(update);
3796  j++) {
3797  if (upd_get_nth_field(update, j)
3798  ->field_no == i) {
3799 
3800  goto updated;
3801  }
3802  }
3803  }
3804 
3805  btr_cur_set_ownership_of_extern_field(
3806  page_zip, rec, index, offsets, i, FALSE, mtr);
3807 
3808  change_ownership = TRUE;
3809 updated:
3810  ;
3811  }
3812  }
3813 
3814  return(change_ownership);
3815 }
3816 
3817 /*******************************************************************/
3821 UNIV_INTERN
3822 void
3823 btr_cur_mark_dtuple_inherited_extern(
3824 /*=================================*/
3825  dtuple_t* entry,
3827  const upd_t* update)
3828 {
3829  ulint i;
3830 
3831  for (i = 0; i < dtuple_get_n_fields(entry); i++) {
3832 
3833  dfield_t* dfield = dtuple_get_nth_field(entry, i);
3834  byte* data;
3835  ulint len;
3836  ulint j;
3837 
3838  if (!dfield_is_ext(dfield)) {
3839  continue;
3840  }
3841 
3842  /* Check if it is in updated fields */
3843 
3844  for (j = 0; j < upd_get_n_fields(update); j++) {
3845  if (upd_get_nth_field(update, j)->field_no == i) {
3846 
3847  goto is_updated;
3848  }
3849  }
3850 
3851  data = (unsigned char *)dfield_get_data(dfield);
3852  len = dfield_get_len(dfield);
3853  data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
3854  |= BTR_EXTERN_INHERITED_FLAG;
3855 
3856 is_updated:
3857  ;
3858  }
3859 }
3860 
3861 /*******************************************************************/
3865 static
3866 void
3867 btr_cur_unmark_extern_fields(
3868 /*=========================*/
3869  page_zip_des_t* page_zip,
3871  rec_t* rec,
3872  dict_index_t* index,
3873  const ulint* offsets,
3874  mtr_t* mtr)
3875 {
3876  ulint n;
3877  ulint i;
3878 
3879  ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3880  n = rec_offs_n_fields(offsets);
3881 
3882  if (!rec_offs_any_extern(offsets)) {
3883 
3884  return;
3885  }
3886 
3887  for (i = 0; i < n; i++) {
3888  if (rec_offs_nth_extern(offsets, i)) {
3889 
3890  btr_cur_set_ownership_of_extern_field(
3891  page_zip, rec, index, offsets, i, TRUE, mtr);
3892  }
3893  }
3894 }
3895 
3896 /*******************************************************************/
3898 UNIV_INTERN
3899 void
3900 btr_cur_unmark_dtuple_extern_fields(
3901 /*================================*/
3902  dtuple_t* entry)
3903 {
3904  ulint i;
3905 
3906  for (i = 0; i < dtuple_get_n_fields(entry); i++) {
3907  dfield_t* dfield = dtuple_get_nth_field(entry, i);
3908 
3909  if (dfield_is_ext(dfield)) {
3910  byte* data = (unsigned char *)dfield_get_data(dfield);
3911  ulint len = dfield_get_len(dfield);
3912 
3913  data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
3914  &= ~BTR_EXTERN_OWNER_FLAG;
3915  }
3916  }
3917 }
3918 
3919 /*******************************************************************/
3924 UNIV_INTERN
3925 ulint
3926 btr_push_update_extern_fields(
3927 /*==========================*/
3928  dtuple_t* tuple,
3929  const upd_t* update,
3930  mem_heap_t* heap)
3931 {
3932  ulint n_pushed = 0;
3933  ulint n;
3934  const upd_field_t* uf;
3935 
3936  ut_ad(tuple);
3937  ut_ad(update);
3938 
3939  uf = update->fields;
3940  n = upd_get_n_fields(update);
3941 
3942  for (; n--; uf++) {
3943  if (dfield_is_ext(&uf->new_val)) {
3944  dfield_t* field
3945  = dtuple_get_nth_field(tuple, uf->field_no);
3946 
3947  if (!dfield_is_ext(field)) {
3948  dfield_set_ext(field);
3949  n_pushed++;
3950  }
3951 
3952  switch (uf->orig_len) {
3953  byte* data;
3954  ulint len;
3955  byte* buf;
3956  case 0:
3957  break;
3958  case BTR_EXTERN_FIELD_REF_SIZE:
3959  /* Restore the original locally stored
3960  part of the column. In the undo log,
3961  InnoDB writes a longer prefix of externally
3962  stored columns, so that column prefixes
3963  in secondary indexes can be reconstructed. */
3964  dfield_set_data(field, (byte*) dfield_get_data(field)
3965  + dfield_get_len(field)
3966  - BTR_EXTERN_FIELD_REF_SIZE,
3967  BTR_EXTERN_FIELD_REF_SIZE);
3968  dfield_set_ext(field);
3969  break;
3970  default:
3971  /* Reconstruct the original locally
3972  stored part of the column. The data
3973  will have to be copied. */
3974  ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
3975 
3976  data = (unsigned char *)dfield_get_data(field);
3977  len = dfield_get_len(field);
3978 
3979  buf = (unsigned char *)mem_heap_alloc(heap, uf->orig_len);
3980  /* Copy the locally stored prefix. */
3981  memcpy(buf, data,
3982  uf->orig_len
3983  - BTR_EXTERN_FIELD_REF_SIZE);
3984  /* Copy the BLOB pointer. */
3985  memcpy(buf + uf->orig_len
3986  - BTR_EXTERN_FIELD_REF_SIZE,
3987  data + len - BTR_EXTERN_FIELD_REF_SIZE,
3988  BTR_EXTERN_FIELD_REF_SIZE);
3989 
3990  dfield_set_data(field, buf, uf->orig_len);
3991  dfield_set_ext(field);
3992  }
3993  }
3994  }
3995 
3996  return(n_pushed);
3997 }
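The default branch rebuilds the original locally stored value from the undo record: the first orig_len - BTR_EXTERN_FIELD_REF_SIZE bytes of column prefix, followed by the 20-byte field reference copied from the end of the current value. A standalone restatement (REF_SIZE stands in for BTR_EXTERN_FIELD_REF_SIZE):

#include <stdint.h>
#include <string.h>

#define REF_SIZE	20	/* BTR_EXTERN_FIELD_REF_SIZE */

/* buf must hold orig_len bytes; requires orig_len > REF_SIZE and
len >= REF_SIZE, as the assertions above enforce */
static void
rebuild_local_prefix(uint8_t* buf, const uint8_t* data,
		     size_t len, size_t orig_len)
{
	/* the locally stored column prefix ... */
	memcpy(buf, data, orig_len - REF_SIZE);
	/* ... followed by the BLOB pointer taken from the end of
	the current field value */
	memcpy(buf + orig_len - REF_SIZE, data + len - REF_SIZE,
	       REF_SIZE);
}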
3998 
3999 /*******************************************************************/
4002 static
4003 ulint
4004 btr_blob_get_part_len(
4005 /*==================*/
4006  const byte* blob_header)
4007 {
4008  return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
4009 }
4010 
4011 /*******************************************************************/
4014 static
4015 ulint
4016 btr_blob_get_next_page_no(
4017 /*======================*/
4018  const byte* blob_header)
4019 {
4020  return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
4021 }
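Together these two accessors define the uncompressed BLOB page list: each page carries a header with a 4-byte part length (BTR_BLOB_HDR_PART_LEN) and a 4-byte next-page number (BTR_BLOB_HDR_NEXT_PAGE_NO), terminated by FIL_NULL. A standalone sketch of walking such a chain; the page-fetch callback is a stand-in for buf_page_get() and is assumed to return a pointer to the header:

#include <stdint.h>

#define FIL_NULL_NO	0xFFFFFFFFu	/* stand-in for FIL_NULL */

static uint32_t
read4(const uint8_t* p)	/* big-endian, like mach_read_from_4() */
{
	return(((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
	       | ((uint32_t) p[2] << 8) | p[3]);
}

typedef const uint8_t* (*fetch_fn)(uint32_t page_no);

static uint64_t
blob_total_len(uint32_t first_page, fetch_fn fetch)
{
	uint64_t	total = 0;
	uint32_t	page_no = first_page;

	while (page_no != FIL_NULL_NO) {
		const uint8_t*	hdr = fetch(page_no);

		total += read4(hdr + 0);	/* BTR_BLOB_HDR_PART_LEN */
		page_no = read4(hdr + 4);	/* BTR_BLOB_HDR_NEXT_PAGE_NO */
	}
	return(total);
}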
4022 
4023 /*******************************************************************/
4025 static
4026 void
4027 btr_blob_free(
4028 /*==========*/
4029  buf_block_t* block,
4030  ibool all,
4032  mtr_t* mtr)
4033 {
4034  buf_pool_t* buf_pool = buf_pool_from_block(block);
4035  ulint space = buf_block_get_space(block);
4036  ulint page_no = buf_block_get_page_no(block);
4037 
4038  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4039 
4040  mtr_commit(mtr);
4041 
4042  buf_pool_mutex_enter(buf_pool);
4043  mutex_enter(&block->mutex);
4044 
4045  /* Only free the block if it is still allocated to
4046  the same file page. */
4047 
4048  if (buf_block_get_state(block)
4049  == BUF_BLOCK_FILE_PAGE
4050  && buf_block_get_space(block) == space
4051  && buf_block_get_page_no(block) == page_no) {
4052 
4053  if (buf_LRU_free_block(&block->page, all, NULL)
4054  != BUF_LRU_FREED
4055  && all && block->page.zip.data) {
4056  /* Attempt to deallocate the uncompressed page
4057  if the whole block cannot be deallocated. */
4058 
4059  buf_LRU_free_block(&block->page, FALSE, NULL);
4060  }
4061  }
4062 
4063  buf_pool_mutex_exit(buf_pool);
4064  mutex_exit(&block->mutex);
4065 }
4066 
4067 /*******************************************************************/
4073 UNIV_INTERN
4074 ulint
4075 btr_store_big_rec_extern_fields(
4076 /*============================*/
4077  dict_index_t* index,
4079  buf_block_t* rec_block,
4080  rec_t* rec,
4081  const ulint* offsets,
4085  big_rec_t* big_rec_vec,
4087  mtr_t* local_mtr __attribute__((unused)))
4090 {
4091  ulint rec_page_no;
4092  byte* field_ref;
4093  ulint extern_len;
4094  ulint store_len;
4095  ulint page_no;
4096  ulint space_id;
4097  ulint zip_size;
4098  ulint prev_page_no;
4099  ulint hint_page_no;
4100  ulint i;
4101  mtr_t mtr;
4102  mem_heap_t* heap = NULL;
4103  page_zip_des_t* page_zip;
4104  z_stream c_stream;
4105 
4106  ut_ad(rec_offs_validate(rec, index, offsets));
4107  ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
4108  MTR_MEMO_X_LOCK));
4109  ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
4110  ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
4111  ut_a(dict_index_is_clust(index));
4112 
4113  page_zip = buf_block_get_page_zip(rec_block);
4114  ut_a(dict_table_zip_size(index->table)
4115  == buf_block_get_zip_size(rec_block));
4116 
4117  space_id = buf_block_get_space(rec_block);
4118  zip_size = buf_block_get_zip_size(rec_block);
4119  rec_page_no = buf_block_get_page_no(rec_block);
4120  ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
4121 
4122  if (UNIV_LIKELY_NULL(page_zip)) {
4123  int err;
4124 
4125  /* Zlib deflate needs 128 kilobytes for the default
4126  window size, plus 512 << memLevel, plus a few
4127  kilobytes for small objects. We use reduced memLevel
4128  to limit the memory consumption, and preallocate the
4129  heap, hoping to avoid memory fragmentation. */
4130  heap = mem_heap_create(250000);
4131  page_zip_set_alloc(&c_stream, heap);
4132 
4133  err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
4134  Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
4135  ut_a(err == Z_OK);
4136  }
4137 
4138  /* We have to create a file segment to the tablespace
4139  for each field and put the pointer to the field in rec */
4140 
4141  for (i = 0; i < big_rec_vec->n_fields; i++) {
4142  ut_ad(rec_offs_nth_extern(offsets,
4143  big_rec_vec->fields[i].field_no));
4144  {
4145  ulint local_len;
4146  field_ref = rec_get_nth_field(
4147  rec, offsets, big_rec_vec->fields[i].field_no,
4148  &local_len);
4149  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
4150  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
4151  field_ref += local_len;
4152  }
4153  extern_len = big_rec_vec->fields[i].len;
4154  UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
4155  extern_len);
4156 
4157  ut_a(extern_len > 0);
4158 
4159  prev_page_no = FIL_NULL;
4160 
4161  if (UNIV_LIKELY_NULL(page_zip)) {
4162  int err = deflateReset(&c_stream);
4163  ut_a(err == Z_OK);
4164 
4165  c_stream.next_in = (Bytef *) big_rec_vec->fields[i].data;
4166  c_stream.avail_in = extern_len;
4167  }
4168 
4169  for (;;) {
4170  buf_block_t* block;
4171  page_t* page;
4172 
4173  mtr_start(&mtr);
4174 
4175  if (prev_page_no == FIL_NULL) {
4176  hint_page_no = 1 + rec_page_no;
4177  } else {
4178  hint_page_no = prev_page_no + 1;
4179  }
4180 
4181  block = btr_page_alloc(index, hint_page_no,
4182  FSP_NO_DIR, 0, &mtr);
4183  if (UNIV_UNLIKELY(block == NULL)) {
4184 
4185  mtr_commit(&mtr);
4186 
4187  if (UNIV_LIKELY_NULL(page_zip)) {
4188  deflateEnd(&c_stream);
4189  mem_heap_free(heap);
4190  }
4191 
4192  return(DB_OUT_OF_FILE_SPACE);
4193  }
4194 
4195  page_no = buf_block_get_page_no(block);
4196  page = buf_block_get_frame(block);
4197 
4198  if (prev_page_no != FIL_NULL) {
4199  buf_block_t* prev_block;
4200  page_t* prev_page;
4201 
4202  prev_block = buf_page_get(space_id, zip_size,
4203  prev_page_no,
4204  RW_X_LATCH, &mtr);
4205  buf_block_dbg_add_level(prev_block,
4206  SYNC_EXTERN_STORAGE);
4207  prev_page = buf_block_get_frame(prev_block);
4208 
4209  if (UNIV_LIKELY_NULL(page_zip)) {
4210  mlog_write_ulint(
4211  prev_page + FIL_PAGE_NEXT,
4212  page_no, MLOG_4BYTES, &mtr);
4213  memcpy(buf_block_get_page_zip(
4214  prev_block)
4215  ->data + FIL_PAGE_NEXT,
4216  prev_page + FIL_PAGE_NEXT, 4);
4217  } else {
4218  mlog_write_ulint(
4219  prev_page + FIL_PAGE_DATA
4220  + BTR_BLOB_HDR_NEXT_PAGE_NO,
4221  page_no, MLOG_4BYTES, &mtr);
4222  }
4223 
4224  }
4225 
4226  if (UNIV_LIKELY_NULL(page_zip)) {
4227  int err;
4228  page_zip_des_t* blob_page_zip;
4229 
4230  /* Write FIL_PAGE_TYPE to the redo log
4231  separately, before logging any other
4232  changes to the page, so that the debug
4233  assertions in
4234  recv_parse_or_apply_log_rec_body() can
4235  be made simpler. Before InnoDB Plugin
4236  1.0.4, the initialization of
4237  FIL_PAGE_TYPE was logged as part of
4238  the mlog_log_string() below. */
4239 
4240  mlog_write_ulint(page + FIL_PAGE_TYPE,
4241  prev_page_no == FIL_NULL
4242  ? FIL_PAGE_TYPE_ZBLOB
4243  : FIL_PAGE_TYPE_ZBLOB2,
4244  MLOG_2BYTES, &mtr);
4245 
4246  c_stream.next_out = page
4247  + FIL_PAGE_DATA;
4248  c_stream.avail_out
4249  = page_zip_get_size(page_zip)
4250  - FIL_PAGE_DATA;
4251 
4252  err = deflate(&c_stream, Z_FINISH);
4253  ut_a(err == Z_OK || err == Z_STREAM_END);
4254  ut_a(err == Z_STREAM_END
4255  || c_stream.avail_out == 0);
4256 
4257  /* Write the "next BLOB page" pointer */
4258  mlog_write_ulint(page + FIL_PAGE_NEXT,
4259  FIL_NULL, MLOG_4BYTES, &mtr);
4260  /* Initialize the unused "prev page" pointer */
4261  mlog_write_ulint(page + FIL_PAGE_PREV,
4262  FIL_NULL, MLOG_4BYTES, &mtr);
4263  /* Write a back pointer to the record
4264  into the otherwise unused area. This
4265  information could be useful in
4266  debugging. Later, we might want to
4267  implement the possibility to relocate
4268  BLOB pages. Then, we would need to be
4269  able to adjust the BLOB pointer in the
4270  record. We do not store the heap
4271  number of the record, because it can
4272  change in page_zip_reorganize() or
4273  btr_page_reorganize(). However, also
4274  the page number of the record may
4275  change when B-tree nodes are split or
4276  merged. */
4277  mlog_write_ulint(page
4278  + FIL_PAGE_FILE_FLUSH_LSN,
4279  space_id,
4280  MLOG_4BYTES, &mtr);
4281  mlog_write_ulint(page
4282  + FIL_PAGE_FILE_FLUSH_LSN + 4,
4283  rec_page_no,
4284  MLOG_4BYTES, &mtr);
4285 
4286  /* Zero out the unused part of the page. */
4287  memset(page + page_zip_get_size(page_zip)
4288  - c_stream.avail_out,
4289  0, c_stream.avail_out);
4290  mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
4291  page_zip_get_size(page_zip)
4292  - FIL_PAGE_FILE_FLUSH_LSN,
4293  &mtr);
4294  /* Copy the page to compressed storage,
4295  because it will be flushed to disk
4296  from there. */
4297  blob_page_zip = buf_block_get_page_zip(block);
4298  ut_ad(blob_page_zip);
4299  ut_ad(page_zip_get_size(blob_page_zip)
4300  == page_zip_get_size(page_zip));
4301  memcpy(blob_page_zip->data, page,
4302  page_zip_get_size(page_zip));
4303 
4304  if (err == Z_OK && prev_page_no != FIL_NULL) {
4305 
4306  goto next_zip_page;
4307  }
4308 
4309  rec_block = buf_page_get(space_id, zip_size,
4310  rec_page_no,
4311  RW_X_LATCH, &mtr);
4312  buf_block_dbg_add_level(rec_block,
4313  SYNC_NO_ORDER_CHECK);
4314 
4315  if (err == Z_STREAM_END) {
4316  mach_write_to_4(field_ref
4317  + BTR_EXTERN_LEN, 0);
4318  mach_write_to_4(field_ref
4319  + BTR_EXTERN_LEN + 4,
4320  c_stream.total_in);
4321  } else {
4322  memset(field_ref + BTR_EXTERN_LEN,
4323  0, 8);
4324  }
4325 
4326  if (prev_page_no == FIL_NULL) {
4327  mach_write_to_4(field_ref
4328  + BTR_EXTERN_SPACE_ID,
4329  space_id);
4330 
4331  mach_write_to_4(field_ref
4332  + BTR_EXTERN_PAGE_NO,
4333  page_no);
4334 
4335  mach_write_to_4(field_ref
4336  + BTR_EXTERN_OFFSET,
4337  FIL_PAGE_NEXT);
4338  }
4339 
4340  page_zip_write_blob_ptr(
4341  page_zip, rec, index, offsets,
4342  big_rec_vec->fields[i].field_no, &mtr);
4343 
4344 next_zip_page:
4345  prev_page_no = page_no;
4346 
4347  /* Commit mtr and release the
4348  uncompressed page frame to save memory. */
4349  btr_blob_free(block, FALSE, &mtr);
4350 
4351  if (err == Z_STREAM_END) {
4352  break;
4353  }
4354  } else {
4355  mlog_write_ulint(page + FIL_PAGE_TYPE,
4356  FIL_PAGE_TYPE_BLOB,
4357  MLOG_2BYTES, &mtr);
4358 
4359  if (extern_len > (UNIV_PAGE_SIZE
4360  - FIL_PAGE_DATA
4361  - BTR_BLOB_HDR_SIZE
4362  - FIL_PAGE_DATA_END)) {
4363  store_len = UNIV_PAGE_SIZE
4364  - FIL_PAGE_DATA
4365  - BTR_BLOB_HDR_SIZE
4366  - FIL_PAGE_DATA_END;
4367  } else {
4368  store_len = extern_len;
4369  }
4370 
4371  mlog_write_string(page + FIL_PAGE_DATA
4372  + BTR_BLOB_HDR_SIZE,
4373  (const byte*)
4374  big_rec_vec->fields[i].data
4375  + big_rec_vec->fields[i].len
4376  - extern_len,
4377  store_len, &mtr);
4378  mlog_write_ulint(page + FIL_PAGE_DATA
4379  + BTR_BLOB_HDR_PART_LEN,
4380  store_len, MLOG_4BYTES, &mtr);
4381  mlog_write_ulint(page + FIL_PAGE_DATA
4382  + BTR_BLOB_HDR_NEXT_PAGE_NO,
4383  FIL_NULL, MLOG_4BYTES, &mtr);
4384 
4385  extern_len -= store_len;
4386 
4387  rec_block = buf_page_get(space_id, zip_size,
4388  rec_page_no,
4389  RW_X_LATCH, &mtr);
4390  buf_block_dbg_add_level(rec_block,
4391  SYNC_NO_ORDER_CHECK);
4392 
4393  mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
4394  MLOG_4BYTES, &mtr);
4395  mlog_write_ulint(field_ref
4396  + BTR_EXTERN_LEN + 4,
4397  big_rec_vec->fields[i].len
4398  - extern_len,
4399  MLOG_4BYTES, &mtr);
4400 
4401  if (prev_page_no == FIL_NULL) {
4402  mlog_write_ulint(field_ref
4403  + BTR_EXTERN_SPACE_ID,
4404  space_id,
4405  MLOG_4BYTES, &mtr);
4406 
4407  mlog_write_ulint(field_ref
4408  + BTR_EXTERN_PAGE_NO,
4409  page_no,
4410  MLOG_4BYTES, &mtr);
4411 
4412  mlog_write_ulint(field_ref
4413  + BTR_EXTERN_OFFSET,
4414  FIL_PAGE_DATA,
4415  MLOG_4BYTES, &mtr);
4416  }
4417 
4418  prev_page_no = page_no;
4419 
4420  mtr_commit(&mtr);
4421 
4422  if (extern_len == 0) {
4423  break;
4424  }
4425  }
4426  }
4427  }
4428 
4429  if (UNIV_LIKELY_NULL(page_zip)) {
4430  deflateEnd(&c_stream);
4431  mem_heap_free(heap);
4432  }
4433 
4434  return(DB_SUCCESS);
4435 }
4436 
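The loop above ends by stamping a 20-byte external field reference (the "BLOB pointer") into the record: the space id, page number and byte offset of the first BLOB page, followed by an 8-byte length of which only the low 4 bytes are used. The following standalone sketch decodes such a reference; the byte offsets 0, 4, 8 and 12 stand in for BTR_EXTERN_SPACE_ID, BTR_EXTERN_PAGE_NO, BTR_EXTERN_OFFSET and BTR_EXTERN_LEN and are assumptions here, as is the big-endian encoding of mach_read_from_4().

#include <stdint.h>
#include <stdio.h>

/* Assumed layout of the 20-byte external field reference;
the offsets mirror the BTR_EXTERN_* constants used above. */
enum {
	EXT_SPACE_ID = 0,	/* space id of the first BLOB page */
	EXT_PAGE_NO  = 4,	/* page number of the first BLOB page */
	EXT_OFFSET   = 8,	/* offset of the BLOB header on that page */
	EXT_LEN      = 12,	/* 8 bytes; only the low 4 carry the length */
	EXT_REF_SIZE = 20
};

/* Big-endian read, as mach_read_from_4() does. */
static uint32_t read_be4(const unsigned char* b)
{
	return ((uint32_t) b[0] << 24) | ((uint32_t) b[1] << 16)
		| ((uint32_t) b[2] << 8) | (uint32_t) b[3];
}

static void decode_field_ref(const unsigned char* ref)
{
	printf("space %lu page %lu offset %lu len %lu\n",
	       (unsigned long) read_be4(ref + EXT_SPACE_ID),
	       (unsigned long) read_be4(ref + EXT_PAGE_NO),
	       (unsigned long) read_be4(ref + EXT_OFFSET),
	       (unsigned long) read_be4(ref + EXT_LEN + 4));
}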
4437 /*******************************************************************/
4439 static
4440 void
4441 btr_check_blob_fil_page_type(
4442 /*=========================*/
4443  ulint space_id,
4444  ulint page_no,
4445  const page_t* page,
4446  ibool read)
4447 {
4448  ulint type = fil_page_get_type(page);
4449 
4450  ut_a(space_id == page_get_space_id(page));
4451  ut_a(page_no == page_get_page_no(page));
4452 
4453  if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
4454  ulint flags = fil_space_get_flags(space_id);
4455 
4456  if (UNIV_LIKELY
4457  ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
4458  /* Old versions of InnoDB did not initialize
4459  FIL_PAGE_TYPE on BLOB pages. Do not print
4460  anything about the type mismatch when reading
4461  a BLOB page that is in Antelope format. */
4462  return;
4463  }
4464 
4465  ut_print_timestamp(stderr);
4466  fprintf(stderr,
4467  " InnoDB: FIL_PAGE_TYPE=%lu"
4468  " on BLOB %s space %lu page %lu flags %lx\n",
4469  (ulong) type, read ? "read" : "purge",
4470  (ulong) space_id, (ulong) page_no, (ulong) flags);
4471  ut_error;
4472  }
4473 }
4474 
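btr_check_blob_fil_page_type() tolerates a wrong FIL_PAGE_TYPE only for tablespaces still in the Antelope format, whose BLOB pages were never typed. A sketch of the same decision, assuming FIL_PAGE_TYPE is a big-endian 2-byte field at page offset 24 and FIL_PAGE_TYPE_BLOB equals 10 (both values are assumptions, not taken from this listing):

#define ASSUMED_FIL_PAGE_TYPE		24	/* page header offset */
#define ASSUMED_FIL_PAGE_TYPE_BLOB	10	/* uncompressed BLOB page */

/* Returns 1 if the page type is acceptable for a BLOB page. */
static int blob_page_type_ok(const unsigned char* page, int is_antelope)
{
	unsigned type = ((unsigned) page[ASSUMED_FIL_PAGE_TYPE] << 8)
		| page[ASSUMED_FIL_PAGE_TYPE + 1];

	if (type == ASSUMED_FIL_PAGE_TYPE_BLOB) {
		return 1;
	}
	/* Old tablespaces never initialized the field, so any
	value must be tolerated there, as the function above does. */
	return is_antelope;
}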
4475 /*******************************************************************/
4480 UNIV_INTERN
4481 void
4482 btr_free_externally_stored_field(
4483 /*=============================*/
4484  dict_index_t* index,
4492  byte* field_ref,
4493  const rec_t* rec,
4495  const ulint* offsets,
4497  page_zip_des_t* page_zip,
4499  ulint i,
4501  enum trx_rb_ctx rb_ctx,
4502  mtr_t* local_mtr __attribute__((unused)))
4505 {
4506  page_t* page;
4507  ulint space_id;
4508  ulint rec_zip_size = dict_table_zip_size(index->table);
4509  ulint ext_zip_size;
4510  ulint page_no;
4511  ulint next_page_no;
4512  mtr_t mtr;
4513 #ifdef UNIV_DEBUG
4514  ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
4515  MTR_MEMO_X_LOCK));
4516  ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
4517  MTR_MEMO_PAGE_X_FIX));
4518  ut_ad(!rec || rec_offs_validate(rec, index, offsets));
4519 
4520  if (rec) {
4521  ulint local_len;
4522  const byte* f = rec_get_nth_field(rec, offsets,
4523  i, &local_len);
4524  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
4525  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
4526  f += local_len;
4527  ut_ad(f == field_ref);
4528  }
4529 #endif /* UNIV_DEBUG */
4530 
4531  if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
4532  BTR_EXTERN_FIELD_REF_SIZE))) {
4533  /* In the rollback of uncommitted transactions, we may
4534  encounter a clustered index record whose BLOBs have
4535  not been written. There is nothing to free then. */
4536  ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
4537  return;
4538  }
4539 
4540  space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
4541 
4542  if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
4543  ext_zip_size = fil_space_get_zip_size(space_id);
4544  /* This must be an undo log record in the system tablespace,
4545  that is, in row_purge_upd_exist_or_extern().
4546  Currently, externally stored records are stored in the
4547  same tablespace as the referring records. */
4548  ut_ad(!page_get_space_id(page_align(field_ref)));
4549  ut_ad(!rec);
4550  ut_ad(!page_zip);
4551  } else {
4552  ext_zip_size = rec_zip_size;
4553  }
4554 
4555  if (!rec) {
4556  /* This is a call from row_purge_upd_exist_or_extern(). */
4557  ut_ad(!page_zip);
4558  rec_zip_size = 0;
4559  }
4560 
4561  for (;;) {
4562 #ifdef UNIV_SYNC_DEBUG
4563  buf_block_t* rec_block;
4564 #endif /* UNIV_SYNC_DEBUG */
4565  buf_block_t* ext_block;
4566 
4567  mtr_start(&mtr);
4568 
4569 #ifdef UNIV_SYNC_DEBUG
4570  rec_block =
4571 #endif /* UNIV_SYNC_DEBUG */
4572  buf_page_get(page_get_space_id(
4573  page_align(field_ref)),
4574  rec_zip_size,
4575  page_get_page_no(
4576  page_align(field_ref)),
4577  RW_X_LATCH, &mtr);
4578  buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
4579  page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
4580 
4581  if (/* There is no external storage data */
4582  page_no == FIL_NULL
4583  /* This field does not own the externally stored field */
4584  || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
4585  & BTR_EXTERN_OWNER_FLAG)
4586  /* Rollback and inherited field */
4587  || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
4588  && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
4589  & BTR_EXTERN_INHERITED_FLAG))) {
4590 
4591  /* Do not free */
4592  mtr_commit(&mtr);
4593 
4594  return;
4595  }
4596 
4597  ext_block = buf_page_get(space_id, ext_zip_size, page_no,
4598  RW_X_LATCH, &mtr);
4599  buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
4600  page = buf_block_get_frame(ext_block);
4601 
4602  if (ext_zip_size) {
4603  /* Note that page_zip will be NULL
4604  in row_purge_upd_exist_or_extern(). */
4605  switch (fil_page_get_type(page)) {
4606  case FIL_PAGE_TYPE_ZBLOB:
4607  case FIL_PAGE_TYPE_ZBLOB2:
4608  break;
4609  default:
4610  ut_error;
4611  }
4612  next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
4613 
4614  btr_page_free_low(index, ext_block, 0, &mtr);
4615 
4616  if (UNIV_LIKELY(page_zip != NULL)) {
4617  mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
4618  next_page_no);
4619  mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
4620  0);
4621  page_zip_write_blob_ptr(page_zip, rec, index,
4622  offsets, i, &mtr);
4623  } else {
4624  mlog_write_ulint(field_ref
4625  + BTR_EXTERN_PAGE_NO,
4626  next_page_no,
4627  MLOG_4BYTES, &mtr);
4628  mlog_write_ulint(field_ref
4629  + BTR_EXTERN_LEN + 4, 0,
4630  MLOG_4BYTES, &mtr);
4631  }
4632  } else {
4633  ut_a(!page_zip);
4634  btr_check_blob_fil_page_type(space_id, page_no, page,
4635  FALSE);
4636 
4637  next_page_no = mach_read_from_4(
4638  page + FIL_PAGE_DATA
4639  + BTR_BLOB_HDR_NEXT_PAGE_NO);
4640 
4641  /* We must supply the page level (= 0) as an argument
4642  because we did not store it on the page (we save the
4643  space overhead of an index page header). */
4644 
4645  btr_page_free_low(index, ext_block, 0, &mtr);
4646 
4647  mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
4648  next_page_no,
4649  MLOG_4BYTES, &mtr);
4650  /* Zero out the BLOB length. If the server
4651  crashes during the execution of this function,
4652  trx_rollback_or_clean_all_recovered() could
4653  dereference the half-deleted BLOB, fetching a
4654  wrong prefix for the BLOB. */
4655  mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
4656  0,
4657  MLOG_4BYTES, &mtr);
4658  }
4659 
4660  /* Commit mtr and release the BLOB block to save memory. */
4661  btr_blob_free(ext_block, TRUE, &mtr);
4662  }
4663 }
4664 
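The early return in btr_free_externally_stored_field() encodes the BLOB ownership rules: nothing is freed when no page was ever allocated, when this record version does not own the BLOB, or when a rollback would free a value inherited by an older version. A compact restatement, assuming the owner and inherited bits occupy the first byte of BTR_EXTERN_LEN with the values 128 and 64 (assumed values, not shown in this listing):

#include <stdint.h>

#define ASSUMED_OWNER_FLAG	128U	/* set: record does NOT own the BLOB */
#define ASSUMED_INHERITED_FLAG	 64U	/* set: BLOB inherited from an older version */
#define PAGE_NO_NULL		0xFFFFFFFFU	/* stands in for FIL_NULL */

/* Returns 1 when the BLOB must be left alone; is_rollback
corresponds to rb_ctx being RB_NORMAL or RB_RECOVERY. */
static int must_not_free(uint32_t first_page_no, unsigned len_flags,
			 int is_rollback)
{
	return first_page_no == PAGE_NO_NULL
		|| (len_flags & ASSUMED_OWNER_FLAG)
		|| (is_rollback && (len_flags & ASSUMED_INHERITED_FLAG));
}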
4665 /***********************************************************/
4667 static
4668 void
4669 btr_rec_free_externally_stored_fields(
4670 /*==================================*/
4671  dict_index_t* index,
4673  rec_t* rec,
4674  const ulint* offsets,
4675  page_zip_des_t* page_zip,
4677  enum trx_rb_ctx rb_ctx,
4678  mtr_t* mtr)
4681 {
4682  ulint n_fields;
4683  ulint i;
4684 
4685  ut_ad(rec_offs_validate(rec, index, offsets));
4686  ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
4687  /* Free possible externally stored fields in the record */
4688 
4689  ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
4690  n_fields = rec_offs_n_fields(offsets);
4691 
4692  for (i = 0; i < n_fields; i++) {
4693  if (rec_offs_nth_extern(offsets, i)) {
4694  ulint len;
4695  byte* data
4696  = rec_get_nth_field(rec, offsets, i, &len);
4697  ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
4698 
4699  btr_free_externally_stored_field(
4700  index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
4701  rec, offsets, page_zip, i, rb_ctx, mtr);
4702  }
4703  }
4704 }
4705 
4706 /***********************************************************/
4709 static
4710 void
4711 btr_rec_free_updated_extern_fields(
4712 /*===============================*/
4713  dict_index_t* index,
4715  rec_t* rec,
4716  page_zip_des_t* page_zip,
4718  const ulint* offsets,
4719  const upd_t* update,
4720  enum trx_rb_ctx rb_ctx,
4721  mtr_t* mtr)
4723 {
4724  ulint n_fields;
4725  ulint i;
4726 
4727  ut_ad(rec_offs_validate(rec, index, offsets));
4728  ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
4729 
4730  /* Free possible externally stored fields in the record */
4731 
4732  n_fields = upd_get_n_fields(update);
4733 
4734  for (i = 0; i < n_fields; i++) {
4735  const upd_field_t* ufield = upd_get_nth_field(update, i);
4736 
4737  if (rec_offs_nth_extern(offsets, ufield->field_no)) {
4738  ulint len;
4739  byte* data = rec_get_nth_field(
4740  rec, offsets, ufield->field_no, &len);
4741  ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
4742 
4743  btr_free_externally_stored_field(
4744  index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
4745  rec, offsets, page_zip,
4746  ufield->field_no, rb_ctx, mtr);
4747  }
4748  }
4749 }
4750 
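The two helpers above differ only in what drives the iteration: btr_rec_free_externally_stored_fields() scans every column of the record, while btr_rec_free_updated_extern_fields() visits only the columns named in the update vector, so BLOBs untouched by the update survive. A schematic restatement with illustrative types (not the real InnoDB ones):

#include <stddef.h>

typedef struct {
	unsigned field_no;	/* column ordinal in the record */
} upd_field_sketch;

typedef int  (*is_extern_fn)(unsigned field_no);
typedef void (*free_blob_fn)(unsigned field_no);

/* Free the external old values of the updated columns only. */
static void free_updated_extern(const upd_field_sketch* upd, size_t n_upd,
				is_extern_fn is_extern, free_blob_fn free_blob)
{
	size_t i;

	for (i = 0; i < n_upd; i++) {
		if (is_extern(upd[i].field_no)) {
			free_blob(upd[i].field_no);
		}
	}
}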
4751 /*******************************************************************/
4755 static
4756 ulint
4757 btr_copy_blob_prefix(
4758 /*=================*/
4759  byte* buf,
4761  ulint len,
4762  ulint space_id,
4763  ulint page_no,
4764  ulint offset)
4765 {
4766  ulint copied_len = 0;
4767 
4768  for (;;) {
4769  mtr_t mtr;
4770  buf_block_t* block;
4771  const page_t* page;
4772  const byte* blob_header;
4773  ulint part_len;
4774  ulint copy_len;
4775 
4776  mtr_start(&mtr);
4777 
4778  block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
4779  buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
4780  page = buf_block_get_frame(block);
4781 
4782  btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
4783 
4784  blob_header = page + offset;
4785  part_len = btr_blob_get_part_len(blob_header);
4786  copy_len = ut_min(part_len, len - copied_len);
4787 
4788  memcpy(buf + copied_len,
4789  blob_header + BTR_BLOB_HDR_SIZE, copy_len);
4790  copied_len += copy_len;
4791 
4792  page_no = btr_blob_get_next_page_no(blob_header);
4793 
4794  mtr_commit(&mtr);
4795 
4796  if (page_no == FIL_NULL || copy_len != part_len) {
4797  UNIV_MEM_ASSERT_RW(buf, copied_len);
4798  return(copied_len);
4799  }
4800 
4801  /* On all BLOB pages except the first, the BLOB header
4802  is at the start of the page data area: */
4803 
4804  offset = FIL_PAGE_DATA;
4805 
4806  ut_ad(copied_len <= len);
4807  }
4808 }
4809 
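btr_copy_blob_prefix() walks the singly linked list of uncompressed BLOB pages; each page begins with an 8-byte header holding the length of the data stored on that page and the number of the next page (BTR_BLOB_HDR_PART_LEN, BTR_BLOB_HDR_NEXT_PAGE_NO). A standalone in-memory sketch of the same walk, assuming header offsets 0 and 4, header size 8, and big-endian fields:

#include <stdint.h>
#include <string.h>

#define HDR_PART_LEN	0	/* assumed BTR_BLOB_HDR_PART_LEN */
#define HDR_NEXT_PAGE	4	/* assumed BTR_BLOB_HDR_NEXT_PAGE_NO */
#define HDR_SIZE	8	/* assumed BTR_BLOB_HDR_SIZE */
#define PAGE_NULL	0xFFFFFFFFU	/* stands in for FIL_NULL */

static uint32_t be4(const unsigned char* b)
{
	return ((uint32_t) b[0] << 24) | ((uint32_t) b[1] << 16)
		| ((uint32_t) b[2] << 8) | (uint32_t) b[3];
}

/* pages[] is an in-memory stand-in for the buffer pool: each
entry points at a BLOB header followed by that page's payload. */
static size_t copy_blob_prefix(unsigned char* buf, size_t len,
			       const unsigned char* const* pages,
			       uint32_t page_no)
{
	size_t copied = 0;

	while (copied < len && page_no != PAGE_NULL) {
		const unsigned char* hdr = pages[page_no];
		size_t part = be4(hdr + HDR_PART_LEN);
		size_t n = len - copied < part ? len - copied : part;

		memcpy(buf + copied, hdr + HDR_SIZE, n);
		copied += n;
		if (n != part) {
			break;	/* output buffer full mid-page */
		}
		page_no = be4(hdr + HDR_NEXT_PAGE);
	}
	return copied;
}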
4810 /*******************************************************************/
4813 static
4814 void
4815 btr_copy_zblob_prefix(
4816 /*==================*/
4817  z_stream* d_stream,
4818  ulint zip_size,
4819  ulint space_id,
4820  ulint page_no,
4821  ulint offset)
4822 {
4823  ulint page_type = FIL_PAGE_TYPE_ZBLOB;
4824 
4825  ut_ad(ut_is_2pow(zip_size));
4826  ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
4827  ut_ad(zip_size <= UNIV_PAGE_SIZE);
4828  ut_ad(space_id);
4829 
4830  for (;;) {
4831  buf_page_t* bpage;
4832  int err;
4833  ulint next_page_no;
4834 
4835  /* There is no latch on bpage directly. Instead,
4836  bpage is protected by the B-tree page latch that
4837  is being held on the clustered index record, or,
4838  in row_merge_copy_blobs(), by an exclusive table lock. */
4839  bpage = buf_page_get_zip(space_id, zip_size, page_no);
4840 
4841  if (UNIV_UNLIKELY(!bpage)) {
4842  ut_print_timestamp(stderr);
4843  fprintf(stderr,
4844  " InnoDB: Cannot load"
4845  " compressed BLOB"
4846  " page %lu space %lu\n",
4847  (ulong) page_no, (ulong) space_id);
4848  return;
4849  }
4850 
4851  if (UNIV_UNLIKELY
4852  (fil_page_get_type(bpage->zip.data) != page_type)) {
4853  ut_print_timestamp(stderr);
4854  fprintf(stderr,
4855  " InnoDB: Unexpected type %lu of"
4856  " compressed BLOB"
4857  " page %lu space %lu\n",
4858  (ulong) fil_page_get_type(bpage->zip.data),
4859  (ulong) page_no, (ulong) space_id);
4860  goto end_of_blob;
4861  }
4862 
4863  next_page_no = mach_read_from_4(bpage->zip.data + offset);
4864 
4865  if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
4866  /* When the BLOB begins at the page header,
4867  the compressed data payload does not
4868  immediately follow the next page pointer. */
4869  offset = FIL_PAGE_DATA;
4870  } else {
4871  offset += 4;
4872  }
4873 
4874  d_stream->next_in = bpage->zip.data + offset;
4875  d_stream->avail_in = zip_size - offset;
4876 
4877  err = inflate(d_stream, Z_NO_FLUSH);
4878  switch (err) {
4879  case Z_OK:
4880  if (!d_stream->avail_out) {
4881  goto end_of_blob;
4882  }
4883  break;
4884  case Z_STREAM_END:
4885  if (next_page_no == FIL_NULL) {
4886  goto end_of_blob;
4887  }
4888  /* fall through */
4889  default:
4890 inflate_error:
4891  ut_print_timestamp(stderr);
4892  fprintf(stderr,
4893  " InnoDB: inflate() of"
4894  " compressed BLOB"
4895  " page %lu space %lu returned %d (%s)\n",
4896  (ulong) page_no, (ulong) space_id,
4897  err, d_stream->msg);
4898  case Z_BUF_ERROR:
4899  goto end_of_blob;
4900  }
4901 
4902  if (next_page_no == FIL_NULL) {
4903  if (!d_stream->avail_in) {
4904  ut_print_timestamp(stderr);
4905  fprintf(stderr,
4906  " InnoDB: unexpected end of"
4907  " compressed BLOB"
4908  " page %lu space %lu\n",
4909  (ulong) page_no,
4910  (ulong) space_id);
4911  } else {
4912  err = inflate(d_stream, Z_FINISH);
4913  switch (err) {
4914  case Z_STREAM_END:
4915  case Z_BUF_ERROR:
4916  break;
4917  default:
4918  goto inflate_error;
4919  }
4920  }
4921 
4922 end_of_blob:
4923  buf_page_release_zip(bpage);
4924  return;
4925  }
4926 
4927  buf_page_release_zip(bpage);
4928 
4929  /* On all BLOB pages except the first,
4930  the BLOB header is in the page header: */
4931 
4932  page_no = next_page_no;
4933  offset = FIL_PAGE_NEXT;
4934  page_type = FIL_PAGE_TYPE_ZBLOB2;
4935  }
4936 }
4937 
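btr_copy_zblob_prefix() feeds the compressed pages one after another into a single z_stream, calling inflate() with Z_NO_FLUSH and treating Z_BUF_ERROR at the end of the data as success. The same streaming pattern in isolation, using real zlib calls; the chunk representation is illustrative:

#include <string.h>
#include <zlib.h>

/* Inflate data that arrives in several chunks through one z_stream,
as the BLOB pages are fed in above. Returns the output length, or
(size_t) -1 on a zlib error. */
static size_t inflate_chunks(unsigned char* out, size_t out_len,
			     const unsigned char* const* chunks,
			     const size_t* chunk_lens, size_t n_chunks)
{
	z_stream s;
	size_t i;
	int err;

	memset(&s, 0, sizeof(s));	/* default zalloc/zfree */
	if (inflateInit(&s) != Z_OK) {
		return (size_t) -1;
	}
	s.next_out = out;
	s.avail_out = (uInt) out_len;

	for (i = 0; i < n_chunks; i++) {
		s.next_in = (Bytef*) chunks[i];
		s.avail_in = (uInt) chunk_lens[i];
		err = inflate(&s, Z_NO_FLUSH);
		if (err == Z_STREAM_END || !s.avail_out) {
			break;	/* whole stream or buffer consumed */
		}
		if (err != Z_OK && err != Z_BUF_ERROR) {
			inflateEnd(&s);
			return (size_t) -1;
		}
	}
	inflateEnd(&s);
	return s.total_out;
}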
4938 /*******************************************************************/
4943 static
4944 ulint
4945 btr_copy_externally_stored_field_prefix_low(
4946 /*========================================*/
4947  byte* buf,
4949  ulint len,
4950  ulint zip_size,
4952  ulint space_id,
4953  ulint page_no,
4954  ulint offset)
4955 {
4956  if (UNIV_UNLIKELY(len == 0)) {
4957  return(0);
4958  }
4959 
4960  if (UNIV_UNLIKELY(zip_size)) {
4961  int err;
4962  z_stream d_stream;
4963  mem_heap_t* heap;
4964 
4965  /* Zlib inflate needs 32 kilobytes for the default
4966  window size, plus a few kilobytes for small objects. */
4967  heap = mem_heap_create(40000);
4968  page_zip_set_alloc(&d_stream, heap);
4969 
4970  err = inflateInit(&d_stream);
4971  ut_a(err == Z_OK);
4972 
4973  d_stream.next_out = buf;
4974  d_stream.avail_out = len;
4975  d_stream.avail_in = 0;
4976 
4977  btr_copy_zblob_prefix(&d_stream, zip_size,
4978  space_id, page_no, offset);
4979  inflateEnd(&d_stream);
4980  mem_heap_free(heap);
4981  UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
4982  return(d_stream.total_out);
4983  } else {
4984  return(btr_copy_blob_prefix(buf, len, space_id,
4985  page_no, offset));
4986  }
4987 }
4988 
4989 /*******************************************************************/
4994 UNIV_INTERN
4995 ulint
4996 btr_copy_externally_stored_field_prefix(
4997 /*====================================*/
4998  byte* buf,
4999  ulint len,
5000  ulint zip_size,
5002  const byte* data,
5006  ulint local_len)
5007 {
5008  ulint space_id;
5009  ulint page_no;
5010  ulint offset;
5011 
5012  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5013 
5014  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
5015 
5016  if (UNIV_UNLIKELY(local_len >= len)) {
5017  memcpy(buf, data, len);
5018  return(len);
5019  }
5020 
5021  memcpy(buf, data, local_len);
5022  data += local_len;
5023 
5024  ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
5025 
5026  if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
5027  /* The externally stored part of the column has been
5028  (partially) deleted. Signal the half-deleted BLOB
5029  to the caller. */
5030 
5031  return(0);
5032  }
5033 
5034  space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
5035 
5036  page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
5037 
5038  offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
5039 
5040  return(local_len
5041  + btr_copy_externally_stored_field_prefix_low(buf + local_len,
5042  len - local_len,
5043  zip_size,
5044  space_id, page_no,
5045  offset));
5046 }
5047 
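btr_copy_externally_stored_field_prefix() first serves the request from the prefix stored inline in the record and touches BLOB pages only for the remainder. The split restated; fetch_external stands in for btr_copy_externally_stored_field_prefix_low() and is illustrative:

#include <string.h>

typedef size_t (*fetch_fn)(unsigned char* buf, size_t len);

static size_t copy_prefix(unsigned char* buf, size_t len,
			  const unsigned char* local, size_t local_len,
			  fetch_fn fetch_external)
{
	if (local_len >= len) {
		/* The whole request fits in the local prefix:
		no BLOB page needs to be read at all. */
		memcpy(buf, local, len);
		return len;
	}
	memcpy(buf, local, local_len);
	return local_len + fetch_external(buf + local_len,
					  len - local_len);
}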
5048 /*******************************************************************/
5052 static
5053 byte*
5054 btr_copy_externally_stored_field(
5055 /*=============================*/
5056  ulint* len,
5057  const byte* data,
5061  ulint zip_size,
5063  ulint local_len,
5064  mem_heap_t* heap)
5065 {
5066  ulint space_id;
5067  ulint page_no;
5068  ulint offset;
5069  ulint extern_len;
5070  byte* buf;
5071 
5072  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5073 
5074  local_len -= BTR_EXTERN_FIELD_REF_SIZE;
5075 
5076  space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
5077 
5078  page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
5079 
5080  offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
5081 
5082  /* Currently a BLOB cannot be bigger than 4 GB; we
5083  leave the 4 upper bytes in the length field unused */
5084 
5085  extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
5086 
5087  buf = (unsigned char *)mem_heap_alloc(heap, local_len + extern_len);
5088 
5089  memcpy(buf, data, local_len);
5090  *len = local_len
5091  + btr_copy_externally_stored_field_prefix_low(buf + local_len,
5092  extern_len,
5093  zip_size,
5094  space_id,
5095  page_no, offset);
5096 
5097  return(buf);
5098 }
5099 
5100 /*******************************************************************/
5103 UNIV_INTERN
5104 byte*
5105 btr_rec_copy_externally_stored_field(
5106 /*=================================*/
5107  const rec_t* rec,
5109  const ulint* offsets,
5110  ulint zip_size,
5112  ulint no,
5113  ulint* len,
5114  mem_heap_t* heap)
5115 {
5116  ulint local_len;
5117  const byte* data;
5118 
5119  ut_a(rec_offs_nth_extern(offsets, no));
5120 
5121  /* An externally stored field can contain some initial
5122  data from the field, and in the last 20 bytes it has the
5123  space id, page number, and offset where the rest of the
5124  field data is stored, and the data length in addition to
5125  the data stored locally. We may need to store some data
5126  locally to get the local record length above the 128 byte
5127  limit so that field offsets are stored in two bytes, and
5128  the extern bit is available in those two bytes. */
5129 
5130  data = rec_get_nth_field(rec, offsets, no, &local_len);
5131 
5132  ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
5133 
5134  if (UNIV_UNLIKELY
5135  (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
5136  field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
5137  /* The externally stored field was not written yet.
5138  This record should only be seen by
5139  recv_recovery_rollback_active() or any
5140  TRX_ISO_READ_UNCOMMITTED transactions. */
5141  return(NULL);
5142  }
5143 
5144  return(btr_copy_externally_stored_field(len, data,
5145  zip_size, local_len, heap));
5146 }
5147 #endif /* !UNIV_HOTBACKUP */
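A closing note on the NULL return in btr_rec_copy_externally_stored_field(): a freshly inserted record carries an all-zero field reference until its BLOB pages have been written, and field_ref_zero lets that state be detected with a single memcmp. The check in isolation (the 20-byte reference size is stated in the comment above):

#include <string.h>

#define REF_SIZE 20	/* BTR_EXTERN_FIELD_REF_SIZE, per the comment above */

static const unsigned char ref_zero[REF_SIZE];	/* all zero bytes */

/* 1 if the reference at the end of the locally stored part is
still unwritten, as checked above before dereferencing it. */
static int blob_ref_is_unset(const unsigned char* data, size_t local_len)
{
	return !memcmp(data + local_len - REF_SIZE, ref_zero, REF_SIZE);
}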