Drizzled Public API Documentation

trx0trx.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1996, 2010, Innobase Oy. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15 St, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "trx0trx.h"
27 
28 #ifdef UNIV_NONINL
29 #include "trx0trx.ic"
30 #endif
31 
32 #include "trx0undo.h"
33 #include "trx0rseg.h"
34 #include "log0log.h"
35 #include "que0que.h"
36 #include "lock0lock.h"
37 #include "trx0roll.h"
38 #include "usr0sess.h"
39 #include "read0read.h"
40 #include "srv0srv.h"
41 #include "thr0loc.h"
42 #include "btr0sea.h"
43 #include "os0proc.h"
44 #include "trx0xa.h"
45 #include "ha_prototypes.h"
46 
/* Session object passed to every trx_create() call visible in this file;
starts out as NULL here. */
48 UNIV_INTERN sess_t* trx_dummy_sess = NULL;
49 
/* Counter initialised to zero. Nothing in the visible listing updates it.
NOTE(review): the name suggests a count of MySQL-side transactions; the
statements that maintain it appear to have been dropped from this listing --
confirm against the original file. */
52 UNIV_INTERN ulint trx_n_mysql_transactions = 0;
53 
54 #ifdef UNIV_PFS_MUTEX
55 /* Key to register the mutex with performance schema; it is the key passed
to mutex_create() for trx->undo_mutex in the transaction constructor. */
56 UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
57 #endif /* UNIV_PFS_MUTEX */
58 
59 /*************************************************************/
/* Copies the string msg into trx->detailed_error, bounded by the size of
that buffer (ut_strlcpy is given sizeof(trx->detailed_error)).
NOTE(review): embedded listing line 63 (the function name and opening
parenthesis) is missing from this listing; presumably this is
trx_set_detailed_error -- confirm against the original file.
Parameters: trx - transaction whose error buffer is written;
msg - message to copy. */
61 UNIV_INTERN
62 void
64 /*===================*/
65  trx_t* trx,
66  const char* msg)
67 {
68  ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
69 }
70 
71 /*************************************************************/
/* NOTE(review): this definition is truncated in the listing. Embedded
lines 76 (function name) and 81 (the start of the only statement) are
missing. The surviving fragment shows that sizeof(trx->detailed_error) is the
final argument of a call; presumably the call fills trx->detailed_error from
the contents of file -- confirm against the original file.
Parameters: trx - transaction struct; file - input stream. */
74 UNIV_INTERN
75 void
77 /*=============================*/
78  trx_t* trx,
79  FILE* file)
80 {
82  sizeof(trx->detailed_error));
83 }
84 
85 /****************************************************************/
/* Allocates a trx_t with mem_alloc(), initialises its fields to their
"not started" defaults and returns it. The caller must hold kernel_mutex
(checked by a debug assertion) and must pass a non-NULL session.
The function also creates trx->undo_mutex, a 256-byte heap for the global
read view and a dedicated heap-backed vector for autoinc locks.
NOTE(review): embedded listing line 90 (the function name) is missing; the
callers in this file invoke it as trx_create(sess). Embedded lines 123, 155,
163 and 166 are also missing, so some field initialisations are not shown.
Parameter: sess - session to associate with the transaction.
Returns: the newly allocated transaction object. */
88 UNIV_INTERN
89 trx_t*
91 /*=======*/
92  sess_t* sess)
93 {
94  trx_t* trx;
95 
96  ut_ad(mutex_own(&kernel_mutex));
97  ut_ad(sess);
98 
99  trx = static_cast<trx_t *>(mem_alloc(sizeof(trx_t)));
100 
/* Stamp the struct so that the free routine can assert on it. */
101  trx->magic_n = TRX_MAGIC_N;
102 
103  trx->op_info = "";
104 
105  trx->is_purge = 0;
106  trx->is_recovered = 0;
107  trx->conc_state = TRX_NOT_STARTED;
108  trx->start_time = time(NULL);
109 
110  trx->isolation_level = TRX_ISO_REPEATABLE_READ;
111 
112  trx->id = 0;
113  trx->no = IB_ULONGLONG_MAX;
114 
115  trx->support_xa = TRUE;
116 
117  trx->check_foreigns = TRUE;
118  trx->check_unique_secondary = TRUE;
119 
120  trx->flush_log_later = FALSE;
121  trx->must_flush_log_later = FALSE;
122 
/* NOTE(review): embedded line 123 is missing from the listing here. */
124  trx->table_id = 0;
125 
126  trx->mysql_thd = NULL;
127  trx->duplicates = 0;
128 
129  trx->mysql_n_tables_locked = 0;
130 
131  trx->mysql_log_file_name = NULL;
132  trx->mysql_log_offset = 0;
133 
134  mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
135 
136  trx->rseg = NULL;
137 
138  trx->undo_no = 0;
139  trx->last_sql_stat_start.least_undo_no = 0;
140  trx->insert_undo = NULL;
141  trx->update_undo = NULL;
142  trx->undo_no_arr = NULL;
143 
144  trx->error_state = DB_SUCCESS;
145  trx->error_key_num = 0;
146  trx->detailed_error[0] = '\0';
147 
148  trx->sess = sess;
149  trx->que_state = TRX_QUE_RUNNING;
150  trx->n_active_thrs = 0;
151 
152  trx->handling_signals = FALSE;
153 
154  UT_LIST_INIT(trx->signals);
/* NOTE(review): embedded line 155 is missing from the listing here. */
156 
157  trx->graph = NULL;
158 
159  trx->wait_lock = NULL;
160  trx->was_chosen_as_deadlock_victim = FALSE;
161  UT_LIST_INIT(trx->wait_thrs);
162 
/* NOTE(review): embedded line 163 is missing from the listing here. */
164  UT_LIST_INIT(trx->trx_locks);
165 
/* NOTE(review): embedded line 166 is missing from the listing here. */
167 
168  trx->dict_operation_lock_mode = 0;
169  trx->has_search_latch = FALSE;
170  trx->search_latch_timeout = BTR_SEA_TIMEOUT;
171 
172  trx->declared_to_be_inside_innodb = FALSE;
173  trx->n_tickets_to_enter_innodb = 0;
174 
175  trx->global_read_view_heap = mem_heap_create(256);
176  trx->global_read_view = NULL;
177  trx->read_view = NULL;
178 
179  /* Set X/Open XA transaction identification to NULL: zero the whole
xid, then set formatID to -1. */
180  memset(&trx->xid, 0, sizeof(trx->xid));
181  trx->xid.formatID = -1;
182 
183  trx->n_autoinc_rows = 0;
184 
185  /* Remember to free the vector explicitly: it lives in its own
heap, created just for it with room for 4 pointers. */
186  trx->autoinc_locks = ib_vector_create(
187  mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
188 
189  trx->log_commit_id= FALSE;
190 
191  return(trx);
192 }
193 
194 /********************************************************************/
/* Creates a transaction object bound to trx_dummy_sess while holding
kernel_mutex, links it at the head of trx_sys->mysql_trx_list, and then
(outside the mutex) records the current OS thread id and process number in
it.
NOTE(review): embedded listing line 199 (the function name and parameter
list) and line 208 are missing; presumably this is trx_allocate_for_mysql --
confirm against the original file.
Returns: the new transaction object. */
197 UNIV_INTERN
198 trx_t*
200 /*========================*/
201 {
202  trx_t* trx;
203 
204  mutex_enter(&kernel_mutex);
205 
206  trx = trx_create(trx_dummy_sess);
207 
/* NOTE(review): embedded line 208 is missing from the listing here. */
209 
210  UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
211 
212  mutex_exit(&kernel_mutex);
213 
214  trx->mysql_thread_id = os_thread_get_curr_id();
215 
216  trx->mysql_process_no = os_proc_get_number();
217 
218  return(trx);
219 }
220 
221 /********************************************************************/
/* Creates a transaction object bound to trx_dummy_sess while holding
kernel_mutex and returns it. Unlike the variant above it, this one does not
link the object into trx_sys->mysql_trx_list.
NOTE(review): embedded listing line 226 (the function name and parameter
list) is missing; presumably this is trx_allocate_for_background -- confirm
against the original file.
Returns: the new transaction object. */
224 UNIV_INTERN
225 trx_t*
227 /*=============================*/
228 {
229  trx_t* trx;
230 
231  mutex_enter(&kernel_mutex);
232 
233  trx = trx_create(trx_dummy_sess);
234 
235  mutex_exit(&kernel_mutex);
236 
237  return(trx);
238 }
239 
240 /********************************************************************/
/* If trx->has_search_latch is set, releases the shared (s) lock on
btr_search_latch and clears the flag; otherwise does nothing.
NOTE(review): embedded listing line 244 (the function name) is missing;
presumably this is trx_search_latch_release_if_reserved -- confirm against
the original file.
Parameter: trx - transaction whose latch flag is examined. */
242 UNIV_INTERN
243 void
245 /*=================================*/
246  trx_t* trx)
247 {
248  if (trx->has_search_latch) {
249  rw_lock_s_unlock(&btr_search_latch);
250 
251  trx->has_search_latch = FALSE;
252  }
253 }
254 
255 /********************************************************************/
/* Frees a transaction object. The caller must hold kernel_mutex (debug
assertion). Two inconsistent states are only reported to stderr and do not
abort: the trx being declared inside InnoDB, and a non-zero
mysql_n_tables_locked. All other invariants are enforced with ut_a(): valid
magic number, conc_state == TRX_NOT_STARTED, no undo logs, empty signal /
reply-signal / wait-thread / lock lists, no wait lock, no search latch, no
dictionary operation lock, no read view and an empty autoinc lock vector.
It then frees the undo mutex, the lock heap, the global read view heap, the
autoinc vector and finally the struct itself.
NOTE(review): embedded listing line 259 (the function name) is missing; the
callers in this file invoke it as trx_free(trx). Embedded lines 275 and 304
are also missing.
Parameter: trx - transaction object to free. */
257 UNIV_INTERN
258 void
260 /*=====*/
261  trx_t* trx)
262 {
263  ut_ad(mutex_own(&kernel_mutex));
264 
265  if (trx->declared_to_be_inside_innodb) {
266  ut_print_timestamp(stderr);
267  fputs(" InnoDB: Error: Freeing a trx which is declared"
268  " to be processing\n"
269  "InnoDB: inside InnoDB.\n", stderr);
270  trx_print(stderr, trx, 600);
271  putc('\n', stderr);
272 
273  /* This is an error but not a fatal error. We must keep
274  the counters like srv_conc_n_threads accurate.
NOTE(review): the statement that followed this comment (embedded
line 275) is missing from the listing. */
276  }
277 
278  if (trx->mysql_n_tables_locked != 0) {
279 
280  ut_print_timestamp(stderr);
281  fprintf(stderr,
282  " InnoDB: Error: MySQL is freeing a thd\n"
283  "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
284  (ulong)trx->mysql_n_tables_locked);
285 
286  trx_print(stderr, trx, 600);
287 
/* Dump the raw bytes of the struct to help post-mortem analysis. */
288  ut_print_buf(stderr, trx, sizeof(trx_t));
289  putc('\n', stderr);
290  }
291 
292  ut_a(trx->magic_n == TRX_MAGIC_N);
293 
/* Overwrite the magic number so that a second free of the same
struct trips the assertion above. */
294  trx->magic_n = 11112222;
295 
296  ut_a(trx->conc_state == TRX_NOT_STARTED);
297 
298  mutex_free(&(trx->undo_mutex));
299 
300  ut_a(trx->insert_undo == NULL);
301  ut_a(trx->update_undo == NULL);
302 
303  if (trx->undo_no_arr) {
/* NOTE(review): the body of this branch (embedded line 304) is missing
from the listing; as shown, undo_no_arr is not released. */
305  }
306 
307  ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
308  ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
309 
310  ut_a(trx->wait_lock == NULL);
311  ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
312 
313  ut_a(!trx->has_search_latch);
314 
315  ut_a(trx->dict_operation_lock_mode == 0);
316 
317  if (trx->lock_heap) {
318  mem_heap_free(trx->lock_heap);
319  }
320 
321  ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
322 
323  if (trx->global_read_view_heap) {
324  mem_heap_free(trx->global_read_view_heap);
325  }
326 
327  trx->global_read_view = NULL;
328 
329  ut_a(trx->read_view == NULL);
330 
331  ut_a(ib_vector_is_empty(trx->autoinc_locks));
332  /* We allocated a dedicated heap for the vector. */
333  ib_vector_free(trx->autoinc_locks);
334 
335  mem_free(trx);
336 }
337 
338 /********************************************************************/
/* Under kernel_mutex, unlinks trx from trx_sys->mysql_trx_list and frees it
with trx_free().
NOTE(review): embedded listing line 342 (the function name) is missing;
presumably this is trx_free_for_mysql -- confirm against the original file.
Embedded lines 352 and 354, which sat between trx_free() and the mutex
release, are also missing.
Parameter: trx - transaction object to unlink and free. */
340 UNIV_INTERN
341 void
343 /*===============*/
344  trx_t* trx)
345 {
346  mutex_enter(&kernel_mutex);
347 
348  UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
349 
350  trx_free(trx);
351 
353 
355 
356  mutex_exit(&kernel_mutex);
357 }
358 
359 /********************************************************************/
/* Frees trx with trx_free() while holding kernel_mutex. No list removal is
done here, in contrast to the variant directly above.
NOTE(review): embedded listing line 363 (the function name) is missing;
presumably this is trx_free_for_background -- confirm against the original
file.
Parameter: trx - transaction object to free. */
361 UNIV_INTERN
362 void
364 /*====================*/
365  trx_t* trx)
366 {
367  mutex_enter(&kernel_mutex);
368 
369  trx_free(trx);
370 
371  mutex_exit(&kernel_mutex);
372 }
373 
374 /****************************************************************/
379 static
380 void
381 trx_list_insert_ordered(
382 /*====================*/
383  trx_t* trx)
384 {
385  trx_t* trx2;
386 
387  ut_ad(mutex_own(&kernel_mutex));
388 
389  trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
390 
391  while (trx2 != NULL) {
392  if (trx->id >= trx2->id) {
393 
394  ut_ad(trx->id > trx2->id);
395  break;
396  }
397  trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
398  }
399 
400  if (trx2 != NULL) {
401  trx2 = UT_LIST_GET_PREV(trx_list, trx2);
402 
403  if (trx2 == NULL) {
404  UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
405  } else {
406  UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
407  trx2, trx);
408  }
409  } else {
410  UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
411  }
412 }
413 
414 /****************************************************************/
/* Rebuilds trx_sys->trx_list from the undo logs found in the rollback
segments. The caller must hold kernel_mutex (debug assertion).
For every rollback segment in trx_sys->rseg_list:
 - each insert undo log yields a new recovered transaction object;
 - each update undo log is attached to the transaction with the same id if
   one is found via trx_get_on_id(), otherwise a new recovered transaction
   object is created for it.
The state of a created transaction is derived from the undo log state:
TRX_UNDO_ACTIVE -> TRX_ACTIVE; TRX_UNDO_PREPARED -> TRX_PREPARED, or
TRX_ACTIVE when srv_force_recovery is non-zero; any other state ->
TRX_COMMITTED_IN_MEMORY. trx->undo_no is set to one past the highest
top_undo_no of the non-empty logs attached to the transaction.
NOTE(review): embedded listing line 422 (the function name and parameter
list) is missing; presumably this is trx_lists_init_at_db_start -- confirm
against the original file. Embedded lines 501 and 578 (the start of the
calls made for dictionary operations) are also missing. */
420 UNIV_INTERN
421 void
423 /*============================*/
424 {
425  trx_rseg_t* rseg;
426  trx_undo_t* undo;
427  trx_t* trx;
428 
429  ut_ad(mutex_own(&kernel_mutex));
430  UT_LIST_INIT(trx_sys->trx_list);
431 
432  /* Look from the rollback segments if there exist undo logs for
433  transactions */
434 
435  rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
436 
437  while (rseg != NULL) {
438  undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
439 
440  while (undo != NULL) {
441 
442  trx = trx_create(trx_dummy_sess);
443 
444  trx->is_recovered = TRUE;
445  trx->id = undo->trx_id;
446  trx->xid = undo->xid;
447  trx->insert_undo = undo;
448  trx->rseg = rseg;
449 
450  if (undo->state != TRX_UNDO_ACTIVE) {
451 
452  /* Prepared transactions are left in
453  the prepared state waiting for a
454  commit or abort decision from MySQL */
455 
456  if (undo->state == TRX_UNDO_PREPARED) {
457 
458  fprintf(stderr,
459  "InnoDB: Transaction "
460  TRX_ID_FMT
461  " was in the"
462  " XA prepared state.\n",
463  trx->id);
464 
465  if (srv_force_recovery == 0) {
466 
467  trx->conc_state = TRX_PREPARED;
468  } else {
469  fprintf(stderr,
470  "InnoDB: Since"
471  " innodb_force_recovery"
472  " > 0, we will"
473  " rollback it"
474  " anyway.\n");
475 
476  trx->conc_state = TRX_ACTIVE;
477  }
478  } else {
479  trx->conc_state
480  = TRX_COMMITTED_IN_MEMORY;
481  }
482 
483  /* We give a dummy value for the trx no;
484  this should have no relevance since purge
485  is not interested in committed transaction
486  numbers, unless they are in the history
487  list, in which case it looks the number
488  from the disk based undo log structure */
489 
490  trx->no = trx->id;
491  } else {
492  trx->conc_state = TRX_ACTIVE;
493 
494  /* A running transaction always has the number
495  field inited to IB_ULONGLONG_MAX */
496 
497  trx->no = IB_ULONGLONG_MAX;
498  }
499 
500  if (undo->dict_operation) {
/* NOTE(review): the start of this call (embedded line 501) is missing
from the listing; only its argument list survives. */
502  trx, TRX_DICT_OP_TABLE);
503  trx->table_id = undo->table_id;
504  }
505 
506  if (!undo->empty) {
507  trx->undo_no = undo->top_undo_no + 1;
508  }
509 
510  trx_list_insert_ordered(trx);
511 
512  undo = UT_LIST_GET_NEXT(undo_list, undo);
513  }
514 
515  undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
516 
517  while (undo != NULL) {
/* Reuse the transaction already created for this id, if any. */
518  trx = trx_get_on_id(undo->trx_id);
519 
520  if (NULL == trx) {
521  trx = trx_create(trx_dummy_sess);
522 
523  trx->is_recovered = TRUE;
524  trx->id = undo->trx_id;
525  trx->xid = undo->xid;
526 
527  if (undo->state != TRX_UNDO_ACTIVE) {
528 
529  /* Prepared transactions are left in
530  the prepared state waiting for a
531  commit or abort decision from MySQL */
532 
533  if (undo->state == TRX_UNDO_PREPARED) {
534  fprintf(stderr,
535  "InnoDB: Transaction "
536  TRX_ID_FMT " was in the"
537  " XA prepared state.\n",
538  trx->id);
539 
540  if (srv_force_recovery == 0) {
541 
542  trx->conc_state
543  = TRX_PREPARED;
544  } else {
545  fprintf(stderr,
546  "InnoDB: Since"
547  " innodb_force_recovery"
548  " > 0, we will"
549  " rollback it"
550  " anyway.\n");
551 
552  trx->conc_state
553  = TRX_ACTIVE;
554  }
555  } else {
556  trx->conc_state
557  = TRX_COMMITTED_IN_MEMORY;
558  }
559 
560  /* We give a dummy value for the trx
561  number */
562 
563  trx->no = trx->id;
564  } else {
565  trx->conc_state = TRX_ACTIVE;
566 
567  /* A running transaction always has
568  the number field inited to
569  IB_ULONGLONG_MAX */
570 
571  trx->no = IB_ULONGLONG_MAX;
572  }
573 
574  trx->rseg = rseg;
575  trx_list_insert_ordered(trx);
576 
577  if (undo->dict_operation) {
/* NOTE(review): the start of this call (embedded line 578) is missing
from the listing; only its argument list survives. */
579  trx, TRX_DICT_OP_TABLE);
580  trx->table_id = undo->table_id;
581  }
582  }
583 
584  trx->update_undo = undo;
585 
/* Keep the larger of the insert-log and update-log undo numbers. */
586  if ((!undo->empty)
587  && undo->top_undo_no >= trx->undo_no) {
588 
589  trx->undo_no = undo->top_undo_no + 1;
590  }
591 
592  undo = UT_LIST_GET_NEXT(undo_list, undo);
593  }
594 
595  rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
596  }
597 }
598 
599 /******************************************************************/
603 UNIV_INLINE
604 ulint
605 trx_assign_rseg(void)
606 /*=================*/
607 {
608  trx_rseg_t* rseg = trx_sys->latest_rseg;
609 
610  ut_ad(mutex_own(&kernel_mutex));
611 loop:
612  /* Get next rseg in a round-robin fashion */
613 
614  rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
615 
616  if (rseg == NULL) {
617  rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
618  }
619 
620  /* If it is the SYSTEM rollback segment, and there exist others, skip
621  it */
622 
623  if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
624  && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
625  goto loop;
626  }
627 
628  trx_sys->latest_rseg = rseg;
629 
630  return(rseg->id);
631 }
632 
633 /****************************************************************/
/* Starts a transaction. The caller must hold kernel_mutex and trx->rseg must
be NULL (debug assertions).
A purge transaction (trx->is_purge) only gets id 0, state TRX_ACTIVE and a
fresh start_time; it is given no rollback segment and is not put on
trx_sys->trx_list. Any other transaction is given a rollback segment (chosen
by trx_assign_rseg() when rseg_id is ULINT_UNDEFINED), a new id from
trx_sys_get_new_trx_id(), no = IB_ULONGLONG_MAX, state TRX_ACTIVE, and is
added at the head of trx_sys->trx_list.
NOTE(review): embedded listing line 638 (the function name) is missing; the
callers in this file invoke it as trx_start_low(trx, rseg_id).
Parameters: trx - transaction to start; rseg_id - rollback segment id, or
ULINT_UNDEFINED to let the function choose.
Returns: TRUE on every visible path. */
636 UNIV_INTERN
637 ibool
639 /*==========*/
640  trx_t* trx,
641  ulint rseg_id)
644 {
645  trx_rseg_t* rseg;
646 
647  ut_ad(mutex_own(&kernel_mutex));
648  ut_ad(trx->rseg == NULL);
649 
650  if (trx->is_purge) {
651  trx->id = 0;
652  trx->conc_state = TRX_ACTIVE;
653  trx->start_time = time(NULL);
654 
655  return(TRUE);
656  }
657 
658  ut_ad(trx->conc_state != TRX_ACTIVE);
659 
660  if (rseg_id == ULINT_UNDEFINED) {
661 
662  rseg_id = trx_assign_rseg();
663  }
664 
665  rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
666 
667  trx->id = trx_sys_get_new_trx_id();
668 
669  /* The initial value for trx->no: IB_ULONGLONG_MAX is used in
670  read_view_open_now: */
671 
672  trx->no = IB_ULONGLONG_MAX;
673 
674  trx->rseg = rseg;
675 
676  trx->conc_state = TRX_ACTIVE;
677  trx->start_time = time(NULL);
678 
679  UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
680 
681  return(TRUE);
682 }
683 
684 /****************************************************************/
/* Refreshes trx->support_xa from thd_supports_xa(trx->mysql_thd) and then
starts the transaction by calling trx_start_low() while holding
kernel_mutex.
NOTE(review): embedded listing line 689 (the function name) is missing;
presumably this is trx_start -- confirm against the original file.
Parameters: trx - transaction to start; rseg_id - rollback segment id, passed
through unchanged to trx_start_low().
Returns: the value returned by trx_start_low(). */
687 UNIV_INTERN
688 ibool
690 /*======*/
691  trx_t* trx,
692  ulint rseg_id)
695 {
696  ibool ret;
697 
698  /* Update the info whether we should skip XA steps that eat CPU time
699  For the duration of the transaction trx->support_xa is not reread
700  from thd so any changes in the value take effect in the next
701  transaction. This is to avoid a scenario where some undo
702  generated by a transaction, has XA stuff, and other undo,
703  generated by the same transaction, doesn't. */
704  trx->support_xa = thd_supports_xa(trx->mysql_thd);
705 
706  mutex_enter(&kernel_mutex);
707 
708  ret = trx_start_low(trx, rseg_id);
709 
710  mutex_exit(&kernel_mutex);
711 
712  return(ret);
713 }
714 
715 /****************************************************************/
/* Commits a transaction. The caller must hold kernel_mutex on entry; the
mutex is released and re-acquired inside the function and is held again on
return.
Visible phases:
 1. If the transaction has an insert or update undo log, a mini-transaction
    is used (with the rollback segment mutex held) to finish the undo logs;
    for an update undo log a transaction number is drawn from
    trx_sys_get_new_trx_no(). The end lsn of the mini-transaction is
    remembered in lsn.
 2. conc_state is set to TRX_COMMITTED_IN_MEMORY, is_recovered is cleared and
    the read views are closed/reset.
 3. If an lsn was obtained, the log is written and/or flushed according to
    trx->flush_log_later, srv_flush_log_at_trx_commit and
    srv_unix_file_flush_method, and trx->commit_lsn is set.
 4. Savepoints are freed, the transaction is reset to TRX_NOT_STARTED and is
    removed from trx_sys->trx_list.
NOTE(review): embedded listing line 719 (the function name) is missing; the
caller in this file invokes it as trx_commit_off_kernel(trx). Embedded lines
750, 781-782, 848 and 864 are also missing, so several calls are not shown.
Parameter: trx - transaction to commit. */
717 UNIV_INTERN
718 void
720 /*==================*/
721  trx_t* trx)
722 {
723  page_t* update_hdr_page;
724  ib_uint64_t lsn = 0;
725  trx_rseg_t* rseg;
726  trx_undo_t* undo;
727  mtr_t mtr;
728 
729  ut_ad(mutex_own(&kernel_mutex));
730 
731  trx->must_flush_log_later = FALSE;
732 
733  rseg = trx->rseg;
734 
735  if (trx->insert_undo != NULL || trx->update_undo != NULL) {
736 
737  mutex_exit(&kernel_mutex);
738 
739  mtr_start(&mtr);
740 
741  /* Change the undo log segment states from TRX_UNDO_ACTIVE
742  to some other state: these modifications to the file data
743  structure define the transaction as committed in the file
744  based world, at the serialization point of the log sequence
745  number lsn obtained below. */
746 
747  mutex_enter(&(rseg->mutex));
748 
749  if (trx->insert_undo != NULL) {
/* NOTE(review): the body of this branch (embedded line 750) is missing
from the listing. */
751  }
752 
753  undo = trx->update_undo;
754 
755  if (undo) {
756  mutex_enter(&kernel_mutex);
757  trx->no = trx_sys_get_new_trx_no();
758  mutex_exit(&kernel_mutex);
759 
760  /* It is not necessary to obtain trx->undo_mutex here
761  because only a single OS thread is allowed to do the
762  transaction commit for this transaction. */
763 
764  update_hdr_page = trx_undo_set_state_at_finish(
765  undo, &mtr);
766 
767  /* We have to do the cleanup for the update log while
768  holding the rseg mutex because update log headers
769  have to be put to the history list in the order of
770  the trx number. */
771 
772  trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
773  }
774 
775  mutex_exit(&(rseg->mutex));
776 
777  /* Update the highest commit id currently in the system */
778  if (trx_log_commit_id(trx))
779  {
780  mutex_enter(&commit_id_mutex);
/* NOTE(review): the start of the call made under commit_id_mutex
(embedded lines 781-782) is missing from the listing; only its last
argument survives. */
783  &mtr);
784  mutex_exit(&commit_id_mutex);
785  }
786 
787  /* The following call commits the mini-transaction, making the
788  whole transaction committed in the file-based world, at this
789  log sequence number. The transaction becomes 'durable' when
790  we write the log to disk, but in the logical sense the commit
791  in the file-based data structures (undo logs etc.) happens
792  here.
793 
794  NOTE that transaction numbers, which are assigned only to
795  transactions with an update undo log, do not necessarily come
796  in exactly the same order as commit lsn's, if the transactions
797  have different rollback segments. To get exactly the same
798  order we should hold the kernel mutex up to this point,
799  adding to the contention of the kernel mutex. However, if
800  a transaction T2 is able to see modifications made by
801  a transaction T1, T2 will always get a bigger transaction
802  number and a bigger commit lsn than T1. */
803 
804  /*--------------*/
805  mtr_commit(&mtr);
806  /*--------------*/
807  lsn = mtr.end_lsn;
808 
809  mutex_enter(&kernel_mutex);
810  }
811 
812  ut_ad(trx->conc_state == TRX_ACTIVE
813  || trx->conc_state == TRX_PREPARED);
814  ut_ad(mutex_own(&kernel_mutex));
815 
816  /* The following assignment makes the transaction committed in memory
817  and makes its changes to data visible to other transactions.
818  NOTE that there is a small discrepancy from the strict formal
819  visibility rules here: a human user of the database can see
820  modifications made by another transaction T even before the necessary
821  log segment has been flushed to the disk. If the database happens to
822  crash before the flush, the user has seen modifications from T which
823  will never be a committed transaction. However, any transaction T2
824  which sees the modifications of the committing transaction T, and
825  which also itself makes modifications to the database, will get an lsn
826  larger than the committing transaction T. In the case where the log
827  flush fails, and T never gets committed, also T2 will never get
828  committed. */
829 
830  /*--------------------------------------*/
831  trx->conc_state = TRX_COMMITTED_IN_MEMORY;
832  /*--------------------------------------*/
833 
834  /* If we release kernel_mutex below and we are still doing
835  recovery i.e.: back ground rollback thread is still active
836  then there is a chance that the rollback thread may see
837  this trx as COMMITTED_IN_MEMORY and goes ahead to clean it
838  up calling trx_cleanup_at_db_startup(). This can happen
839  in the case we are committing a trx here that is left in
840  PREPARED state during the crash. Note that commit of the
841  rollback of a PREPARED trx happens in the recovery thread
842  while the rollback of other transactions happen in the
843  background thread. To avoid this race we unconditionally
844  unset the is_recovered flag from the trx. */
845 
846  trx->is_recovered = FALSE;
847 
/* NOTE(review): embedded line 848 is missing from the listing here. */
849 
850  if (trx->global_read_view) {
851  read_view_close(trx->global_read_view);
852  mem_heap_empty(trx->global_read_view_heap);
853  trx->global_read_view = NULL;
854  }
855 
856  trx->read_view = NULL;
857 
/* lsn is non-zero only when the undo-log branch above ran. */
858  if (lsn) {
859 
860  mutex_exit(&kernel_mutex);
861 
862  if (trx->insert_undo != NULL) {
863 
/* NOTE(review): the body of this branch (embedded line 864) is missing
from the listing. */
865  }
866 
867  /* NOTE that we could possibly make a group commit more
868  efficient here: call os_thread_yield here to allow also other
869  trxs to come to commit! */
870 
871  /*-------------------------------------*/
872 
873  /* Depending on the my.cnf options, we may now write the log
874  buffer to the log files, making the transaction durable if
875  the OS does not crash. We may also flush the log files to
876  disk, making the transaction durable also at an OS crash or a
877  power outage.
878 
879  The idea in InnoDB's group commit is that a group of
880  transactions gather behind a trx doing a physical disk write
881  to log files, and when that physical write has been completed,
882  one of those transactions does a write which commits the whole
883  group. Note that this group commit will only bring benefit if
884  there are > 2 users in the database. Then at least 2 users can
885  gather behind one doing the physical log write to disk.
886 
887  If we are calling trx_commit() under prepare_commit_mutex, we
888  will delay possible log write and flush to a separate function
889  trx_commit_complete_for_mysql(), which is only called when the
890  thread has released the mutex. This is to make the
891  group commit algorithm to work. Otherwise, the prepare_commit
892  mutex would serialize all commits and prevent a group of
893  transactions from gathering. */
894 
895  if (trx->flush_log_later) {
896  /* Do nothing yet */
897  trx->must_flush_log_later = TRUE;
898  } else if (srv_flush_log_at_trx_commit == 0) {
899  /* Do nothing */
900  } else if (srv_flush_log_at_trx_commit == 1) {
901  if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
902  /* Write the log but do not flush it to disk */
903 
904  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
905  FALSE);
906  } else {
907  /* Write the log to the log files AND flush
908  them to disk */
909 
910  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
911  }
912  } else if (srv_flush_log_at_trx_commit == 2) {
913 
914  /* Write the log but do not flush it to disk */
915 
916  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
917  } else {
/* Any other setting is treated as a fatal error. */
918  ut_error;
919  }
920 
921  trx->commit_lsn = lsn;
922 
923  /*-------------------------------------*/
924 
925  mutex_enter(&kernel_mutex);
926  }
927 
928  /* Free all savepoints */
929  trx_roll_free_all_savepoints(trx);
930 
931  trx->conc_state = TRX_NOT_STARTED;
932  trx->rseg = NULL;
933  trx->undo_no = 0;
934  trx->last_sql_stat_start.least_undo_no = 0;
935 
936  ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
937  ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
938 
939  UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
940 }
941 
942 /****************************************************************/
/* Resets a transaction to the not-started state without going through the
commit path: conc_state becomes TRX_NOT_STARTED, rseg is cleared, the undo
counters are zeroed and the transaction is removed from trx_sys->trx_list.
No mutex is taken or asserted in the visible code.
NOTE(review): embedded listing line 948 (the function name) is missing;
presumably this is trx_cleanup_at_db_startup, the name mentioned in a
comment of the commit routine -- confirm against the original file. The body
of the insert_undo branch (embedded line 954) is also missing.
Parameter: trx - transaction to clean up. */
946 UNIV_INTERN
947 void
949 /*======================*/
950  trx_t* trx)
951 {
952  if (trx->insert_undo != NULL) {
953 
955  }
956 
957  trx->conc_state = TRX_NOT_STARTED;
958  trx->rseg = NULL;
959  trx->undo_no = 0;
960  trx->last_sql_stat_start.least_undo_no = 0;
961 
962  UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
963 }
964 
965 /********************************************************************/
/* Returns trx->read_view, creating it first if the transaction has none.
The transaction must be TRX_ACTIVE (debug assertion). An existing view is
returned without taking any mutex; otherwise kernel_mutex is taken, the view
is assigned, and trx->global_read_view is set to the same view.
NOTE(review): embedded listing lines 971-972 (the return type and function
name) are missing; presumably this is trx_assign_read_view -- confirm
against the original file. Embedded line 985, the start of the call that
assigns trx->read_view, is also missing; the surviving arguments are trx->id
and trx->global_read_view_heap.
Parameter: trx - active transaction.
Returns: trx->read_view. */
970 UNIV_INTERN
973 /*=================*/
974  trx_t* trx)
975 {
976  ut_ad(trx->conc_state == TRX_ACTIVE);
977 
978  if (trx->read_view) {
979  return(trx->read_view);
980  }
981 
982  mutex_enter(&kernel_mutex);
983 
/* Checked again now that kernel_mutex is held. */
984  if (!trx->read_view) {
986  trx->id, trx->global_read_view_heap);
987  trx->global_read_view = trx->read_view;
988  }
989 
990  mutex_exit(&kernel_mutex);
991 
992  return(trx->read_view);
993 }
994 
995 /****************************************************************/
997 static
998 void
999 trx_handle_commit_sig_off_kernel(
1000 /*=============================*/
1001  trx_t* trx,
1002  que_thr_t** next_thr)
1007 {
1008  trx_sig_t* sig;
1009  trx_sig_t* next_sig;
1010 
1011  ut_ad(mutex_own(&kernel_mutex));
1012 
1013  trx->que_state = TRX_QUE_COMMITTING;
1014 
1015  trx_commit_off_kernel(trx);
1016 
1017  ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
1018 
1019  /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
1020  reply messages to them */
1021 
1022  sig = UT_LIST_GET_FIRST(trx->signals);
1023 
1024  while (sig != NULL) {
1025  next_sig = UT_LIST_GET_NEXT(signals, sig);
1026 
1027  if (sig->type == TRX_SIG_COMMIT) {
1028 
1029  trx_sig_reply(sig, next_thr);
1030  trx_sig_remove(trx, sig);
1031  }
1032 
1033  sig = next_sig;
1034  }
1035 
1036  trx->que_state = TRX_QUE_RUNNING;
1037 }
1038 
1039 /***********************************************************/
/* Empties trx->wait_thrs and puts the transaction back into the
TRX_QUE_RUNNING state. The caller must hold kernel_mutex and the transaction
must be in TRX_QUE_LOCK_WAIT (debug assertions).
NOTE(review): embedded listing line 1045 (the function name) is missing;
presumably this is trx_end_lock_wait -- confirm against the original file.
Embedded line 1057, the per-thread action performed before each thread is
unlinked, is also missing.
Parameter: trx - transaction in the lock wait state. */
1043 UNIV_INTERN
1044 void
1046 /*==============*/
1047  trx_t* trx)
1048 {
1049  que_thr_t* thr;
1050 
1051  ut_ad(mutex_own(&kernel_mutex));
1052  ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1053 
1054  thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1055 
1056  while (thr != NULL) {
1058 
1059  UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1060 
/* Always take the new head: the list shrinks on every pass. */
1061  thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1062  }
1063 
1064  trx->que_state = TRX_QUE_RUNNING;
1065 }
1066 
1067 /***********************************************************/
1070 static
1071 void
1072 trx_lock_wait_to_suspended(
1073 /*=======================*/
1074  trx_t* trx)
1075 {
1076  que_thr_t* thr;
1077 
1078  ut_ad(mutex_own(&kernel_mutex));
1079  ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1080 
1081  thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1082 
1083  while (thr != NULL) {
1084  thr->state = QUE_THR_SUSPENDED;
1085 
1086  UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1087 
1088  thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1089  }
1090 
1091  trx->que_state = TRX_QUE_RUNNING;
1092 }
1093 
1094 /***********************************************************/
1097 static
1098 void
1099 trx_sig_reply_wait_to_suspended(
1100 /*============================*/
1101  trx_t* trx)
1102 {
1103  trx_sig_t* sig;
1104  que_thr_t* thr;
1105 
1106  ut_ad(mutex_own(&kernel_mutex));
1107 
1108  sig = UT_LIST_GET_FIRST(trx->reply_signals);
1109 
1110  while (sig != NULL) {
1111  thr = sig->receiver;
1112 
1113  ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
1114 
1115  thr->state = QUE_THR_SUSPENDED;
1116 
1117  sig->receiver = NULL;
1118 
1119  UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
1120 
1121  sig = UT_LIST_GET_FIRST(trx->reply_signals);
1122  }
1123 }
1124 
1125 /*****************************************************************/
1129 static
1130 ibool
1131 trx_sig_is_compatible(
1132 /*==================*/
1133  trx_t* trx,
1134  ulint type,
1135  ulint sender)
1136 {
1137  trx_sig_t* sig;
1138 
1139  ut_ad(mutex_own(&kernel_mutex));
1140 
1141  if (UT_LIST_GET_LEN(trx->signals) == 0) {
1142 
1143  return(TRUE);
1144  }
1145 
1146  if (sender == TRX_SIG_SELF) {
1147  if (type == TRX_SIG_ERROR_OCCURRED) {
1148 
1149  return(TRUE);
1150 
1151  } else if (type == TRX_SIG_BREAK_EXECUTION) {
1152 
1153  return(TRUE);
1154  } else {
1155  return(FALSE);
1156  }
1157  }
1158 
1159  ut_ad(sender == TRX_SIG_OTHER_SESS);
1160 
1161  sig = UT_LIST_GET_FIRST(trx->signals);
1162 
1163  if (type == TRX_SIG_COMMIT) {
1164  while (sig != NULL) {
1165 
1166  if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
1167 
1168  return(FALSE);
1169  }
1170 
1171  sig = UT_LIST_GET_NEXT(signals, sig);
1172  }
1173 
1174  return(TRUE);
1175 
1176  } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
1177  while (sig != NULL) {
1178 
1179  if (sig->type == TRX_SIG_COMMIT) {
1180 
1181  return(FALSE);
1182  }
1183 
1184  sig = UT_LIST_GET_NEXT(signals, sig);
1185  }
1186 
1187  return(TRUE);
1188 
1189  } else if (type == TRX_SIG_BREAK_EXECUTION) {
1190 
1191  return(TRUE);
1192  } else {
1193  ut_error;
1194 
1195  return(FALSE);
1196  }
1197 }
1198 
1199 /****************************************************************/
/* Queues a signal on a transaction and, if it is the only signal at the
head of the queue, starts handling it via trx_sig_start_handle(). The caller
must hold kernel_mutex (debug assertion).
An incompatible signal (per trx_sig_is_compatible()) is a fatal error. After
queueing, the function also raises a fatal error unless sender is
TRX_SIG_SELF and type is not TRX_SIG_BREAK_EXECUTION.
NOTE(review): embedded listing line 1203 (the function name) is missing;
presumably this is trx_sig_send -- confirm against the original file.
Several lines between the parameters are missing as well.
Parameters:
 trx - transaction that receives the signal;
 type - signal type;
 sender - TRX_SIG_SELF or TRX_SIG_OTHER_SESS;
 receiver_thr - query thread to be replied to, or NULL for none; when
   non-NULL the signal is also linked into the reply_signals list of that
   thread's transaction;
 savept - if non-NULL, the savepoint is copied into the signal;
 next_thr - passed through unchanged to trx_sig_start_handle(). */
1201 UNIV_INTERN
1202 void
1204 /*=========*/
1205  trx_t* trx,
1206  ulint type,
1207  ulint sender,
1209  que_thr_t* receiver_thr,
1212  trx_savept_t* savept,
1214  que_thr_t** next_thr)
1220 {
1221  trx_sig_t* sig;
1222  trx_t* receiver_trx;
1223 
1224  ut_ad(trx);
1225  ut_ad(mutex_own(&kernel_mutex));
1226 
1227  if (!trx_sig_is_compatible(trx, type, sender)) {
1228  /* The signal is not compatible with the other signals in
1229  the queue: die */
1230 
1231  ut_error;
1232  }
1233 
1234  /* Queue the signal object */
1235 
1236  if (UT_LIST_GET_LEN(trx->signals) == 0) {
1237 
1238  /* The signal list is empty: the 'sig' slot must be unused
1239  (we improve performance a bit by avoiding mem_alloc) */
1240  sig = &(trx->sig);
1241  } else {
1242  /* It might be that the 'sig' slot is unused also in this
1243  case, but we choose the easy way of using mem_alloc */
1244 
1245  sig = static_cast<trx_sig_t *>(mem_alloc(sizeof(trx_sig_t)));
1246  }
1247 
1248  UT_LIST_ADD_LAST(signals, trx->signals, sig);
1249 
1250  sig->type = type;
1251  sig->sender = sender;
1252  sig->receiver = receiver_thr;
1253 
/* When savept is NULL, sig->savept is left as it was. */
1254  if (savept) {
1255  sig->savept = *savept;
1256  }
1257 
1258  if (receiver_thr) {
1259  receiver_trx = thr_get_trx(receiver_thr);
1260 
1261  UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
1262  sig);
1263  }
1264 
1265  if (trx->sess->state == SESS_ERROR) {
1266 
1267  trx_sig_reply_wait_to_suspended(trx);
1268  }
1269 
/* NOTE(review): this check makes every signal from another session, and
every break-execution signal, fatal even though trx_sig_is_compatible()
accepts some of them -- confirm this is intended. */
1270  if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
1271  ut_error;
1272  }
1273 
1274  /* If there were no other signals ahead in the queue, try to start
1275  handling of the signal */
1276 
1277  if (UT_LIST_GET_FIRST(trx->signals) == sig) {
1278 
1279  trx_sig_start_handle(trx, next_thr);
1280  }
1281 }
1282 
1283 /****************************************************************/
/* Ends signal handling for a transaction: clears trx->handling_signals and
restores trx->graph from trx->graph_before_signal_handling. If a graph
exists and the session is in the SESS_ERROR state, que_fork_error_handle()
is called on it. The caller must hold kernel_mutex and handling_signals must
be TRUE on entry (debug assertions).
NOTE(review): embedded listing line 1290 (the function name) is missing;
presumably this is trx_end_signal_handling -- confirm against the original
file.
Parameter: trx - transaction that was handling signals. */
1288 UNIV_INTERN
1289 void
1291 /*====================*/
1292  trx_t* trx)
1293 {
1294  ut_ad(mutex_own(&kernel_mutex));
1295  ut_ad(trx->handling_signals == TRUE);
1296 
1297  trx->handling_signals = FALSE;
1298 
1299  trx->graph = trx->graph_before_signal_handling;
1300 
1301  if (trx->graph && (trx->sess->state == SESS_ERROR)) {
1302 
1303  que_fork_error_handle(trx, trx->graph);
1304  }
1305 }
1306 
1307 /****************************************************************/
/* Processes the signals queued on a transaction for as long as they can be
handled immediately. The caller must hold kernel_mutex (debug assertion).
Each pass of the loop:
 - returns if signals were being handled and the queue is now empty;
 - starts the transaction if it is TRX_NOT_STARTED;
 - moves lock-waiting threads, and (when the session is in SESS_ERROR)
   reply-waiting threads, to the suspended state;
 - returns if the transaction still has active query threads;
 - saves trx->graph and sets handling_signals on the first pass;
 - dispatches on the type of the first queued signal: commit is handled
   in place and the loop continues; rollback-type and error signals start
   trx_rollback() and return; break-execution is replied to and removed and
   the loop continues; any other type is a fatal error.
NOTE(review): embedded listing line 1311 (the function name) is missing; the
caller in this file invokes it as trx_sig_start_handle(trx, next_thr).
Embedded line 1332, the statement executed before the first return, is also
missing. When the function is entered with handling_signals FALSE and an
empty queue, the visible code would read the type of a NULL first signal;
presumably callers never do that -- confirm.
Parameters: trx - transaction whose signals are processed; next_thr - passed
through unchanged to the signal handlers. */
1309 UNIV_INTERN
1310 void
1312 /*=================*/
1313  trx_t* trx,
1314  que_thr_t** next_thr)
1320 {
1321  trx_sig_t* sig;
1322  ulint type;
1323 loop:
1324  /* We loop in this function body as long as there are queued signals
1325  we can process immediately */
1326 
1327  ut_ad(trx);
1328  ut_ad(mutex_own(&kernel_mutex));
1329 
1330  if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
1331 
1333 
1334  return;
1335  }
1336 
1337  if (trx->conc_state == TRX_NOT_STARTED) {
1338 
1339  trx_start_low(trx, ULINT_UNDEFINED);
1340  }
1341 
1342  /* If the trx is in a lock wait state, moves the waiting query threads
1343  to the suspended state */
1344 
1345  if (trx->que_state == TRX_QUE_LOCK_WAIT) {
1346 
1347  trx_lock_wait_to_suspended(trx);
1348  }
1349 
1350  /* If the session is in the error state and this trx has threads
1351  waiting for reply from signals, moves these threads to the suspended
1352  state, canceling wait reservations; note that if the transaction has
1353  sent a commit or rollback signal to itself, and its session is not in
1354  the error state, then nothing is done here. */
1355 
1356  if (trx->sess->state == SESS_ERROR) {
1357  trx_sig_reply_wait_to_suspended(trx);
1358  }
1359 
1360  /* If there are no running query threads, we can start processing of a
1361  signal, otherwise we have to wait until all query threads of this
1362  transaction are aware of the arrival of the signal. */
1363 
1364  if (trx->n_active_thrs > 0) {
1365 
1366  return;
1367  }
1368 
1369  if (trx->handling_signals == FALSE) {
1370  trx->graph_before_signal_handling = trx->graph;
1371 
1372  trx->handling_signals = TRUE;
1373  }
1374 
1375  sig = UT_LIST_GET_FIRST(trx->signals);
1376  type = sig->type;
1377 
1378  if (type == TRX_SIG_COMMIT) {
1379 
1380  trx_handle_commit_sig_off_kernel(trx, next_thr);
1381 
1382  } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
1383  || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
1384 
1385  trx_rollback(trx, sig, next_thr);
1386 
1387  /* No further signals can be handled until the rollback
1388  completes, therefore we return */
1389 
1390  return;
1391 
1392  } else if (type == TRX_SIG_ERROR_OCCURRED) {
1393 
1394  trx_rollback(trx, sig, next_thr);
1395 
1396  /* No further signals can be handled until the rollback
1397  completes, therefore we return */
1398 
1399  return;
1400 
1401  } else if (type == TRX_SIG_BREAK_EXECUTION) {
1402 
1403  trx_sig_reply(sig, next_thr);
1404  trx_sig_remove(trx, sig);
1405  } else {
1406  ut_error;
1407  }
1408 
1409  goto loop;
1410 }
1411 
1412 /****************************************************************/
/* Sends the reply for a signal to the query thread waiting for it.
If sig->receiver is non-NULL, the signal is unlinked from the
reply_signals list of the receiver's transaction, the wait of the receiver
thread is ended with que_thr_end_wait(sig->receiver, next_thr), and
sig->receiver is cleared. If sig->receiver is NULL nothing is done.
Parameters:
 sig      the signal that was handled
 next_thr passed on unchanged to que_thr_end_wait()
kernel_mutex ownership is checked with ut_ad().
NOTE(review): the declarator line with the function name (source line
1417) is missing from this listing; the call sites in this file suggest
trx_sig_reply -- confirm. */
1415 UNIV_INTERN
1416 void
1418 /*==========*/
1419  trx_sig_t* sig,
1420  que_thr_t** next_thr)
1425 {
1426  trx_t* receiver_trx;
1427 
1428  ut_ad(sig);
1429  ut_ad(mutex_own(&kernel_mutex));
1430 
1431  if (sig->receiver != NULL) {
1432  ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
1433 
1434  receiver_trx = thr_get_trx(sig->receiver);
1435 
1436  UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
1437  sig);
1438  ut_ad(receiver_trx->sess->state != SESS_ERROR);
1439 
1440  que_thr_end_wait(sig->receiver, next_thr);
1441 
 /* Clearing the receiver marks the reply as delivered */
1442  sig->receiver = NULL;
1443 
1444  }
1445 }
1446 
1447 /****************************************************************/
/* Removes a signal from the signal queue of a transaction.
Unlinks sig from trx->signals, resets sig->type to 0 and releases the
signal with mem_free(), except when sig is the signal object embedded in
the transaction itself (trx->sig).
The signal must no longer have a receiver (sig->receiver == NULL is
checked with ut_ad()); kernel_mutex ownership is checked likewise.
Parameters:
 trx transaction owning the queue
 sig signal to remove
NOTE(review): the declarator line with the function name (source line
1451) is missing from this listing; the call sites in this file suggest
trx_sig_remove -- confirm. */
1449 UNIV_INTERN
1450 void
1452 /*===========*/
1453  trx_t* trx,
1454  trx_sig_t* sig)
1455 {
1456  ut_ad(trx && sig);
1457  ut_ad(mutex_own(&kernel_mutex));
1458 
1459  ut_ad(sig->receiver == NULL);
1460 
1461  UT_LIST_REMOVE(signals, trx->signals, sig);
1462  sig->type = 0; /* reset the field to catch possible bugs */
1463 
 /* The signal embedded in trx is not a separate allocation and must
 not be passed to mem_free() */
1464  if (sig != &(trx->sig)) {
1465  mem_free(sig);
1466  }
1467 }
1468 
1469 /*********************************************************************/
/* Creates a commit node for a query graph.
Allocates a commit_node_t from the given memory heap, sets its common.type
to QUE_NODE_COMMIT and its state to COMMIT_NODE_SEND, and returns it. The
node is carved out of `heap` with mem_heap_alloc().
Parameter heap: memory heap the node is allocated from.
Returns the new node.
NOTE(review): the return-type and declarator lines (source lines
1473-1474) are missing from this listing; the body returns a
commit_node_t*, and the name is presumably commit_node_create -- confirm. */
1472 UNIV_INTERN
1475 /*===============*/
1476  mem_heap_t* heap)
1477 {
1478  commit_node_t* node;
1479 
1480  node = static_cast<commit_node_t *>(mem_heap_alloc(heap, sizeof(commit_node_t)));
1481  node->common.type = QUE_NODE_COMMIT;
1482  node->state = COMMIT_NODE_SEND;
1483 
1484  return(node);
1485 }
1486 
1487 /***********************************************************/
/* Performs one execution step of a commit node in a query graph.
The node works in two phases, tracked in node->state:
 COMMIT_NODE_SEND: under kernel_mutex the state becomes COMMIT_NODE_WAIT,
   the thread state becomes QUE_THR_SIG_REPLY_WAIT, and a TRX_SIG_COMMIT
   signal with sender TRX_SIG_SELF is sent to the thread's own transaction
   with trx_sig_send(). The function returns next_thr, which is set to
   NULL before the call and may be set by trx_sig_send().
 COMMIT_NODE_WAIT: the state is reset to COMMIT_NODE_SEND, thr->run_node
   is moved to the parent of the node, and thr is returned.
When the node is entered from its parent (thr->prev_node is the parent),
the state is forced to COMMIT_NODE_SEND first.
Parameter thr: the query thread executing the node (thr->run_node).
Returns the query thread to run next as described above.
NOTE(review): the declarator line with the function name (source line
1492) is missing from this listing; presumably trx_commit_step --
confirm. */
1490 UNIV_INTERN
1491 que_thr_t*
1493 /*============*/
1494  que_thr_t* thr)
1495 {
1496  commit_node_t* node;
1497  que_thr_t* next_thr;
1498 
1499  node = static_cast<commit_node_t *>(thr->run_node);
1500 
1501  ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1502 
 /* Control arrived from the parent node: begin a fresh commit */
1503  if (thr->prev_node == que_node_get_parent(node)) {
1504  node->state = COMMIT_NODE_SEND;
1505  }
1506 
1507  if (node->state == COMMIT_NODE_SEND) {
1508  mutex_enter(&kernel_mutex);
1509 
1510  node->state = COMMIT_NODE_WAIT;
1511 
1512  next_thr = NULL;
1513 
1514  thr->state = QUE_THR_SIG_REPLY_WAIT;
1515 
1516  /* Send the commit signal to the transaction */
1517 
1518  trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
1519  thr, NULL, &next_thr);
1520 
1521  mutex_exit(&kernel_mutex);
1522 
1523  return(next_thr);
1524  }
1525 
1526  ut_ad(node->state == COMMIT_NODE_WAIT);
1527 
 /* Second visit: re-arm the node and hand control back to the parent */
1528  node->state = COMMIT_NODE_SEND;
1529 
1530  thr->run_node = que_node_get_parent(node);
1531 
1532  return(thr);
1533 }
1534 
1535 /**********************************************************************/
/* Commits a transaction directly, without going through the signal queue.
Sets trx->op_info to "committing", calls trx_commit_off_kernel(trx) while
holding kernel_mutex, and resets trx->op_info to "" afterwards.
Parameter trx: transaction to commit; must not be NULL (checked with
ut_a()).
Returns DB_SUCCESS on every path shown here.
NOTE(review): the declarator line with the function name (source line
1540) is missing from this listing; presumably trx_commit_for_mysql --
confirm. */
1538 UNIV_INTERN
1539 ulint
1541 /*=================*/
1542  trx_t* trx)
1543 {
1544  /* Because we do not do the commit by sending an Innobase
1545  sig to the transaction, we must here make sure that trx has been
1546  started. */
1547 
1548  ut_a(trx);
1549 
 /* NOTE(review): source line 1550 is absent from this listing; given
 the comment above, the statement that makes sure the trx is started
 presumably stood here -- confirm against the original file. */
1551 
1552  trx->op_info = "committing";
1553 
1554  mutex_enter(&kernel_mutex);
1555 
1556  trx_commit_off_kernel(trx);
1557 
1558  mutex_exit(&kernel_mutex);
1559 
1560  trx->op_info = "";
1561 
1562  return(DB_SUCCESS);
1563 }
1564 
1565 /**********************************************************************/
1569 UNIV_INTERN
1570 ulint
1572 /*==========================*/
1573  trx_t* trx)
1574 {
1575  ib_uint64_t lsn = trx->commit_lsn;
1576 
1577  ut_a(trx);
1578 
1579  trx->op_info = "flushing log";
1580 
1581  if (!trx->must_flush_log_later) {
1582  /* Do nothing */
1583  } else if (srv_flush_log_at_trx_commit == 0) {
1584  /* Do nothing */
1585  } else if (srv_flush_log_at_trx_commit == 1) {
1586  if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1587  /* Write the log but do not flush it to disk */
1588 
1589  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1590  } else {
1591  /* Write the log to the log files AND flush them to
1592  disk */
1593 
1594  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1595  }
1596  } else if (srv_flush_log_at_trx_commit == 2) {
1597 
1598  /* Write the log but do not flush it to disk */
1599 
1600  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1601  } else {
1602  ut_error;
1603  }
1604 
1605  trx->must_flush_log_later = FALSE;
1606 
1607  trx->op_info = "";
1608 
1609  return(0);
1610 }
1611 
1612 /**********************************************************************/
/* Records the current undo number of a transaction in
trx->last_sql_stat_start.least_undo_no.
If the transaction is in the TRX_NOT_STARTED state, trx->undo_no is reset
to 0 first, so 0 is what gets recorded.
Parameter trx: the transaction; must not be NULL (checked with ut_a()).
NOTE(review): the declarator line with the function name (source line
1616) is missing from this listing; presumably trx_mark_sql_stat_end --
confirm. */
1614 UNIV_INTERN
1615 void
1617 /*==================*/
1618  trx_t* trx)
1619 {
1620  ut_a(trx);
1621 
1622  if (trx->conc_state == TRX_NOT_STARTED) {
1623  trx->undo_no = 0;
1624  }
1625 
1626  trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1627 }
1628 
1629 /**********************************************************************/
/* Prints a textual summary of a transaction to a stdio stream.
Output, in order:
 1. a header line: "TRANSACTION <id>", the concurrency state (with the
    number of seconds since trx->start_time for ACTIVE and PREPARED), the
    process number (UNIV_LINUX builds only), the OS thread id, op_info
    when it is non-empty, " recovered trx" / " purge trx" markers, and the
    ticket count when the thread is declared inside InnoDB;
 2. a "mysql tables in locked" line when mysql_n_tables_locked > 0;
 3. a status line made of the queue state, lock struct / lock heap / row
    lock counts, an adaptive hash latch marker and the undo entry count;
    the line is terminated only when something was printed on it;
 4. if trx->mysql_thd is non-NULL, whatever
    innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len) emits.
Parameters:
 f             output stream
 trx           transaction to describe
 max_query_len passed on unchanged to innobase_mysql_print_thd()
The function takes no locks itself.
NOTE(review): the declarator line with the function name (source line
1634) is missing from this listing; presumably trx_print -- confirm. */
1632 UNIV_INTERN
1633 void
1635 /*======*/
1636  FILE* f,
1637  trx_t* trx,
1638  ulint max_query_len)
1640 {
1641  ibool newline;
1642 
1643  fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
1644 
1645  switch (trx->conc_state) {
1646  case TRX_NOT_STARTED:
1647  fputs(", not started", f);
1648  break;
1649  case TRX_ACTIVE:
1650  fprintf(f, ", ACTIVE %lu sec",
1651  (ulong)difftime(time(NULL), trx->start_time));
1652  break;
1653  case TRX_PREPARED:
1654  fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1655  (ulong)difftime(time(NULL), trx->start_time));
1656  break;
1657  case TRX_COMMITTED_IN_MEMORY:
1658  fputs(", COMMITTED IN MEMORY", f);
1659  break;
1660  default:
1661  fprintf(f, " state %lu", (ulong) trx->conc_state);
1662  }
1663 
1664 #ifdef UNIV_LINUX
 /* NOTE(review): unlike the other %lu arguments in this function,
 mysql_process_no is passed without a cast to ulong -- confirm that its
 type matches the format on every supported platform. */
1665  fprintf(f, ", process no %lu", trx->mysql_process_no);
1666 #endif
1667  fprintf(f, ", OS thread id %lu",
1668  (ulong) os_thread_pf(trx->mysql_thread_id));
1669 
 /* op_info is printed only when it is a non-empty string */
1670  if (*trx->op_info) {
1671  putc(' ', f);
1672  fputs(trx->op_info, f);
1673  }
1674 
1675  if (trx->is_recovered) {
1676  fputs(" recovered trx", f);
1677  }
1678 
1679  if (trx->is_purge) {
1680  fputs(" purge trx", f);
1681  }
1682 
1683  if (trx->declared_to_be_inside_innodb) {
1684  fprintf(f, ", thread declared inside InnoDB %lu",
1685  (ulong) trx->n_tickets_to_enter_innodb);
1686  }
1687 
1688  putc('\n', f);
1689 
1690  if (trx->mysql_n_tables_locked > 0) {
1691  fprintf(f, "mysql tables in locked %lu\n",
1692  (ulong) trx->mysql_n_tables_locked);
1693  }
1694 
 /* newline records whether the status line below has any content that
 needs a terminating '\n'; only the RUNNING state prints nothing */
1695  newline = TRUE;
1696 
1697  switch (trx->que_state) {
1698  case TRX_QUE_RUNNING:
1699  newline = FALSE; break;
1700  case TRX_QUE_LOCK_WAIT:
1701  fputs("LOCK WAIT ", f); break;
1702  case TRX_QUE_ROLLING_BACK:
1703  fputs("ROLLING BACK ", f); break;
1704  case TRX_QUE_COMMITTING:
1705  fputs("COMMITTING ", f); break;
1706  default:
1707  fprintf(f, "que state %lu ", (ulong) trx->que_state);
1708  }
1709 
 /* Lock statistics are shown when the trx holds lock structs, or when
 its lock heap has grown beyond 400 even without any */
1710  if (0 < UT_LIST_GET_LEN(trx->trx_locks)
1711  || mem_heap_get_size(trx->lock_heap) > 400) {
1712  newline = TRUE;
1713 
1714  fprintf(f, "%lu lock struct(s), heap size %lu,"
1715  " %lu row lock(s)",
1716  (ulong) UT_LIST_GET_LEN(trx->trx_locks),
1717  (ulong) mem_heap_get_size(trx->lock_heap),
1718  (ulong) lock_number_of_rows_locked(trx));
1719  }
1720 
1721  if (trx->has_search_latch) {
1722  newline = TRUE;
1723  fputs(", holds adaptive hash latch", f);
1724  }
1725 
1726  if (trx->undo_no != 0) {
1727  newline = TRUE;
1728  fprintf(f, ", undo log entries %llu",
1729  (ullint) trx->undo_no);
1730  }
1731 
1732  if (newline) {
1733  putc('\n', f);
1734  }
1735 
1736  if (trx->mysql_thd != NULL) {
1737  innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
1738  }
1739 }
1740 
1741 /*******************************************************************/
/* Compares the weight of two transactions.
A per-transaction flag (a_notrans_edit / b_notrans_edit) is computed first;
per the comment in the body it tells whether the transaction has edited
non-transactional tables, and is false when mysql_thd is NULL. If the two
flags differ, the flag of `a` is returned, so the transaction that has the
flag set counts as the heavier one. Otherwise the result is
TRX_WEIGHT(a) >= TRX_WEIGHT(b).
Parameters:
 a, b the transactions to compare
Returns TRUE if `a` is at least as heavy as `b`.
NOTE(review): this listing has lost several lines of the block: the
declarator with the function name (source line 1748, presumably
trx_weight_ge), the second halves of both flag expressions (source lines
1760 and 1763) and part of the disabled debug print (source lines
1778-1779). The text below is therefore not compilable as shown; restore
the missing lines from the original file. */
1746 UNIV_INTERN
1747 ibool
1749 /*==========*/
1750  const trx_t* a,
1751  const trx_t* b)
1752 {
1753  ibool a_notrans_edit;
1754  ibool b_notrans_edit;
1755 
1756  /* If mysql_thd is NULL for a transaction we assume that it has
1757  not edited non-transactional tables. */
1758 
1759  a_notrans_edit = a->mysql_thd != NULL
 /* NOTE(review): the continuation of this expression (source line 1760)
 is missing from the listing */
1761 
1762  b_notrans_edit = b->mysql_thd != NULL
 /* NOTE(review): the continuation of this expression (source line 1763)
 is missing from the listing */
1764 
1765  if (a_notrans_edit != b_notrans_edit) {
1766 
1767  return(a_notrans_edit);
1768  }
1769 
1770  /* Either both had edited non-transactional tables or both had
1771  not, we fall back to comparing the number of altered/locked
1772  rows. */
1773 
1774 #if 0
1775  fprintf(stderr,
1776  "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
1777  __func__,
1780 #endif
1781 
1782  return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
1783 }
1784 
1785 /****************************************************************/
/* Moves a transaction to the TRX_PREPARED state.
The caller must hold kernel_mutex (checked with ut_ad()); the mutex is
held again on return but is released temporarily inside the function:
 1. If the trx has an insert or update undo log, kernel_mutex is released,
    a mini-transaction is started and, under rseg->mutex, the undo logs
    are processed (see the NOTE(review) comments in the body). The
    mini-transaction is committed, lsn is taken from mtr.end_lsn, and
    kernel_mutex is taken again.
 2. trx->conc_state is set to TRX_PREPARED under kernel_mutex.
 3. If lsn is non-zero, kernel_mutex is released while the log is written
    up to lsn according to srv_flush_log_at_trx_commit (0: nothing;
    1: write and flush, or write only when srv_unix_file_flush_method is
    SRV_UNIX_NOSYNC; 2: write only; anything else: ut_error), and is then
    taken again.
Parameter trx: the transaction to prepare. */
1787 UNIV_INLINE
1788 void
1789 trx_prepare_off_kernel(
1790 /*===================*/
1791  trx_t* trx)
1792 {
1793  trx_rseg_t* rseg;
1794  ib_uint64_t lsn = 0;
1795  mtr_t mtr;
1796 
1797  ut_ad(mutex_own(&kernel_mutex));
1798 
1799  rseg = trx->rseg;
1800 
 /* lsn stays 0 when the trx has no undo logs; the log write further
 down is skipped in that case */
1801  if (trx->insert_undo != NULL || trx->update_undo != NULL) {
1802 
1803  mutex_exit(&kernel_mutex);
1804 
1805  mtr_start(&mtr);
1806 
1807  /* Change the undo log segment states from TRX_UNDO_ACTIVE
1808  to TRX_UNDO_PREPARED: these modifications to the file data
1809  structure define the transaction as prepared in the
1810  file-based world, at the serialization point of lsn. */
1811 
1812  mutex_enter(&(rseg->mutex));
1813 
1814  if (trx->insert_undo != NULL) {
1815 
1816  /* It is not necessary to obtain trx->undo_mutex here
1817  because only a single OS thread is allowed to do the
1818  transaction prepare for this transaction. */
1819 
 /* NOTE(review): the first line of this call (source line 1820) is
 missing from the listing; only its last argument survives below */
1821  &mtr);
1822  }
1823 
1824  if (trx->update_undo) {
 /* NOTE(review): the first line of this call (source line 1825) is
 missing from the listing; only its arguments survive below */
1826  trx, trx->update_undo, &mtr);
1827  }
1828 
1829  mutex_exit(&(rseg->mutex));
1830 
1831  /*--------------*/
1832  mtr_commit(&mtr); /* This mtr commit makes the
1833  transaction prepared in the file-based
1834  world */
1835  /*--------------*/
1836  lsn = mtr.end_lsn;
1837 
1838  mutex_enter(&kernel_mutex);
1839  }
1840 
1841  ut_ad(mutex_own(&kernel_mutex));
1842 
1843  /*--------------------------------------*/
1844  trx->conc_state = TRX_PREPARED;
1845  /*--------------------------------------*/
1846 
1847  if (lsn) {
1848  /* Depending on the my.cnf options, we may now write the log
1849  buffer to the log files, making the prepared state of the
1850  transaction durable if the OS does not crash. We may also
1851  flush the log files to disk, making the prepared state of the
1852  transaction durable also at an OS crash or a power outage.
1853 
1854  The idea in InnoDB's group prepare is that a group of
1855  transactions gather behind a trx doing a physical disk write
1856  to log files, and when that physical write has been completed,
1857  one of those transactions does a write which prepares the whole
1858  group. Note that this group prepare will only bring benefit if
1859  there are > 2 users in the database. Then at least 2 users can
1860  gather behind one doing the physical log write to disk.
1861 
1862  TODO: find out if MySQL holds some mutex when calling this.
1863  That would spoil our group prepare algorithm. */
1864 
1865  mutex_exit(&kernel_mutex);
1866 
1867  if (srv_flush_log_at_trx_commit == 0) {
1868  /* Do nothing */
1869  } else if (srv_flush_log_at_trx_commit == 1) {
1870  if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1871  /* Write the log but do not flush it to disk */
1872 
1873  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
1874  FALSE);
1875  } else {
1876  /* Write the log to the log files AND flush
1877  them to disk */
1878 
1879  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1880  }
1881  } else if (srv_flush_log_at_trx_commit == 2) {
1882 
1883  /* Write the log but do not flush it to disk */
1884 
1885  log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1886  } else {
1887  ut_error;
1888  }
1889 
 /* Restore the locking state the caller expects */
1890  mutex_enter(&kernel_mutex);
1891  }
1892 }
1893 
1894 /**********************************************************************/
/* Prepares a transaction directly, without going through the signal
queue.
Sets trx->op_info to "preparing", calls trx_prepare_off_kernel(trx) while
holding kernel_mutex, and resets trx->op_info to "" afterwards.
Parameter trx: transaction to prepare; must not be NULL (checked with
ut_a()).
Returns 0 on every path shown here.
NOTE(review): the declarator line with the function name (source line
1899) is missing from this listing; presumably trx_prepare_for_mysql --
confirm. */
1897 UNIV_INTERN
1898 ulint
1900 /*==================*/
1901  trx_t* trx)
1902 {
1903  /* Because we do not do the prepare by sending an Innobase
1904  sig to the transaction, we must here make sure that trx has been
1905  started. */
1906 
1907  ut_a(trx);
1908 
1909  trx->op_info = "preparing";
1910 
 /* NOTE(review): source line 1911 is absent from this listing; given
 the comment above, the statement that makes sure the trx is started
 presumably stood here -- confirm against the original file. */
1912 
1913  mutex_enter(&kernel_mutex);
1914 
1915  trx_prepare_off_kernel(trx);
1916 
1917  mutex_exit(&kernel_mutex);
1918 
1919  trx->op_info = "";
1920 
1921  return(0);
1922 }
1923 
1924 /**********************************************************************/
1928 UNIV_INTERN
1929 int
1931 /*==================*/
1932  XID* xid_list,
1933  ulint len)
1934 {
1935  trx_t* trx;
1936  ulint count = 0;
1937 
1938  ut_ad(xid_list);
1939  ut_ad(len);
1940 
1941  /* We should set those transactions which are in the prepared state
1942  to the xid_list */
1943 
1944  mutex_enter(&kernel_mutex);
1945 
1946  trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
1947 
1948  while (trx) {
1949  if (trx->conc_state == TRX_PREPARED) {
1950  xid_list[count] = trx->xid;
1951 
1952  if (count == 0) {
1953  ut_print_timestamp(stderr);
1954  fprintf(stderr,
1955  " InnoDB: Starting recovery for"
1956  " XA transactions...\n");
1957  }
1958 
1959  ut_print_timestamp(stderr);
1960  fprintf(stderr,
1961  " InnoDB: Transaction " TRX_ID_FMT " in"
1962  " prepared state after recovery\n",
1963  trx->id);
1964 
1965  ut_print_timestamp(stderr);
1966  fprintf(stderr,
1967  " InnoDB: Transaction contains changes"
1968  " to %llu rows\n",
1969  (ullint) trx->undo_no);
1970 
1971  count++;
1972 
1973  if (count == len) {
1974  break;
1975  }
1976  }
1977 
1978  trx = UT_LIST_GET_NEXT(trx_list, trx);
1979  }
1980 
1981  mutex_exit(&kernel_mutex);
1982 
1983  if (count > 0){
1984  ut_print_timestamp(stderr);
1985  fprintf(stderr,
1986  " InnoDB: %lu transactions in prepared state"
1987  " after recovery\n",
1988  (ulong) count);
1989  }
1990 
1991  return ((int) count);
1992 }
1993 
1994 /*******************************************************************/
1998 UNIV_INTERN
1999 trx_t*
2001 /*===============*/
2002  XID* xid)
2003 {
2004  trx_t* trx;
2005 
2006  if (xid == NULL) {
2007 
2008  return (NULL);
2009  }
2010 
2011  mutex_enter(&kernel_mutex);
2012 
2013  trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2014 
2015  while (trx) {
2016  /* Compare two X/Open XA transaction id's: their
2017  length should be the same and binary comparison
2018  of gtrid_length+bqual_length bytes should be
2019  the same */
2020 
2021  if (xid->gtrid_length == trx->xid.gtrid_length
2022  && xid->bqual_length == trx->xid.bqual_length
2023  && memcmp(xid->data, trx->xid.data,
2024  xid->gtrid_length + xid->bqual_length) == 0) {
2025  break;
2026  }
2027 
2028  trx = UT_LIST_GET_NEXT(trx_list, trx);
2029  }
2030 
2031  mutex_exit(&kernel_mutex);
2032 
2033  if (trx) {
2034  if (trx->conc_state != TRX_PREPARED) {
2035 
2036  return(NULL);
2037  }
2038 
2039  return(trx);
2040  } else {
2041  return(NULL);
2042  }
2043 }