Drizzled Public API Documentation

os0file.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4 Copyright (C) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23 St, Fifth Floor, Boston, MA 02110-1301 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************/
34 #include "os0file.h"
35 
36 #ifdef UNIV_NONINL
37 #include "os0file.ic"
38 #endif
39 
40 #include "ut0mem.h"
41 #include "srv0srv.h"
42 #include "srv0start.h"
43 #include "fil0fil.h"
44 #include "buf0buf.h"
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <limits.h>
48 #include <unistd.h>
49 #ifndef UNIV_HOTBACKUP
50 # include "os0sync.h"
51 # include "os0thread.h"
52 #else /* !UNIV_HOTBACKUP */
53 # ifdef __WIN__
54 /* Add includes for the _stat() call to compile on Windows */
55 # include <sys/types.h>
56 # include <sys/stat.h>
57 # endif /* __WIN__ */
58 #endif /* !UNIV_HOTBACKUP */
59 
60 #if defined(LINUX_NATIVE_AIO)
61 #include <libaio.h>
62 #endif
63 
64 /* This specifies the file permissions InnoDB uses when it creates files in
65 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
66 my_umask */
67 
68 #ifndef __WIN__
69 
70 UNIV_INTERN ulint os_innodb_umask
71  = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
72 #else
73 
74 UNIV_INTERN ulint os_innodb_umask = 0;
75 #endif
76 
77 #ifdef UNIV_DO_FLUSH
78 /* If the following is set to TRUE, we do not call os_file_flush in every
79 os_file_write. We can set this TRUE when the doublewrite buffer is used. */
80 UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
81 #else
82 /* We do not call os_file_flush in every os_file_write. */
83 #endif /* UNIV_DO_FLUSH */
84 
85 #ifndef UNIV_HOTBACKUP
86 /* We use these mutexes to protect lseek + file i/o operation, if the
87 OS does not provide an atomic pread or pwrite, or similar */
88 #define OS_FILE_N_SEEK_MUTEXES 16
89 UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
90 
91 /* In simulated aio, merge at most this many consecutive i/os */
92 #define OS_AIO_MERGE_N_CONSECUTIVE 64
93 
94 /**********************************************************************
95 
96 InnoDB AIO Implementation:
97 =========================
98 
99 We support native AIO for Windows and Linux. For the rest of the platforms
100 we simulate AIO with special io-threads that service the IO requests.
101 
102 Simulated AIO:
103 ==============
104 
105 On platforms where we 'simulate' AIO, the following is a rough explanation
106 of the high-level design.
107 There are four io-threads (for ibuf, log, read, write).
108 All synchronous IO requests are serviced by the calling thread using
109 os_file_write/os_file_read. The Asynchronous requests are queued up
110 in an array (there are four such arrays) by the calling thread.
111 Later these requests are picked up by the io-thread and are serviced
112 synchronously.
113 
114 Windows native AIO:
115 ==================
116 
117 If srv_use_native_aio is not set then Windows follows the same
118 code path as simulated AIO. If the flag is set then the native AIO interface
119 is used. On Windows, one of the limitations is that if a file is opened
120 for AIO, no synchronous IO can be done on it. Therefore we have an
121 extra fifth array to queue up synchronous IO requests.
122 There are innodb_file_io_threads helper threads. These threads work
123 on the four arrays mentioned above in Simulated AIO. No thread is
124 required for the sync array.
125 If a synchronous IO request is made, it is first queued in the sync
126 array. Then the calling thread itself waits on the request, thus
127 making the call synchronous.
128 If an AIO request is made, the calling thread not only queues it in the
129 array but also submits the request. The helper thread then collects
130 the completed IO request and calls the completion routine on it.
131 
132 Linux native AIO:
133 =================
134 
135 If we have libaio installed on the system and innodb_use_native_aio
136 is set to TRUE we follow the code path of native AIO, otherwise we
137 do simulated AIO.
138 There are innodb_file_io_threads helper threads. These threads work
139 on the four arrays mentioned above in Simulated AIO.
140 If a synchronous IO request is made, it is handled by calling
141 os_file_write/os_file_read.
142 If an AIO request is made, the calling thread not only queues it in the
143 array but also submits the request. The helper thread then collects
144 the completed IO request and calls the completion routine on it.
145 
146 **********************************************************************/
147 
149 UNIV_INTERN ibool os_aio_print_debug = FALSE;
150 
151 #ifdef UNIV_PFS_IO
152 /* Keys to register InnoDB I/O with performance schema */
153 UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154 UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155 UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156 #endif /* UNIV_PFS_IO */
157 
159 typedef struct os_aio_slot_struct os_aio_slot_t;
160 
163  ibool is_read;
164  ulint pos;
166  ibool reserved;
168  ulint len;
170  byte* buf;
171  ulint type;
172  ulint offset;
174  ulint offset_high;
176  const char* name;
183  void* message2;
187 #ifdef WIN_ASYNC_IO
188  HANDLE handle;
190  OVERLAPPED control;
192 #elif defined(LINUX_NATIVE_AIO)
193  struct iocb control; /* Linux control block for aio */
194  int n_bytes; /* bytes written/read. */
195  int ret; /* AIO return code */
196 #endif
197 };
198 
200 typedef struct os_aio_array_struct os_aio_array_t;
201 
213  ulint n_slots;
216  ulint n_segments;
221  ulint cur_seg;
225  ulint n_reserved;
229 #ifdef __WIN__
230  HANDLE* handles;
237 #endif
238 
239 #if defined(LINUX_NATIVE_AIO)
240  io_context_t* aio_ctx;
241  /* completion queue for IO. There is
242  one such queue per segment. Each thread
243  will work on one ctx exclusively. */
244  struct io_event* aio_events;
245  /* The array to collect completed IOs.
246  There is one such event for each
247  possible pending IO. The size of the
248  array is equal to n_slots. */
249 #endif
250 };
251 
252 #if defined(LINUX_NATIVE_AIO)
253 
254 #define OS_AIO_REAP_TIMEOUT (500000000UL)
255 
257 #define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
258 
260 #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
261 #endif
262 
264 static os_event_t* os_aio_segment_wait_events = NULL;
265 
268 static os_aio_array_t* os_aio_read_array = NULL;
269 static os_aio_array_t* os_aio_write_array = NULL;
270 static os_aio_array_t* os_aio_ibuf_array = NULL;
271 static os_aio_array_t* os_aio_log_array = NULL;
272 static os_aio_array_t* os_aio_sync_array = NULL;
273 /* @} */
274 
276 static ulint os_aio_n_segments = ULINT_UNDEFINED;
277 
280 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
281 #endif /* !UNIV_HOTBACKUP */
282 
283 UNIV_INTERN ulint os_n_file_reads = 0;
284 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
285 UNIV_INTERN ulint os_n_file_writes = 0;
286 UNIV_INTERN ulint os_n_fsyncs = 0;
287 UNIV_INTERN ulint os_n_file_reads_old = 0;
288 UNIV_INTERN ulint os_n_file_writes_old = 0;
289 UNIV_INTERN ulint os_n_fsyncs_old = 0;
290 UNIV_INTERN time_t os_last_printout;
291 
292 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
293 
294 #ifndef UNIV_HOTBACKUP
295 
296 static os_mutex_t os_file_count_mutex;
297 #endif /* !UNIV_HOTBACKUP */
298 
299 UNIV_INTERN ulint os_file_n_pending_preads = 0;
301 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
303 UNIV_INTERN ulint os_n_pending_writes = 0;
305 UNIV_INTERN ulint os_n_pending_reads = 0;
306 
307 /***********************************************************************/
311 UNIV_INTERN
312 ulint
314 /*===================*/
315 {
316 #ifdef __WIN__
317  OSVERSIONINFO os_info;
318 
319  os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
320 
321  ut_a(GetVersionEx(&os_info));
322 
323  if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
324  return(OS_WIN31);
325  } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
326  return(OS_WIN95);
327  } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
328  switch (os_info.dwMajorVersion) {
329  case 3:
330  case 4:
331  return OS_WINNT;
332  case 5:
333  return (os_info.dwMinorVersion == 0) ? OS_WIN2000
334  : OS_WINXP;
335  case 6:
336  return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
337  : OS_WIN7;
338  default:
339  return OS_WIN7;
340  }
341  } else {
342  ut_error;
343  return(0);
344  }
345 #else
346  ut_error;
347 
348  return(0);
349 #endif
350 }
351 
352 /***********************************************************************/
358 UNIV_INTERN
359 ulint
361 /*===================*/
362  ibool report_all_errors)
364 {
365  ulint err;
366 
367 #ifdef __WIN__
368 
369  err = (ulint) GetLastError();
370 
371  if (report_all_errors
372  || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
373 
374  ut_print_timestamp(stderr);
375  fprintf(stderr,
376  " InnoDB: Operating system error number %lu"
377  " in a file operation.\n", (ulong) err);
378 
379  if (err == ERROR_PATH_NOT_FOUND) {
380  fprintf(stderr,
381  "InnoDB: The error means the system"
382  " cannot find the path specified.\n");
383 
384  if (srv_is_being_started) {
385  fprintf(stderr,
386  "InnoDB: If you are installing InnoDB,"
387  " remember that you must create\n"
388  "InnoDB: directories yourself, InnoDB"
389  " does not create them.\n");
390  }
391  } else if (err == ERROR_ACCESS_DENIED) {
392  fprintf(stderr,
393  "InnoDB: The error means mysqld does not have"
394  " the access rights to\n"
395  "InnoDB: the directory. It may also be"
396  " you have created a subdirectory\n"
397  "InnoDB: of the same name as a data file.\n");
398  } else if (err == ERROR_SHARING_VIOLATION
399  || err == ERROR_LOCK_VIOLATION) {
400  fprintf(stderr,
401  "InnoDB: The error means that another program"
402  " is using InnoDB's files.\n"
403  "InnoDB: This might be a backup or antivirus"
404  " software or another instance\n"
405  "InnoDB: of MySQL."
406  " Please close it to get rid of this error.\n");
407  } else if (err == ERROR_WORKING_SET_QUOTA
408  || err == ERROR_NO_SYSTEM_RESOURCES) {
409  fprintf(stderr,
410  "InnoDB: The error means that there are no"
411  " sufficient system resources or quota to"
412  " complete the operation.\n");
413  } else if (err == ERROR_OPERATION_ABORTED) {
414  fprintf(stderr,
415  "InnoDB: The error means that the I/O"
416  " operation has been aborted\n"
417  "InnoDB: because of either a thread exit"
418  " or an application request.\n"
419  "InnoDB: Retry attempt is made.\n");
420  } else {
421  fprintf(stderr,
422  "InnoDB: Some operating system error numbers"
423  " are described at\n"
424  "InnoDB: "
425  REFMAN
426  "operating-system-error-codes.html\n");
427  }
428  }
429 
430  fflush(stderr);
431 
432  if (err == ERROR_FILE_NOT_FOUND) {
433  return(OS_FILE_NOT_FOUND);
434  } else if (err == ERROR_DISK_FULL) {
435  return(OS_FILE_DISK_FULL);
436  } else if (err == ERROR_FILE_EXISTS) {
437  return(OS_FILE_ALREADY_EXISTS);
438  } else if (err == ERROR_SHARING_VIOLATION
439  || err == ERROR_LOCK_VIOLATION) {
440  return(OS_FILE_SHARING_VIOLATION);
441  } else if (err == ERROR_WORKING_SET_QUOTA
442  || err == ERROR_NO_SYSTEM_RESOURCES) {
443  return(OS_FILE_INSUFFICIENT_RESOURCE);
444  } else if (err == ERROR_OPERATION_ABORTED) {
445  return(OS_FILE_OPERATION_ABORTED);
446  } else {
447  return(100 + err);
448  }
449 #else
450  err = (ulint) errno;
451 
452  if (report_all_errors
453  || (err != ENOSPC && err != EEXIST)) {
454 
455  ut_print_timestamp(stderr);
456  fprintf(stderr,
457  " InnoDB: Operating system error number %lu"
458  " in a file operation.\n", (ulong) err);
459 
460  if (err == ENOENT) {
461  fprintf(stderr,
462  "InnoDB: The error means the system"
463  " cannot find the path specified.\n");
464 
465  if (srv_is_being_started) {
466  fprintf(stderr,
467  "InnoDB: If you are installing InnoDB,"
468  " remember that you must create\n"
469  "InnoDB: directories yourself, InnoDB"
470  " does not create them.\n");
471  }
472  } else if (err == EACCES) {
473  fprintf(stderr,
474  "InnoDB: The error means mysqld does not have"
475  " the access rights to\n"
476  "InnoDB: the directory.\n");
477  } else {
478  if (strerror((int)err) != NULL) {
479  fprintf(stderr,
480  "InnoDB: Error number %lu"
481  " means '%s'.\n",
482  err, strerror((int)err));
483  }
484 
485  fprintf(stderr,
486  "InnoDB: Some operating system"
487  " error numbers are described at\n"
488  "InnoDB: "
489  REFMAN
490  "operating-system-error-codes.html\n");
491  }
492  }
493 
494  fflush(stderr);
495 
496  switch (err) {
497  case ENOSPC:
498  return(OS_FILE_DISK_FULL);
499  case ENOENT:
500  return(OS_FILE_NOT_FOUND);
501  case EEXIST:
502  return(OS_FILE_ALREADY_EXISTS);
503  case EXDEV:
504  case ENOTDIR:
505  case EISDIR:
506  return(OS_FILE_PATH_ERROR);
507  case EAGAIN:
508  if (srv_use_native_aio) {
509  return(OS_FILE_AIO_RESOURCES_RESERVED);
510  }
511  break;
512  case EINTR:
513  if (srv_use_native_aio) {
514  return(OS_FILE_AIO_INTERRUPTED);
515  }
516  break;
517  }
518  return(100 + err);
519 #endif
520 }
521 
522 /****************************************************************/
527 static
528 ibool
529 os_file_handle_error_cond_exit(
530 /*===========================*/
531  const char* name,
532  const char* operation,
533  ibool should_exit)
535 {
536  ulint err;
537 
538  err = os_file_get_last_error(FALSE);
539 
540  if (err == OS_FILE_DISK_FULL) {
541  /* We only print a warning about disk full once */
542 
543  if (os_has_said_disk_full) {
544 
545  return(FALSE);
546  }
547 
548  if (name) {
549  ut_print_timestamp(stderr);
550  fprintf(stderr,
551  " InnoDB: Encountered a problem with"
552  " file %s\n", name);
553  }
554 
555  ut_print_timestamp(stderr);
556  fprintf(stderr,
557  " InnoDB: Disk is full. Try to clean the disk"
558  " to free space.\n");
559 
560  os_has_said_disk_full = TRUE;
561 
562  fflush(stderr);
563 
564  return(FALSE);
565  } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
566 
567  return(TRUE);
568  } else if (err == OS_FILE_AIO_INTERRUPTED) {
569 
570  return(TRUE);
571  } else if (err == OS_FILE_ALREADY_EXISTS
572  || err == OS_FILE_PATH_ERROR) {
573 
574  return(FALSE);
575  } else if (err == OS_FILE_SHARING_VIOLATION) {
576 
577  os_thread_sleep(10000000); /* 10 sec */
578  return(TRUE);
579  } else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
580 
581  os_thread_sleep(100000); /* 100 ms */
582  return(TRUE);
583  } else if (err == OS_FILE_OPERATION_ABORTED) {
584 
585  os_thread_sleep(100000); /* 100 ms */
586  return(TRUE);
587  } else {
588  if (name) {
589  fprintf(stderr, "InnoDB: File name %s\n", name);
590  }
591 
592  fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
593  operation);
594 
595  if (should_exit) {
596  fprintf(stderr, "InnoDB: Cannot continue operation.\n");
597 
598  fflush(stderr);
599 
600  exit(1);
601  }
602  }
603 
604  return(FALSE);
605 }
606 
607 /****************************************************************/
610 static
611 ibool
612 os_file_handle_error(
613 /*=================*/
614  const char* name,
615  const char* operation)
616 {
617  /* exit in case of unknown error */
618  return(os_file_handle_error_cond_exit(name, operation, TRUE));
619 }
620 
621 /****************************************************************/
624 static
625 ibool
626 os_file_handle_error_no_exit(
627 /*=========================*/
628  const char* name,
629  const char* operation)
630 {
631  /* don't exit in case of unknown error */
632  return(os_file_handle_error_cond_exit(name, operation, FALSE));
633 }
634 
635 #undef USE_FILE_LOCK
636 #define USE_FILE_LOCK
637 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
638 /* InnoDB Hot Backup does not lock the data files.
639  * On Windows, mandatory locking is used.
640  */
641 # undef USE_FILE_LOCK
642 #endif
643 #ifdef USE_FILE_LOCK
644 /****************************************************************/
647 static
648 int
649 os_file_lock(
650 /*=========*/
651  int fd,
652  const char* name)
653 {
654  struct flock lk;
655 
656  if (srv_read_only)
657  return 0;
658 
659  lk.l_type = F_WRLCK;
660  lk.l_whence = SEEK_SET;
661  lk.l_start = lk.l_len = 0;
662  if (fcntl(fd, F_SETLK, &lk) == -1) {
663  fprintf(stderr,
664  "InnoDB: Unable to lock %s, error: %d\n", name, errno);
665 
666  if (errno == EAGAIN || errno == EACCES) {
667  fprintf(stderr,
668  "InnoDB: Check that you do not already have"
669  " another drizzled process\n"
670  "InnoDB: using the same InnoDB data"
671  " or log files.\n");
672  }
673 
674  return(-1);
675  }
676 
677  return(0);
678 }
679 #endif /* USE_FILE_LOCK */
680 
681 #ifndef UNIV_HOTBACKUP
682 /****************************************************************/
684 UNIV_INTERN
685 void
687 /*===================*/
688 {
689  ulint i;
690 
691  os_file_count_mutex = os_mutex_create();
692 
693  for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
694  os_file_seek_mutexes[i] = os_mutex_create();
695  }
696 }
697 
698 /***********************************************************************/
702 UNIV_INTERN
703 FILE*
705 /*========================*/
706 {
707  FILE* file = NULL;
708  int fd = innobase_mysql_tmpfile();
709 
710  if (fd >= 0) {
711  file = fdopen(fd, "w+b");
712  }
713 
714  if (!file) {
715  ut_print_timestamp(stderr);
716  fprintf(stderr,
717  " InnoDB: Error: unable to create temporary file;"
718  " errno: %d\n", errno);
719  if (fd >= 0) {
720  close(fd);
721  }
722  }
723 
724  return(file);
725 }
726 #endif /* !UNIV_HOTBACKUP */
727 
728 /***********************************************************************/
734 UNIV_INTERN
737 /*============*/
738  const char* dirname,
740  ibool error_is_fatal)
745 {
746  os_file_dir_t dir;
747 #ifdef __WIN__
748  LPWIN32_FIND_DATA lpFindFileData;
749  char path[OS_FILE_MAX_PATH + 3];
750 
751  ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
752 
753  strcpy(path, dirname);
754  strcpy(path + strlen(path), "\\*");
755 
756  /* Note that in Windows opening the 'directory stream' also retrieves
757  the first entry in the directory. Since it is '.', that is no problem,
758  as we will skip over the '.' and '..' entries anyway. */
759 
760  lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
761 
762  dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
763 
764  ut_free(lpFindFileData);
765 
766  if (dir == INVALID_HANDLE_VALUE) {
767 
768  if (error_is_fatal) {
769  os_file_handle_error(dirname, "opendir");
770  }
771 
772  return(NULL);
773  }
774 
775  return(dir);
776 #else
777  dir = opendir(dirname);
778 
779  if (dir == NULL && error_is_fatal) {
780  os_file_handle_error(dirname, "opendir");
781  }
782 
783  return(dir);
784 #endif
785 }
786 
787 /***********************************************************************/
790 UNIV_INTERN
791 int
793 /*=============*/
794  os_file_dir_t dir)
795 {
796 #ifdef __WIN__
797  BOOL ret;
798 
799  ret = FindClose(dir);
800 
801  if (!ret) {
802  os_file_handle_error_no_exit(NULL, "closedir");
803 
804  return(-1);
805  }
806 
807  return(0);
808 #else
809  int ret;
810 
811  ret = closedir(dir);
812 
813  if (ret) {
814  os_file_handle_error_no_exit(NULL, "closedir");
815  }
816 
817  return(ret);
818 #endif
819 }
820 
821 /***********************************************************************/
825 UNIV_INTERN
826 int
828 /*======================*/
829  const char* dirname,
830  os_file_dir_t dir,
831  os_file_stat_t* info)
832 {
833 #ifdef __WIN__
834  LPWIN32_FIND_DATA lpFindFileData;
835  BOOL ret;
836 
837  lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
838 next_file:
839  ret = FindNextFile(dir, lpFindFileData);
840 
841  if (ret) {
842  ut_a(strlen((char *) lpFindFileData->cFileName)
843  < OS_FILE_MAX_PATH);
844 
845  if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
846  || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
847 
848  goto next_file;
849  }
850 
851  strcpy(info->name, (char *) lpFindFileData->cFileName);
852 
853  info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
854  + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
855  << 32);
856 
857  if (lpFindFileData->dwFileAttributes
858  & FILE_ATTRIBUTE_REPARSE_POINT) {
859  /* TODO: test Windows symlinks */
860  /* TODO: MySQL has apparently its own symlink
861  implementation in Windows, dbname.sym can
862  redirect a database directory:
863  REFMAN "windows-symbolic-links.html" */
864  info->type = OS_FILE_TYPE_LINK;
865  } else if (lpFindFileData->dwFileAttributes
866  & FILE_ATTRIBUTE_DIRECTORY) {
867  info->type = OS_FILE_TYPE_DIR;
868  } else {
869  /* It is probably safest to assume that all other
870  file types are normal. Better to check them rather
871  than blindly skip them. */
872 
873  info->type = OS_FILE_TYPE_FILE;
874  }
875  }
876 
877  ut_free(lpFindFileData);
878 
879  if (ret) {
880  return(0);
881  } else if (GetLastError() == ERROR_NO_MORE_FILES) {
882 
883  return(1);
884  } else {
885  os_file_handle_error_no_exit(dirname,
886  "readdir_next_file");
887  return(-1);
888  }
889 #else
890  struct dirent* ent;
891  char* full_path;
892  int ret;
893  struct stat statinfo;
894 #ifdef HAVE_READDIR_R
895  char dirent_buf[sizeof(struct dirent)
896  + _POSIX_PATH_MAX + 100];
897  /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
898  the max file name len; but in most standards, the
899  length is NAME_MAX; we add 100 to be even safer */
900 #endif
901 
902 next_file:
903 
904 #ifdef HAVE_READDIR_R
905  ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
906 
907  if (ret != 0
908 #ifdef UNIV_AIX
909  /* On AIX, a readdir_r() call has failed only if we got both a
910  non-NULL 'ent' (result) value and a non-zero 'ret' (return)
911  value. A NULL 'ent' with a non-zero 'ret' indicates that
912  the "end of the directory" has been reached. */
913  && ent != NULL
914 #endif
915  ) {
916  fprintf(stderr,
917  "InnoDB: cannot read directory %s, error %lu\n",
918  dirname, (ulong)ret);
919 
920  return(-1);
921  }
922 
923  if (ent == NULL) {
924  /* End of directory */
925 
926  return(1);
927  }
928 
929  ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
930 #else
931  ent = readdir(dir);
932 
933  if (ent == NULL) {
934 
935  return(1);
936  }
937 #endif
938  ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
939 
940  if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
941 
942  goto next_file;
943  }
944 
945  strcpy(info->name, ent->d_name);
946 
947  full_path = static_cast<char* >(ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
948 
949  sprintf(full_path, "%s/%s", dirname, ent->d_name);
950 
951  ret = stat(full_path, &statinfo);
952 
953  if (ret) {
954 
955  if (errno == ENOENT) {
956  /* readdir() returned a file that does not exist,
957  it must have been deleted in the meantime. Do what
958  would have happened if the file was deleted before
959  readdir() - ignore and go to the next entry.
960  If this is the last entry then info->name will still
961  contain the name of the deleted file when this
962  function returns, but this is not an issue since the
963  caller shouldn't be looking at info when end of
964  directory is returned. */
965 
966  ut_free(full_path);
967 
968  goto next_file;
969  }
970 
971  os_file_handle_error_no_exit(full_path, "stat");
972 
973  ut_free(full_path);
974 
975  return(-1);
976  }
977 
978  info->size = (ib_int64_t)statinfo.st_size;
979 
980  if (S_ISDIR(statinfo.st_mode)) {
981  info->type = OS_FILE_TYPE_DIR;
982  } else if (S_ISLNK(statinfo.st_mode)) {
983  info->type = OS_FILE_TYPE_LINK;
984  } else if (S_ISREG(statinfo.st_mode)) {
985  info->type = OS_FILE_TYPE_FILE;
986  } else {
987  info->type = OS_FILE_TYPE_UNKNOWN;
988  }
989 
990  ut_free(full_path);
991 
992  return(0);
993 #endif
994 }
995 
996 /*****************************************************************/
1002 UNIV_INTERN
1003 ibool
1005 /*=====================*/
1006  const char* pathname,
1008  ibool fail_if_exists)
1010 {
1011 #ifdef __WIN__
1012  BOOL rcode;
1013 
1014  rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1015  if (!(rcode != 0
1016  || (GetLastError() == ERROR_ALREADY_EXISTS
1017  && !fail_if_exists))) {
1018  /* failure */
1019  os_file_handle_error(pathname, "CreateDirectory");
1020 
1021  return(FALSE);
1022  }
1023 
1024  return (TRUE);
1025 #else
1026  int rcode;
1027 
1028  rcode = mkdir(pathname, 0770);
1029 
1030  if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1031  /* failure */
1032  os_file_handle_error(pathname, "mkdir");
1033 
1034  return(FALSE);
1035  }
1036 
1037  return (TRUE);
1038 #endif
1039 }
1040 
1041 /****************************************************************/
1047 UNIV_INTERN
1048 os_file_t
1050 /*=======================*/
1051  const char* name,
1053  ulint create_mode,
1060  ulint access_type,
1062  ibool* success)
1063 {
1064 #ifdef __WIN__
1065  os_file_t file;
1066  DWORD create_flag;
1067  DWORD access;
1068  DWORD attributes = 0;
1069  ibool retry;
1070 
1071 try_again:
1072  ut_a(name);
1073 
1074  if (create_mode == OS_FILE_OPEN) {
1075  create_flag = OPEN_EXISTING;
1076  } else if (create_mode == OS_FILE_CREATE) {
1077  create_flag = CREATE_NEW;
1078  } else if (create_mode == OS_FILE_CREATE_PATH) {
1079  /* create subdirs along the path if needed */
1080  *success = os_file_create_subdirs_if_needed(name);
1081  if (!*success) {
1082  ut_error;
1083  }
1084  create_flag = CREATE_NEW;
1085  create_mode = OS_FILE_CREATE;
1086  } else {
1087  create_flag = 0;
1088  ut_error;
1089  }
1090 
1091  if (access_type == OS_FILE_READ_ONLY) {
1092  access = GENERIC_READ;
1093  } else if (access_type == OS_FILE_READ_WRITE) {
1094  access = GENERIC_READ | GENERIC_WRITE;
1095  } else {
1096  access = 0;
1097  ut_error;
1098  }
1099 
1100  file = CreateFile((LPCTSTR) name,
1101  access,
1102  FILE_SHARE_READ | FILE_SHARE_WRITE,
1103  /* file can be read and written also
1104  by other processes */
1105  NULL, /* default security attributes */
1106  create_flag,
1107  attributes,
1108  NULL);
1110  if (file == INVALID_HANDLE_VALUE) {
1111  *success = FALSE;
1112 
1113  retry = os_file_handle_error(name,
1114  create_mode == OS_FILE_OPEN ?
1115  "open" : "create");
1116  if (retry) {
1117  goto try_again;
1118  }
1119  } else {
1120  *success = TRUE;
1121  }
1122 
1123  return(file);
1124 #else /* __WIN__ */
1125  os_file_t file;
1126  int create_flag;
1127  ibool retry;
1128 
1129 try_again:
1130  ut_a(name);
1131 
1132  if (create_mode == OS_FILE_OPEN) {
1133  if (access_type == OS_FILE_READ_ONLY) {
1134  create_flag = O_RDONLY;
1135  } else {
1136  create_flag = O_RDWR;
1137  }
1138  } else if (create_mode == OS_FILE_CREATE) {
1139  create_flag = O_RDWR | O_CREAT | O_EXCL;
1140  } else if (create_mode == OS_FILE_CREATE_PATH) {
1141  /* create subdirs along the path if needed */
1142  *success = os_file_create_subdirs_if_needed(name);
1143  if (!*success) {
1144  return (-1);
1145  }
1146  create_flag = O_RDWR | O_CREAT | O_EXCL;
1147  create_mode = OS_FILE_CREATE;
1148  } else {
1149  create_flag = 0;
1150  ut_error;
1151  }
1152 
1153  if (create_mode == OS_FILE_CREATE) {
1154  file = open(name, create_flag, S_IRUSR | S_IWUSR
1155  | S_IRGRP | S_IWGRP);
1156  } else {
1157  file = open(name, create_flag);
1158  }
1159 
1160  if (file == -1) {
1161  *success = FALSE;
1162 
1163  retry = os_file_handle_error(name,
1164  create_mode == OS_FILE_OPEN ?
1165  "open" : "create");
1166  if (retry) {
1167  goto try_again;
1168  }
1169 #ifdef USE_FILE_LOCK
1170  } else if (access_type == OS_FILE_READ_WRITE
1171  && os_file_lock(file, name)) {
1172  *success = FALSE;
1173  close(file);
1174  file = -1;
1175 #endif
1176  } else {
1177  *success = TRUE;
1178  }
1179 
1180  return(file);
1181 #endif /* __WIN__ */
1182 }
1183 
1184 /****************************************************************/
1190 UNIV_INTERN
1191 os_file_t
1193 /*=========================================*/
1194  const char* name,
1196  ulint create_mode,
1200  ulint access_type,
1204  ibool* success)
1205 {
1206 #ifdef __WIN__
1207  os_file_t file;
1208  DWORD create_flag;
1209  DWORD access;
1210  DWORD attributes = 0;
1211  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1212 
1213  ut_a(name);
1214 
1215  if (create_mode == OS_FILE_OPEN) {
1216  create_flag = OPEN_EXISTING;
1217  } else if (create_mode == OS_FILE_CREATE) {
1218  create_flag = CREATE_NEW;
1219  } else {
1220  create_flag = 0;
1221  ut_error;
1222  }
1223 
1224  if (access_type == OS_FILE_READ_ONLY) {
1225  access = GENERIC_READ;
1226  } else if (access_type == OS_FILE_READ_WRITE) {
1227  access = GENERIC_READ | GENERIC_WRITE;
1228  } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1229  access = GENERIC_READ;
1230  share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1231  | FILE_SHARE_WRITE;
1235  } else {
1236  access = 0;
1237  ut_error;
1238  }
1239 
1240  file = CreateFile((LPCTSTR) name,
1241  access,
1242  share_mode,
1243  NULL, /* default security attributes */
1244  create_flag,
1245  attributes,
1246  NULL);
1248  if (file == INVALID_HANDLE_VALUE) {
1249  *success = FALSE;
1250  } else {
1251  *success = TRUE;
1252  }
1253 
1254  return(file);
1255 #else /* __WIN__ */
1256  os_file_t file;
1257  int create_flag;
1258 
1259  ut_a(name);
1260 
1261  if (create_mode == OS_FILE_OPEN) {
1262  if (access_type == OS_FILE_READ_ONLY) {
1263  create_flag = O_RDONLY;
1264  } else {
1265  create_flag = O_RDWR;
1266  }
1267  } else if (create_mode == OS_FILE_CREATE) {
1268  create_flag = O_RDWR | O_CREAT | O_EXCL;
1269  } else {
1270  create_flag = 0;
1271  ut_error;
1272  }
1273 
1274  if (create_mode == OS_FILE_CREATE) {
1275  file = open(name, create_flag, S_IRUSR | S_IWUSR
1276  | S_IRGRP | S_IWGRP);
1277  } else {
1278  file = open(name, create_flag);
1279  }
1280 
1281  if (file == -1) {
1282  *success = FALSE;
1283 #ifdef USE_FILE_LOCK
1284  } else if (access_type == OS_FILE_READ_WRITE
1285  && os_file_lock(file, name)) {
1286  *success = FALSE;
1287  close(file);
1288  file = -1;
1289 #endif
1290  } else {
1291  *success = TRUE;
1292  }
1293 
1294  return(file);
1295 #endif /* __WIN__ */
1296 }
1297 
1298 /****************************************************************/
1300 UNIV_INTERN
1301 void
1303 /*================*/
1304  int fd,
1305  const char* file_name,
1306  const char* operation_name)
1309 {
1310  /* some versions of Solaris may not have DIRECTIO_ON */
1311 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1312  if (directio(fd, DIRECTIO_ON) == -1) {
1313  int errno_save;
1314  errno_save = (int)errno;
1315  ut_print_timestamp(stderr);
1316  fprintf(stderr,
1317  " InnoDB: Failed to set DIRECTIO_ON "
1318  "on file %s: %s: %s, continuing anyway\n",
1319  file_name, operation_name, strerror(errno_save));
1320  }
1321 #elif defined(O_DIRECT)
1322  if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1323  int errno_save;
1324  errno_save = (int)errno;
1325  ut_print_timestamp(stderr);
1326  fprintf(stderr,
1327  " InnoDB: Failed to set O_DIRECT "
1328  "on file %s: %s: %s, continuing anyway\n",
1329  file_name, operation_name, strerror(errno_save));
1330  if (errno_save == EINVAL) {
1331  ut_print_timestamp(stderr);
1332  fprintf(stderr,
1333  " InnoDB: O_DIRECT is known to result in "
1334  "'Invalid argument' on Linux on tmpfs, "
1335  "see MySQL Bug#26662\n");
1336  }
1337  }
1338 #else /* Required for OSX */
1339  (void)fd;
1340  (void)file_name;
1341  (void)operation_name;
1342 #endif
1343 }
1344 
1345 /****************************************************************/
/* Opens an existing file or creates a new one and returns its handle.
create_mode selects the action: OS_FILE_OPEN, OS_FILE_OPEN_RETRY and
OS_FILE_OPEN_RAW open an existing file, OS_FILE_CREATE creates a file that
must not exist yet, OS_FILE_OVERWRITE creates or truncates; any other value
ends in ut_error. purpose must be OS_FILE_AIO or OS_FILE_NORMAL and type
must be OS_LOG_FILE or OS_DATA_FILE (asserted on the non-Windows path).
*success is set to TRUE or FALSE. On failure the error handler is consulted
and the whole open is retried when it asks for that.
Returns the file handle; on the non-Windows path -1 on failure, on Windows
the INVALID_HANDLE_VALUE returned by CreateFile.
NOTE(review): the declarator line holding the function name is missing from
this listing; presumably os_file_create_func - confirm against upstream. */
1351 UNIV_INTERN
1352 os_file_t
1354 /*================*/
1355  const char* name,
1357  ulint create_mode,
1365  ulint purpose,
1372  ulint type,
1373  ibool* success)
1374 {
1375 #ifdef __WIN__
1376  os_file_t file;
1377  DWORD share_mode = FILE_SHARE_READ;
1378  DWORD create_flag;
1379  DWORD attributes;
1380  ibool retry;
1381 try_again:
1382  ut_a(name);
1383 
/* Translate create_mode into the CreateFile creation disposition. */
1384  if (create_mode == OS_FILE_OPEN_RAW) {
1385  create_flag = OPEN_EXISTING;
1386  share_mode = FILE_SHARE_WRITE;
1387  } else if (create_mode == OS_FILE_OPEN
1388  || create_mode == OS_FILE_OPEN_RETRY) {
1389  create_flag = OPEN_EXISTING;
1390  } else if (create_mode == OS_FILE_CREATE) {
1391  create_flag = CREATE_NEW;
1392  } else if (create_mode == OS_FILE_OVERWRITE) {
1393  create_flag = CREATE_ALWAYS;
1394  } else {
1395  create_flag = 0;
1396  ut_error;
1397  }
1398 
/* Choose the file attributes: overlapped I/O only for OS_FILE_AIO with
native AIO enabled, and unbuffered I/O as configured. */
1399  if (purpose == OS_FILE_AIO) {
1400  /* If specified, use asynchronous (overlapped) io and no
1401  buffering of writes in the OS */
1402  attributes = 0;
1403 #ifdef WIN_ASYNC_IO
1404  if (srv_use_native_aio) {
1405  attributes = attributes | FILE_FLAG_OVERLAPPED;
1406  }
1407 #endif
1408 #ifdef UNIV_NON_BUFFERED_IO
1409 # ifndef UNIV_HOTBACKUP
1410  if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1411  /* Do not use unbuffered i/o to log files because
1412  value 2 denotes that we do not flush the log at every
1413  commit, but only once per second */
1414  } else if (srv_win_file_flush_method
1415  == SRV_WIN_IO_UNBUFFERED) {
1416  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1417  }
1418 # else /* !UNIV_HOTBACKUP */
1419  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1420 # endif /* !UNIV_HOTBACKUP */
1421 #endif /* UNIV_NON_BUFFERED_IO */
1422  } else if (purpose == OS_FILE_NORMAL) {
1423  attributes = 0;
1424 #ifdef UNIV_NON_BUFFERED_IO
1425 # ifndef UNIV_HOTBACKUP
1426  if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1427  /* Do not use unbuffered i/o to log files because
1428  value 2 denotes that we do not flush the log at every
1429  commit, but only once per second */
1430  } else if (srv_win_file_flush_method
1431  == SRV_WIN_IO_UNBUFFERED) {
1432  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1433  }
1434 # else /* !UNIV_HOTBACKUP */
1435  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1436 # endif /* !UNIV_HOTBACKUP */
1437 #endif /* UNIV_NON_BUFFERED_IO */
1438  } else {
1439  attributes = 0;
1440  ut_error;
1441  }
1442 
1443  file = CreateFile((LPCTSTR) name,
1444  GENERIC_READ | GENERIC_WRITE, /* read and write
1445  access */
1446  share_mode, /* File can be read also by other
1447  processes; we must give the read
1448  permission because of ibbackup. We do
1449  not give the write permission to
1450  others because if one would succeed to
1451  start 2 instances of mysqld on the
1452  SAME files, that could cause severe
1453  database corruption! When opening
1454  raw disk partitions, Microsoft manuals
1455  say that we must give also the write
1456  permission. */
1457  NULL, /* default security attributes */
1458  create_flag,
1459  attributes,
1460  NULL);
1462  if (file == INVALID_HANDLE_VALUE) {
1463  *success = FALSE;
1464 
1465  /* When srv_file_per_table is on, file creation failure may not
1466  be critical to the whole instance. Do not crash the server in
1467  case of unknown errors.
1468  Please note "srv_file_per_table" is a global variable with
1469  no explicit synchronization protection. It could be
1470  changed during this execution path. It might not have the
1471  same value as the one when building the table definition */
1472  if (srv_file_per_table) {
1473  retry = os_file_handle_error_no_exit(name,
1474  create_mode == OS_FILE_CREATE ?
1475  "create" : "open");
1476  } else {
1477  retry = os_file_handle_error(name,
1478  create_mode == OS_FILE_CREATE ?
1479  "create" : "open");
1480  }
1481 
/* The handler decides whether the whole open is attempted again. */
1482  if (retry) {
1483  goto try_again;
1484  }
1485  } else {
1486  *success = TRUE;
1487  }
1488 
1489  return(file);
1490 #else /* __WIN__ */
1491  os_file_t file;
1492  int create_flag;
1493  ibool retry;
/* Label passed to os_file_set_nocache() for its diagnostic message. */
1494  const char* mode_str = NULL;
1495 
1496 try_again:
1497  ut_a(name);
1498 
/* Translate create_mode into open() flags; opening an existing file is
read-only when srv_read_only is set. */
1499  if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1500  || create_mode == OS_FILE_OPEN_RETRY) {
1501  mode_str = "OPEN";
1502  if (srv_read_only)
1503  create_flag = O_RDONLY;
1504  else
1505  create_flag = O_RDWR;
1506  } else if (create_mode == OS_FILE_CREATE) {
1507  mode_str = "CREATE";
1508  create_flag = O_RDWR | O_CREAT | O_EXCL;
1509  } else if (create_mode == OS_FILE_OVERWRITE) {
1510  mode_str = "OVERWRITE";
1511  create_flag = O_RDWR | O_CREAT | O_TRUNC;
1512  } else {
1513  create_flag = 0;
1514  ut_error;
1515  }
1516 
1517  ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1518  ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1519 
1520 #ifdef O_SYNC
1521  /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1522  O_SYNC because the datasync options seemed to corrupt files in 2001
1523  in both Linux and Solaris */
1524  if (type == OS_LOG_FILE
1525  && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1526 
1527 # if 0
1528  fprintf(stderr, "Using O_SYNC for file %s\n", name);
1529 # endif
1530 
1531  create_flag = create_flag | O_SYNC;
1532  }
1533 #endif /* O_SYNC */
1534 
/* The third argument gives the permission bits, used only when the file
is actually created. */
1535  file = open(name, create_flag, os_innodb_umask);
1536 
1537  if (file == -1) {
1538  *success = FALSE;
1539 
1540  /* When srv_file_per_table is on, file creation failure may not
1541  be critical to the whole instance. Do not crash the server in
1542  case of unknown errors.
1543  Please note "srv_file_per_table" is a global variable with
1544  no explicit synchronization protection. It could be
1545  changed during this execution path. It might not have the
1546  same value as the one when building the table definition */
1547  if (srv_file_per_table) {
1548  retry = os_file_handle_error_no_exit(name,
1549  create_mode == OS_FILE_CREATE ?
1550  "create" : "open");
1551  } else {
1552  retry = os_file_handle_error(name,
1553  create_mode == OS_FILE_CREATE ?
1554  "create" : "open");
1555  }
1556 
1557  if (retry) {
1558  goto try_again;
1559  } else {
1560  return(file /* -1 */);
1561  }
1562  }
1563  /* else */
1564 
1565  *success = TRUE;
1566 
1567  /* We disable OS caching (O_DIRECT) only on data files */
1568  if (type != OS_LOG_FILE
1569  && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1570 
1571  os_file_set_nocache(file, name, mode_str);
1572  }
1573 
1574  /* With ALL_O_DIRECT we disable OS caching for trx log file too */
1575  if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
1576  os_file_set_nocache(file, name, mode_str);
1577  }
1578 
1579 #ifdef USE_FILE_LOCK
/* os_file_lock() returning nonzero is treated as failure to lock. Raw
opens are never locked. */
1580  if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1581 
1582  if (create_mode == OS_FILE_OPEN_RETRY) {
1583  int i;
1584  ut_print_timestamp(stderr);
1585  fputs(" InnoDB: Retrying to lock"
1586  " the first data file\n",
1587  stderr);
/* Up to 100 further attempts, sleeping 1000000 microseconds before each. */
1588  for (i = 0; i < 100; i++) {
1589  os_thread_sleep(1000000);
1590  if (!os_file_lock(file, name)) {
1591  *success = TRUE;
1592  return(file);
1593  }
1594  }
1595  ut_print_timestamp(stderr);
1596  fputs(" InnoDB: Unable to open the first data file\n",
1597  stderr);
1598  }
1599 
/* Locking failed: report failure and do not leak the descriptor. */
1600  *success = FALSE;
1601  close(file);
1602  file = -1;
1603  }
1604 #endif /* USE_FILE_LOCK */
1605 
1606  return(file);
1607 #endif /* __WIN__ */
1608 }
1609 
1610 /***********************************************************************/
1613 UNIV_INTERN
1614 ibool
1616 /*=====================*/
1617  const char* name)
1618 {
1619 #ifdef __WIN__
1620  BOOL ret;
1621  ulint count = 0;
1622 loop:
1623  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1624  it */
1625 
1626  ret = DeleteFile((LPCTSTR)name);
1627 
1628  if (ret) {
1629  return(TRUE);
1630  }
1631 
1632  if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1633  /* the file does not exist, this not an error */
1634 
1635  return(TRUE);
1636  }
1637 
1638  count++;
1639 
1640  if (count > 100 && 0 == (count % 10)) {
1641  fprintf(stderr,
1642  "InnoDB: Warning: cannot delete file %s\n"
1643  "InnoDB: Are you running ibbackup"
1644  " to back up the file?\n", name);
1645 
1646  os_file_get_last_error(TRUE); /* print error information */
1647  }
1648 
1649  os_thread_sleep(1000000); /* sleep for a second */
1650 
1651  if (count > 2000) {
1652 
1653  return(FALSE);
1654  }
1655 
1656  goto loop;
1657 #else
1658  int ret;
1659 
1660  ret = unlink(name);
1661 
1662  if (ret != 0 && errno != ENOENT) {
1663  os_file_handle_error_no_exit(name, "delete");
1664 
1665  return(FALSE);
1666  }
1667 
1668  return(TRUE);
1669 #endif
1670 }
1671 
1672 /***********************************************************************/
1675 UNIV_INTERN
1676 ibool
1678 /*===========*/
1679  const char* name)
1680 {
1681 #ifdef __WIN__
1682  BOOL ret;
1683  ulint count = 0;
1684 loop:
1685  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1686  it */
1687 
1688  ret = DeleteFile((LPCTSTR)name);
1689 
1690  if (ret) {
1691  return(TRUE);
1692  }
1693 
1694  if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1695  /* If the file does not exist, we classify this as a 'mild'
1696  error and return */
1697 
1698  return(FALSE);
1699  }
1700 
1701  count++;
1702 
1703  if (count > 100 && 0 == (count % 10)) {
1704  fprintf(stderr,
1705  "InnoDB: Warning: cannot delete file %s\n"
1706  "InnoDB: Are you running ibbackup"
1707  " to back up the file?\n", name);
1708 
1709  os_file_get_last_error(TRUE); /* print error information */
1710  }
1711 
1712  os_thread_sleep(1000000); /* sleep for a second */
1713 
1714  if (count > 2000) {
1715 
1716  return(FALSE);
1717  }
1718 
1719  goto loop;
1720 #else
1721  int ret;
1722 
1723  ret = unlink(name);
1724 
1725  if (ret != 0) {
1726  os_file_handle_error_no_exit(name, "delete");
1727 
1728  return(FALSE);
1729  }
1730 
1731  return(TRUE);
1732 #endif
1733 }
1734 
1735 /***********************************************************************/
1740 UNIV_INTERN
1741 ibool
1743 /*================*/
1744  const char* oldpath,
1746  const char* newpath)
1747 {
1748 #ifdef __WIN__
1749  BOOL ret;
1750 
1751  ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1752 
1753  if (ret) {
1754  return(TRUE);
1755  }
1756 
1757  os_file_handle_error_no_exit(oldpath, "rename");
1758 
1759  return(FALSE);
1760 #else
1761  int ret;
1762 
1763  ret = rename(oldpath, newpath);
1764 
1765  if (ret != 0) {
1766  os_file_handle_error_no_exit(oldpath, "rename");
1767 
1768  return(FALSE);
1769  }
1770 
1771  return(TRUE);
1772 #endif
1773 }
1774 
1775 /***********************************************************************/
1780 UNIV_INTERN
1781 ibool
1783 /*===============*/
1784  os_file_t file)
1785 {
1786 #ifdef __WIN__
1787  BOOL ret;
1788 
1789  ut_a(file);
1790 
1791  ret = CloseHandle(file);
1792 
1793  if (ret) {
1794  return(TRUE);
1795  }
1796 
1797  os_file_handle_error(NULL, "close");
1798 
1799  return(FALSE);
1800 #else
1801  int ret;
1802 
1803  ret = close(file);
1804 
1805  if (ret == -1) {
1806  os_file_handle_error(NULL, "close");
1807 
1808  return(FALSE);
1809  }
1810 
1811  return(TRUE);
1812 #endif
1813 }
1814 
1815 #ifdef UNIV_HOTBACKUP
1816 /***********************************************************************/
1819 UNIV_INTERN
1820 ibool
1821 os_file_close_no_error_handling(
1822 /*============================*/
1823  os_file_t file)
1824 {
1825 #ifdef __WIN__
1826  BOOL ret;
1827 
1828  ut_a(file);
1829 
1830  ret = CloseHandle(file);
1831 
1832  if (ret) {
1833  return(TRUE);
1834  }
1835 
1836  return(FALSE);
1837 #else
1838  int ret;
1839 
1840  ret = close(file);
1841 
1842  if (ret == -1) {
1843 
1844  return(FALSE);
1845  }
1846 
1847  return(TRUE);
1848 #endif
1849 }
1850 #endif /* UNIV_HOTBACKUP */
1851 
1852 /***********************************************************************/
1855 UNIV_INTERN
1856 ibool
1858 /*=============*/
1859  os_file_t file,
1860  ulint* size,
1862  ulint* size_high)
1863 {
1864 #ifdef __WIN__
1865  DWORD high;
1866  DWORD low;
1867 
1868  low = GetFileSize(file, &high);
1869 
1870  if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1871  return(FALSE);
1872  }
1873 
1874  *size = low;
1875  *size_high = high;
1876 
1877  return(TRUE);
1878 #else
1879  off_t offs;
1880 
1881  offs = lseek(file, 0, SEEK_END);
1882 
1883  if (offs == ((off_t)-1)) {
1884 
1885  return(FALSE);
1886  }
1887 
1888  if (sizeof(off_t) > 4) {
1889  *size = (ulint)(offs & 0xFFFFFFFFUL);
1890  *size_high = (ulint)(offs >> 32);
1891  } else {
1892  *size = (ulint) offs;
1893  *size_high = 0;
1894  }
1895 
1896  return(TRUE);
1897 #endif
1898 }
1899 
1900 /***********************************************************************/
1903 UNIV_INTERN
1904 ib_int64_t
1906 /*===========================*/
1907  os_file_t file)
1908 {
1909  ulint size;
1910  ulint size_high;
1911  ibool success;
1912 
1913  success = os_file_get_size(file, &size, &size_high);
1914 
1915  if (!success) {
1916 
1917  return(-1);
1918  }
1919 
1920  return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1921 }
1922 
1923 /***********************************************************************/
1926 UNIV_INTERN
1927 ibool
1929 /*=============*/
1930  const char* name,
1932  os_file_t file,
1933  ulint size,
1935  ulint size_high)
1936 {
1937  ib_int64_t current_size;
1938  ib_int64_t desired_size;
1939  ibool ret;
1940  byte* buf;
1941  byte* buf2;
1942  ulint buf_size;
1943 
1944  ut_a(size == (size & 0xFFFFFFFF));
1945 
1946  current_size = 0;
1947  desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1948 
1949  /* Write up to 1 megabyte at a time. */
1950  buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1951  * UNIV_PAGE_SIZE;
1952  buf2 = static_cast<unsigned char *>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
1953 
1954  /* Align the buffer for possible raw i/o */
1955  buf = static_cast<unsigned char *>(ut_align(buf2, UNIV_PAGE_SIZE));
1956 
1957  /* Write buffer full of zeros */
1958  memset(buf, 0, buf_size);
1959 
1960  if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1961 
1962  fprintf(stderr, "InnoDB: Progress in MB:");
1963  }
1964 
1965  while (current_size < desired_size) {
1966  ulint n_bytes;
1967 
1968  if (desired_size - current_size < (ib_int64_t) buf_size) {
1969  n_bytes = (ulint) (desired_size - current_size);
1970  } else {
1971  n_bytes = buf_size;
1972  }
1973 
1974  ret = os_file_write(name, file, buf,
1975  (ulint)(current_size & 0xFFFFFFFF),
1976  (ulint)(current_size >> 32),
1977  n_bytes);
1978  if (!ret) {
1979  ut_free(buf2);
1980  goto error_handling;
1981  }
1982 
1983  /* Print about progress for each 100 MB written */
1984  if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
1985  != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
1986 
1987  fprintf(stderr, " %lu00",
1988  (ulong) ((current_size + n_bytes)
1989  / (ib_int64_t)(100 * 1024 * 1024)));
1990  }
1991 
1992  current_size += n_bytes;
1993  }
1994 
1995  if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1996 
1997  fprintf(stderr, "\n");
1998  }
1999 
2000  ut_free(buf2);
2001 
2002  ret = os_file_flush(file);
2003 
2004  if (ret) {
2005  return(TRUE);
2006  }
2007 
2008 error_handling:
2009  return(FALSE);
2010 }
2011 
2012 /***********************************************************************/
2015 UNIV_INTERN
2016 ibool
2018 /*============*/
2019  FILE* file)
2020 {
2021 #ifdef __WIN__
2022  HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2023  return(SetEndOfFile(h));
2024 #else /* __WIN__ */
2025  return(!ftruncate(fileno(file), ftell(file)));
2026 #endif /* __WIN__ */
2027 }
2028 
2029 #ifndef __WIN__
2030 /***********************************************************************/
2036 static
2037 int
2038 os_file_fsync(
2039 /*==========*/
2040  os_file_t file)
2041 {
2042  int ret;
2043  int failures;
2044  ibool retry;
2045 
2046  failures = 0;
2047 
2048  do {
2049  ret = fsync(file);
2050 
2051  os_n_fsyncs++;
2052 
2053  if (ret == -1 && errno == ENOLCK) {
2054 
2055  if (failures % 100 == 0) {
2056 
2057  ut_print_timestamp(stderr);
2058  fprintf(stderr,
2059  " InnoDB: fsync(): "
2060  "No locks available; retrying\n");
2061  }
2062 
2063  os_thread_sleep(200000 /* 0.2 sec */);
2064 
2065  failures++;
2066 
2067  retry = TRUE;
2068  } else {
2069 
2070  retry = FALSE;
2071  }
2072  } while (retry);
2073 
2074  return(ret);
2075 }
2076 #endif /* !__WIN__ */
2077 
2078 /***********************************************************************/
/* Flushes the write buffers of a file to disk. A failed flush is treated
as fatal: the error handler is called and then ut_error is raised. The one
tolerated failure is the error a raw device reports (ERROR_INVALID_FUNCTION
on Windows, EINVAL elsewhere) while srv_start_raw_disk_in_use is set.
Returns TRUE on success.
NOTE(review): the declarator line holding the function name is missing from
this listing; presumably os_file_flush_func - confirm against upstream. */
2082 UNIV_INTERN
2083 ibool
2085 /*===============*/
2086  os_file_t file)
2087 {
2088 #ifdef __WIN__
2089  BOOL ret;
2090 
2091  ut_a(file);
2092 
2093  os_n_fsyncs++;
2094 
2095  ret = FlushFileBuffers(file);
2096 
2097  if (ret) {
2098  return(TRUE);
2099  }
2100 
2101  /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2102  actually a raw device, we choose to ignore that error if we are using
2103  raw disks */
2104 
2105  if (srv_start_raw_disk_in_use && GetLastError()
2106  == ERROR_INVALID_FUNCTION) {
2107  return(TRUE);
2108  }
2109 
2110  os_file_handle_error(NULL, "flush");
2111 
2112  /* It is a fatal error if a file flush does not succeed, because then
2113  the database can get corrupt on disk */
2114  ut_error;
2115 
/* Only reached if ut_error returns. */
2116  return(FALSE);
2117 #else
2118  int ret;
2119 
2120 #if defined(HAVE_DARWIN_THREADS)
2121 # ifndef F_FULLFSYNC
2122  /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2123 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2124 # elif F_FULLFSYNC != 51
2125 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2126 # endif
2127  /* Apple has disabled fsync() for internal disk drives in OS X. That
2128  caused corruption for a user when he tested a power outage. Let us in
2129  OS X use a nonstandard flush method recommended by an Apple
2130  engineer. */
2131 
2132  if (!srv_have_fullfsync) {
2133  /* If we are not on an operating system that supports this,
2134  then fall back to a plain fsync. */
2135 
2136  ret = os_file_fsync(file);
2137  } else {
2138  ret = fcntl(file, F_FULLFSYNC, NULL);
2139 
2140  if (ret) {
2141  /* If we are not on a file system that supports this,
2142  then fall back to a plain fsync. */
2143  ret = os_file_fsync(file);
2144  }
2145  }
2146 #else
2147  ret = os_file_fsync(file);
2148 #endif
2149 
2150  if (ret == 0) {
2151  return(TRUE);
2152  }
2153 
2154  /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2155  we choose to ignore that error if we are using raw disks */
2156 
2157  if (srv_start_raw_disk_in_use && errno == EINVAL) {
2158 
2159  return(TRUE);
2160  }
2161 
2162  ut_print_timestamp(stderr);
2163 
2164  fprintf(stderr,
2165  " InnoDB: Error: the OS said file flush did not succeed\n");
2166 
2167  os_file_handle_error(NULL, "flush");
2168 
2169  /* It is a fatal error if a file flush does not succeed, because then
2170  the database can get corrupt on disk */
2171  ut_error;
2172 
/* Only reached if ut_error returns. */
2173  return(FALSE);
2174 #endif
2175 }
2176 
2177 #ifndef __WIN__
2178 /*******************************************************************/
/* Reads n bytes into buf from the file position made of offset (low 32
bits) and offset_high (high bits). Uses pread() when it is available and not
marked broken; otherwise seeks and reads under one of the seek mutexes.
Returns what pread()/read() returned, or -1 if the seek failed. */
2181 static
2182 ssize_t
2183 os_file_pread(
2184 /*==========*/
2185  os_file_t file,
2186  void* buf,
2187  ulint n,
2188  ulint offset,
2190  ulint offset_high)
2192 {
2193  off_t offs;
2194 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2195  ssize_t n_bytes;
2196 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2197 
/* offset must fit in 32 bits; the high part travels in offset_high. */
2198  ut_a((offset & 0xFFFFFFFFUL) == offset);
2199 
2200  /* If off_t is > 4 bytes in size, then we assume we can pass a
2201  64-bit address */
2202 
2203  if (sizeof(off_t) > 4) {
2204  offs = (off_t)offset + (((off_t)offset_high) << 32);
2205 
2206  } else {
2207  offs = (off_t)offset;
2208 
/* NOTE(review): only a message is printed here; the read below still
proceeds at the truncated 32-bit offset. */
2209  if (offset_high > 0) {
2210  fprintf(stderr,
2211  "InnoDB: Error: file read at offset > 4 GB\n");
2212  }
2213  }
2214 
2215  os_n_file_reads++;
2216 
2217 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
/* NOTE(review): nothing is visible between the enter/exit pairs on
os_file_count_mutex in this function; presumably pending-I/O counter updates
were lost from this listing - confirm against upstream. */
2218  os_mutex_enter(os_file_count_mutex);
2221  os_mutex_exit(os_file_count_mutex);
2222 
2223  n_bytes = pread(file, buf, (ssize_t)n, offs);
2224 
2225  os_mutex_enter(os_file_count_mutex);
2228  os_mutex_exit(os_file_count_mutex);
2229 
2230  return(n_bytes);
2231 #else
2232  {
2233  off_t ret_offset;
2234  ssize_t ret;
2235 #ifndef UNIV_HOTBACKUP
2236  ulint i;
2237 #endif /* !UNIV_HOTBACKUP */
2238 
2239  os_mutex_enter(os_file_count_mutex);
2241  os_mutex_exit(os_file_count_mutex);
2242 
2243 #ifndef UNIV_HOTBACKUP
2244  /* Protect the seek / read operation with a mutex */
/* The mutex is picked from the handle value modulo the number of seek
mutexes. */
2245  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2246 
2247  os_mutex_enter(os_file_seek_mutexes[i]);
2248 #endif /* !UNIV_HOTBACKUP */
2249 
2250  ret_offset = lseek(file, offs, SEEK_SET);
2251 
2252  if (ret_offset < 0) {
2253  ret = -1;
2254  } else {
2255  ret = read(file, buf, (ssize_t)n);
2256  }
2257 
2258 #ifndef UNIV_HOTBACKUP
2259  os_mutex_exit(os_file_seek_mutexes[i]);
2260 #endif /* !UNIV_HOTBACKUP */
2261 
2262  os_mutex_enter(os_file_count_mutex);
2264  os_mutex_exit(os_file_count_mutex);
2265 
2266  return(ret);
2267  }
2268 #endif
2269 }
2270 
2271 /*******************************************************************/
2274 static
2275 ssize_t
2276 os_file_pwrite(
2277 /*===========*/
2278  os_file_t file,
2279  const void* buf,
2280  ulint n,
2281  ulint offset,
2283  ulint offset_high)
2285 {
2286  ssize_t ret;
2287  off_t offs;
2288 
2289  ut_a((offset & 0xFFFFFFFFUL) == offset);
2290 
2291  /* If off_t is > 4 bytes in size, then we assume we can pass a
2292  64-bit address */
2293 
2294  if (sizeof(off_t) > 4) {
2295  offs = (off_t)offset + (((off_t)offset_high) << 32);
2296  } else {
2297  offs = (off_t)offset;
2298 
2299  if (offset_high > 0) {
2300  fprintf(stderr,
2301  "InnoDB: Error: file write"
2302  " at offset > 4 GB\n");
2303  }
2304  }
2305 
2306  if (srv_fake_write)
2307  return(TRUE);
2308 
2309  os_n_file_writes++;
2310 
2311 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2312  os_mutex_enter(os_file_count_mutex);
2315  os_mutex_exit(os_file_count_mutex);
2316 
2317  ret = pwrite(file, buf, (ssize_t)n, offs);
2318 
2319  os_mutex_enter(os_file_count_mutex);
2322  os_mutex_exit(os_file_count_mutex);
2323 
2324 # ifdef UNIV_DO_FLUSH
2325  if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2326  && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2327  && !os_do_not_call_flush_at_each_write) {
2328 
2329  /* Always do fsync to reduce the probability that when
2330  the OS crashes, a database page is only partially
2331  physically written to disk. */
2332 
2333  ut_a(TRUE == os_file_flush(file));
2334  }
2335 # endif /* UNIV_DO_FLUSH */
2336 
2337  return(ret);
2338 #else
2339  {
2340  off_t ret_offset;
2341 # ifndef UNIV_HOTBACKUP
2342  ulint i;
2343 # endif /* !UNIV_HOTBACKUP */
2344 
2345  os_mutex_enter(os_file_count_mutex);
2347  os_mutex_exit(os_file_count_mutex);
2348 
2349 # ifndef UNIV_HOTBACKUP
2350  /* Protect the seek / write operation with a mutex */
2351  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2352 
2353  os_mutex_enter(os_file_seek_mutexes[i]);
2354 # endif /* UNIV_HOTBACKUP */
2355 
2356  ret_offset = lseek(file, offs, SEEK_SET);
2357 
2358  if (ret_offset < 0) {
2359  ret = -1;
2360 
2361  goto func_exit;
2362  }
2363 
2364  ret = write(file, buf, (ssize_t)n);
2365 
2366 # ifdef UNIV_DO_FLUSH
2367  if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2368  && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2369  && !os_do_not_call_flush_at_each_write) {
2370 
2371  /* Always do fsync to reduce the probability that when
2372  the OS crashes, a database page is only partially
2373  physically written to disk. */
2374 
2375  ut_a(TRUE == os_file_flush(file));
2376  }
2377 # endif /* UNIV_DO_FLUSH */
2378 
2379 func_exit:
2380 # ifndef UNIV_HOTBACKUP
2381  os_mutex_exit(os_file_seek_mutexes[i]);
2382 # endif /* !UNIV_HOTBACKUP */
2383 
2384  os_mutex_enter(os_file_count_mutex);
2386  os_mutex_exit(os_file_count_mutex);
2387 
2388  return(ret);
2389  }
2390 #endif
2391 }
2392 #endif
2393 
2394 /*******************************************************************/
/* Reads exactly n bytes from a file into buf, starting at the position made
of offset (low 32 bits) and offset_high (high bits). A failed or short read
is passed to os_file_handle_error(); the read is repeated when the handler
asks for a retry, otherwise a fatal message is printed and ut_error is
raised. Returns TRUE when all n bytes were read.
NOTE(review): the declarator line holding the function name is missing from
this listing; presumably os_file_read_func - confirm against upstream. */
2399 UNIV_INTERN
2400 ibool
2402 /*==============*/
2403  os_file_t file,
2404  void* buf,
2405  ulint offset,
2407  ulint offset_high,
2409  ulint n)
2410 {
2411 #ifdef __WIN__
2412  BOOL ret;
2413  DWORD len;
2414  DWORD ret2;
2415  DWORD low;
2416  DWORD high;
2417  ibool retry;
2418 #ifndef UNIV_HOTBACKUP
2419  ulint i;
2420 #endif /* !UNIV_HOTBACKUP */
2421 
2422  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2423  no more than 32 bits. */
2424  ut_a((offset & 0xFFFFFFFFUL) == offset);
2425  ut_a((n & 0xFFFFFFFFUL) == n);
2426 
/* These statistics are updated once per call, before the retry label. */
2427  os_n_file_reads++;
2428  os_bytes_read_since_printout += n;
2429 
2430 try_again:
2431  ut_ad(file);
2432  ut_ad(buf);
2433  ut_ad(n > 0);
2434 
2435  low = (DWORD) offset;
2436  high = (DWORD) offset_high;
2437 
/* NOTE(review): nothing is visible between the enter/exit pairs on
os_file_count_mutex in this function; presumably pending-I/O counter updates
were lost from this listing - confirm against upstream. */
2438  os_mutex_enter(os_file_count_mutex);
2440  os_mutex_exit(os_file_count_mutex);
2441 
2442 #ifndef UNIV_HOTBACKUP
2443  /* Protect the seek / read operation with a mutex */
2444  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2445 
2446  os_mutex_enter(os_file_seek_mutexes[i]);
2447 #endif /* !UNIV_HOTBACKUP */
2448 
2449  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2450 
/* 0xFFFFFFFF is only treated as a seek failure when GetLastError() also
reports an error. */
2451  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2452 
2453 #ifndef UNIV_HOTBACKUP
2454  os_mutex_exit(os_file_seek_mutexes[i]);
2455 #endif /* !UNIV_HOTBACKUP */
2456 
2457  os_mutex_enter(os_file_count_mutex);
2459  os_mutex_exit(os_file_count_mutex);
2460 
2461  goto error_handling;
2462  }
2463 
2464  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2465 
2466 #ifndef UNIV_HOTBACKUP
2467  os_mutex_exit(os_file_seek_mutexes[i]);
2468 #endif /* !UNIV_HOTBACKUP */
2469 
2470  os_mutex_enter(os_file_count_mutex);
2472  os_mutex_exit(os_file_count_mutex);
2473 
2474  if (ret && len == n) {
2475  return(TRUE);
2476  }
2477 #else /* __WIN__ */
2478  ibool retry;
2479  ssize_t ret;
2480 
2481  os_bytes_read_since_printout += n;
2482 
2483 try_again:
2484  ret = os_file_pread(file, buf, n, offset, offset_high);
2485 
/* Anything other than a full read of n bytes counts as a failure. */
2486  if ((ulint)ret == n) {
2487 
2488  return(TRUE);
2489  }
2490 
/* The offset is printed high word first, then low word. */
2491  fprintf(stderr,
2492  "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2493  "InnoDB: Was only able to read %ld.\n",
2494  (ulong)n, (ulong)offset_high,
2495  (ulong)offset, (long)ret);
2496 #endif /* __WIN__ */
2497 #ifdef __WIN__
2498 error_handling:
2499 #endif
/* Shared tail of both platform branches. */
2500  retry = os_file_handle_error(NULL, "read");
2501 
2502  if (retry) {
2503  goto try_again;
2504  }
2505 
2506  fprintf(stderr,
2507  "InnoDB: Fatal error: cannot read from file."
2508  " OS error number %lu.\n",
2509 #ifdef __WIN__
2510  (ulong) GetLastError()
2511 #else
2512  (ulong) errno
2513 #endif
2514  );
2515  fflush(stderr);
2516 
2517  ut_error;
2518 
/* Only reached if ut_error returns. */
2519  return(FALSE);
2520 }
2521 
2522 /*******************************************************************/
/* Reads exactly n bytes from a file into buf, starting at the position made
of offset (low 32 bits) and offset_high (high bits). A failed or short read
is passed to os_file_handle_error_no_exit(); the read is repeated when the
handler asks for a retry, otherwise FALSE is returned to the caller.
Returns TRUE when all n bytes were read.
NOTE(review): the declarator line holding the function name is missing from
this listing; presumably os_file_read_no_error_handling_func - confirm
against upstream. */
2528 UNIV_INTERN
2529 ibool
2531 /*================================*/
2532  os_file_t file,
2533  void* buf,
2534  ulint offset,
2536  ulint offset_high,
2538  ulint n)
2539 {
2540 #ifdef __WIN__
2541  BOOL ret;
2542  DWORD len;
2543  DWORD ret2;
2544  DWORD low;
2545  DWORD high;
2546  ibool retry;
2547 #ifndef UNIV_HOTBACKUP
2548  ulint i;
2549 #endif /* !UNIV_HOTBACKUP */
2550 
2551  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2552  no more than 32 bits. */
2553  ut_a((offset & 0xFFFFFFFFUL) == offset);
2554  ut_a((n & 0xFFFFFFFFUL) == n);
2555 
/* These statistics are updated once per call, before the retry label. */
2556  os_n_file_reads++;
2557  os_bytes_read_since_printout += n;
2558 
2559 try_again:
2560  ut_ad(file);
2561  ut_ad(buf);
2562  ut_ad(n > 0);
2563 
2564  low = (DWORD) offset;
2565  high = (DWORD) offset_high;
2566 
/* NOTE(review): nothing is visible between the enter/exit pairs on
os_file_count_mutex in this function; presumably pending-I/O counter updates
were lost from this listing - confirm against upstream. */
2567  os_mutex_enter(os_file_count_mutex);
2569  os_mutex_exit(os_file_count_mutex);
2570 
2571 #ifndef UNIV_HOTBACKUP
2572  /* Protect the seek / read operation with a mutex */
2573  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2574 
2575  os_mutex_enter(os_file_seek_mutexes[i]);
2576 #endif /* !UNIV_HOTBACKUP */
2577 
2578  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2579 
/* 0xFFFFFFFF is only treated as a seek failure when GetLastError() also
reports an error. */
2580  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2581 
2582 #ifndef UNIV_HOTBACKUP
2583  os_mutex_exit(os_file_seek_mutexes[i]);
2584 #endif /* !UNIV_HOTBACKUP */
2585 
2586  os_mutex_enter(os_file_count_mutex);
2588  os_mutex_exit(os_file_count_mutex);
2589 
2590  goto error_handling;
2591  }
2592 
2593  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2594 
2595 #ifndef UNIV_HOTBACKUP
2596  os_mutex_exit(os_file_seek_mutexes[i]);
2597 #endif /* !UNIV_HOTBACKUP */
2598 
2599  os_mutex_enter(os_file_count_mutex);
2601  os_mutex_exit(os_file_count_mutex);
2602 
2603  if (ret && len == n) {
2604  return(TRUE);
2605  }
2606 #else /* __WIN__ */
2607  ibool retry;
2608  ssize_t ret;
2609 
2610  os_bytes_read_since_printout += n;
2611 
2612 try_again:
2613  ret = os_file_pread(file, buf, n, offset, offset_high);
2614 
/* Anything other than a full read of n bytes counts as a failure. */
2615  if ((ulint)ret == n) {
2616 
2617  return(TRUE);
2618  }
2619 #endif /* __WIN__ */
2620 #ifdef __WIN__
2621 error_handling:
2622 #endif
/* Shared tail of both platform branches. */
2623  retry = os_file_handle_error_no_exit(NULL, "read");
2624 
2625  if (retry) {
2626  goto try_again;
2627  }
2628 
2629  return(FALSE);
2630 }
2631 
2632 /*******************************************************************/
2636 UNIV_INTERN
2637 void
2639 /*================*/
2640  FILE* file,
2641  char* str,
2642  ulint size)
2643 {
2644  size_t flen;
2645 
2646  if (size == 0) {
2647  return;
2648  }
2649 
2650  rewind(file);
2651  flen = fread(str, 1, size - 1, file);
2652  str[flen] = '\0';
2653 }
2654 
2655 /*******************************************************************/
/* Writes n bytes from buf to file at the position given by offset and
offset_high.
On Windows the position is set with SetFilePointer() (offset is the low
32 bits, offset_high the high 32 bits) and the data is written with
WriteFile(); elsewhere the work is delegated to os_file_pwrite().
A failed or short write is reported on stderr only while
os_has_said_disk_full is FALSE; the flag is then set to TRUE.
NOTE(review): the declarator line carrying the function name is absent
from this listing. Another function in this file issues the call
os_file_write(name, file, buf, offset, offset_high, n), which presumably
resolves to this definition - confirm against the header.
Returns TRUE if exactly n bytes were written, FALSE otherwise. */
2660 UNIV_INTERN
2661 ibool
2663 /*===============*/
2664  const char* name, /*!< in: file name, used only in diagnostics */
2666  os_file_t file, /*!< in: handle of the file to write to */
2667  const void* buf, /*!< in: data to write */
2668  ulint offset, /*!< in: low part of the file offset */
2670  ulint offset_high, /*!< in: high part of the file offset */
2672  ulint n) /*!< in: number of bytes to write */
2673 {
2674 #ifdef __WIN__
2675  BOOL ret;
2676  DWORD len;
2677  DWORD ret2;
2678  DWORD low;
2679  DWORD high;
2680  ulint n_retries = 0;
2681  ulint err;
2682 #ifndef UNIV_HOTBACKUP
2683  ulint i;
2684 #endif /* !UNIV_HOTBACKUP */
2685
2686  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2687  no more than 32 bits. */
2688  ut_a((offset & 0xFFFFFFFFUL) == offset);
2689  ut_a((n & 0xFFFFFFFFUL) == n);
2690
 /* With srv_fake_write set, success is reported without touching the
 file. This check exists only in the Windows branch of this function. */
2691  if (srv_fake_write)
2692  return(TRUE);
2693
2694  os_n_file_writes++;
2695
2696  ut_ad(file);
2697  ut_ad(buf);
2698  ut_ad(n > 0);
2699 retry:
2700  low = (DWORD) offset;
2701  high = (DWORD) offset_high;
2702
 /* NOTE(review): this critical section is empty here and the listing's
 own numbering skips a line between enter and exit; a statement (likely
 a counter update guarded by os_file_count_mutex) appears to have been
 lost - confirm against the original source. */
2703  os_mutex_enter(os_file_count_mutex);
2705  os_mutex_exit(os_file_count_mutex);
2706
2707 #ifndef UNIV_HOTBACKUP
2708  /* Protect the seek / write operation with a mutex */
2709  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2710
2711  os_mutex_enter(os_file_seek_mutexes[i]);
2712 #endif /* !UNIV_HOTBACKUP */
2713
2714  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2715
 /* 0xFFFFFFFF can also be a valid low offset, hence the extra
 GetLastError() test before treating the seek as failed. */
2716  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2717
2718 #ifndef UNIV_HOTBACKUP
2719  os_mutex_exit(os_file_seek_mutexes[i]);
2720 #endif /* !UNIV_HOTBACKUP */
2721
 /* NOTE(review): empty critical section, same numbering gap as
 above - confirm. */
2722  os_mutex_enter(os_file_count_mutex);
2724  os_mutex_exit(os_file_count_mutex);
2725
2726  ut_print_timestamp(stderr);
2727
2728  fprintf(stderr,
2729  " InnoDB: Error: File pointer positioning to"
2730  " file %s failed at\n"
2731  "InnoDB: offset %lu %lu. Operating system"
2732  " error number %lu.\n"
2733  "InnoDB: Some operating system error numbers"
2734  " are described at\n"
2735  "InnoDB: "
2736  REFMAN "operating-system-error-codes.html\n",
2737  name, (ulong) offset_high, (ulong) offset,
2738  (ulong) GetLastError());
2739
2740  return(FALSE);
2741  }
2742
2743  ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2744
2745  /* Always do fsync to reduce the probability that when the OS crashes,
2746  a database page is only partially physically written to disk. */
2747
2748 # ifdef UNIV_DO_FLUSH
2749  if (!os_do_not_call_flush_at_each_write) {
2750  ut_a(TRUE == os_file_flush(file));
2751  }
2752 # endif /* UNIV_DO_FLUSH */
2753
2754 #ifndef UNIV_HOTBACKUP
2755  os_mutex_exit(os_file_seek_mutexes[i]);
2756 #endif /* !UNIV_HOTBACKUP */
2757
 /* NOTE(review): empty critical section, same numbering gap as
 above - confirm. */
2758  os_mutex_enter(os_file_count_mutex);
2760  os_mutex_exit(os_file_count_mutex);
2761
 /* Success requires both a successful call and a complete write. */
2762  if (ret && len == n) {
2763
2764  return(TRUE);
2765  }
2766
2767  /* If some background file system backup tool is running, then, at
2768  least in Windows 2000, we may get here a specific error. Let us
2769  retry the operation 100 times, with 1 second waits. */
2770
 /* NOTE(review): GetLastError() is read after the flush and mutex calls
 above, which might overwrite the value set by WriteFile() - confirm. */
2771  if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2772
2773  os_thread_sleep(1000000);
2774
2775  n_retries++;
2776
2777  goto retry;
2778  }
2779
 /* Print the detailed diagnostic only once per process lifetime of the
 os_has_said_disk_full flag. */
2780  if (!os_has_said_disk_full) {
2781
2782  err = (ulint)GetLastError();
2783
2784  ut_print_timestamp(stderr);
2785
2786  fprintf(stderr,
2787  " InnoDB: Error: Write to file %s failed"
2788  " at offset %lu %lu.\n"
2789  "InnoDB: %lu bytes should have been written,"
2790  " only %lu were written.\n"
2791  "InnoDB: Operating system error number %lu.\n"
2792  "InnoDB: Check that your OS and file system"
2793  " support files of this size.\n"
2794  "InnoDB: Check also that the disk is not full"
2795  " or a disk quota exceeded.\n",
2796  name, (ulong) offset_high, (ulong) offset,
2797  (ulong) n, (ulong) len, (ulong) err);
2798
 /* NOTE(review): err holds a GetLastError() code, while strerror()
 interprets C errno values; the printed text may not describe the
 Windows error - confirm. */
2799  if (strerror((int)err) != NULL) {
2800  fprintf(stderr,
2801  "InnoDB: Error number %lu means '%s'.\n",
2802  (ulong) err, strerror((int)err));
2803  }
2804
2805  fprintf(stderr,
2806  "InnoDB: Some operating system error numbers"
2807  " are described at\n"
2808  "InnoDB: "
2809  REFMAN "operating-system-error-codes.html\n");
2810
2811  os_has_said_disk_full = TRUE;
2812  }
2813
2814  return(FALSE);
2815 #else
2816  ssize_t ret;
2817
 /* The positioned write is delegated to os_file_pwrite(); its return
 value is compared with the requested byte count. */
2818  ret = os_file_pwrite(file, buf, n, offset, offset_high);
2819
2820  if ((ulint)ret == n) {
2821
2822  return(TRUE);
2823  }
2824
 /* Print the detailed diagnostic only while os_has_said_disk_full is
 still FALSE. */
2825  if (!os_has_said_disk_full) {
2826
 /* NOTE(review): errno is read only after ut_print_timestamp(), which
 may modify it - confirm whether it should be saved first. */
2827  ut_print_timestamp(stderr);
2828
2829  fprintf(stderr,
2830  " InnoDB: Error: Write to file %s failed"
2831  " at offset %lu %lu.\n"
2832  "InnoDB: %lu bytes should have been written,"
2833  " only %ld were written.\n"
2834  "InnoDB: Operating system error number %lu.\n"
2835  "InnoDB: Check that your OS and file system"
2836  " support files of this size.\n"
2837  "InnoDB: Check also that the disk is not full"
2838  " or a disk quota exceeded.\n",
2839  name, offset_high, offset, n, (long int)ret,
2840  (ulint)errno);
2841  if (strerror(errno) != NULL) {
2842  fprintf(stderr,
2843  "InnoDB: Error number %lu means '%s'.\n",
2844  (ulint)errno, strerror(errno));
2845  }
2846
2847  fprintf(stderr,
2848  "InnoDB: Some operating system error numbers"
2849  " are described at\n"
2850  "InnoDB: "
2851  REFMAN "operating-system-error-codes.html\n");
2852
2853  os_has_said_disk_full = TRUE;
2854  }
2855
2856  return(FALSE);
2857 #endif
2858 }
2859 
2860 /*******************************************************************/
2863 UNIV_INTERN
2864 ibool
2866 /*===========*/
2867  const char* path,
2868  ibool* exists,
2869  os_file_type_t* type)
2870 {
2871 #ifdef __WIN__
2872  int ret;
2873  struct _stat statinfo;
2874 
2875  ret = _stat(path, &statinfo);
2876  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2877  /* file does not exist */
2878  *exists = FALSE;
2879  return(TRUE);
2880  } else if (ret) {
2881  /* file exists, but stat call failed */
2882 
2883  os_file_handle_error_no_exit(path, "stat");
2884 
2885  return(FALSE);
2886  }
2887 
2888  if (_S_IFDIR & statinfo.st_mode) {
2889  *type = OS_FILE_TYPE_DIR;
2890  } else if (_S_IFREG & statinfo.st_mode) {
2891  *type = OS_FILE_TYPE_FILE;
2892  } else {
2893  *type = OS_FILE_TYPE_UNKNOWN;
2894  }
2895 
2896  *exists = TRUE;
2897 
2898  return(TRUE);
2899 #else
2900  int ret;
2901  struct stat statinfo;
2902 
2903  ret = stat(path, &statinfo);
2904  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2905  /* file does not exist */
2906  *exists = FALSE;
2907  return(TRUE);
2908  } else if (ret) {
2909  /* file exists, but stat call failed */
2910 
2911  os_file_handle_error_no_exit(path, "stat");
2912 
2913  return(FALSE);
2914  }
2915 
2916  if (S_ISDIR(statinfo.st_mode)) {
2917  *type = OS_FILE_TYPE_DIR;
2918  } else if (S_ISLNK(statinfo.st_mode)) {
2919  *type = OS_FILE_TYPE_LINK;
2920  } else if (S_ISREG(statinfo.st_mode)) {
2921  *type = OS_FILE_TYPE_FILE;
2922  } else {
2923  *type = OS_FILE_TYPE_UNKNOWN;
2924  }
2925 
2926  *exists = TRUE;
2927 
2928  return(TRUE);
2929 #endif
2930 }
2931 
2932 /*******************************************************************/
2935 UNIV_INTERN
2936 ibool
2938 /*===============*/
2939  const char* path,
2940  os_file_stat_t* stat_info)
2942 {
2943 #ifdef __WIN__
2944  int ret;
2945  struct _stat statinfo;
2946 
2947  ret = _stat(path, &statinfo);
2948  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2949  /* file does not exist */
2950 
2951  return(FALSE);
2952  } else if (ret) {
2953  /* file exists, but stat call failed */
2954 
2955  os_file_handle_error_no_exit(path, "stat");
2956 
2957  return(FALSE);
2958  }
2959  if (_S_IFDIR & statinfo.st_mode) {
2960  stat_info->type = OS_FILE_TYPE_DIR;
2961  } else if (_S_IFREG & statinfo.st_mode) {
2962  stat_info->type = OS_FILE_TYPE_FILE;
2963  } else {
2964  stat_info->type = OS_FILE_TYPE_UNKNOWN;
2965  }
2966 
2967  stat_info->ctime = statinfo.st_ctime;
2968  stat_info->atime = statinfo.st_atime;
2969  stat_info->mtime = statinfo.st_mtime;
2970  stat_info->size = statinfo.st_size;
2971 
2972  return(TRUE);
2973 #else
2974  int ret;
2975  struct stat statinfo;
2976 
2977  ret = stat(path, &statinfo);
2978 
2979  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2980  /* file does not exist */
2981 
2982  return(FALSE);
2983  } else if (ret) {
2984  /* file exists, but stat call failed */
2985 
2986  os_file_handle_error_no_exit(path, "stat");
2987 
2988  return(FALSE);
2989  }
2990 
2991  if (S_ISDIR(statinfo.st_mode)) {
2992  stat_info->type = OS_FILE_TYPE_DIR;
2993  } else if (S_ISLNK(statinfo.st_mode)) {
2994  stat_info->type = OS_FILE_TYPE_LINK;
2995  } else if (S_ISREG(statinfo.st_mode)) {
2996  stat_info->type = OS_FILE_TYPE_FILE;
2997  } else {
2998  stat_info->type = OS_FILE_TYPE_UNKNOWN;
2999  }
3000 
3001  stat_info->ctime = statinfo.st_ctime;
3002  stat_info->atime = statinfo.st_atime;
3003  stat_info->mtime = statinfo.st_mtime;
3004  stat_info->size = statinfo.st_size;
3005 
3006  return(TRUE);
3007 #endif
3008 }
3009 
3010 /* Path name separator character: a backslash when __WIN__ is defined,
a forward slash otherwise. It is a character constant, not a string. */
3011 #ifdef __WIN__
3012 # define OS_FILE_PATH_SEPARATOR '\\'
3013 #else
3014 # define OS_FILE_PATH_SEPARATOR '/'
3015 #endif
3016 
3017 /****************************************************************/
3045 UNIV_INTERN
3046 char*
3048 /*============*/
3049  const char* path)
3050 {
3051  /* Find the offset of the last slash */
3052  const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3053  if (!last_slash) {
3054  /* No slash in the path, return "." */
3055 
3056  return(mem_strdup("."));
3057  }
3058 
3059  /* Ok, there is a slash */
3060 
3061  if (last_slash == path) {
3062  /* last slash is the first char of the path */
3063 
3064  return(mem_strdup("/"));
3065  }
3066 
3067  /* Non-trivial directory component */
3068 
3069  return(mem_strdupl(path, last_slash - path));
3070 }
3071 
3072 /****************************************************************/
3075 UNIV_INTERN
3076 ibool
3078 /*=============================*/
3079  const char* path)
3080 {
3081  char* subdir;
3082  ibool success, subdir_exists;
3083  os_file_type_t type;
3084 
3085  subdir = os_file_dirname(path);
3086  if (strlen(subdir) == 1
3087  && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3088  /* subdir is root or cwd, nothing to do */
3089  mem_free(subdir);
3090 
3091  return(TRUE);
3092  }
3093 
3094  /* Test if subdir exists */
3095  success = os_file_status(subdir, &subdir_exists, &type);
3096  if (success && !subdir_exists) {
3097  /* subdir does not exist, create it */
3098  success = os_file_create_subdirs_if_needed(subdir);
3099  if (!success) {
3100  mem_free(subdir);
3101 
3102  return(FALSE);
3103  }
3104  success = os_file_create_directory(subdir, FALSE);
3105  }
3106 
3107  mem_free(subdir);
3108 
3109  return(success);
3110 }
3111 
3112 #ifndef UNIV_HOTBACKUP
3113 /****************************************************************/
3116 static
3118 os_aio_array_get_nth_slot(
3119 /*======================*/
3120  os_aio_array_t* array,
3121  ulint index)
3122 {
3123  ut_a(index < array->n_slots);
3124 
3125  return((array->slots) + index);
3126 }
3127 
3128 #if defined(LINUX_NATIVE_AIO)
3129 /******************************************************************/
3132 static
3133 ibool
3134 os_aio_linux_create_io_ctx(
3135 /*=======================*/
3136  ulint max_events,
3137  io_context_t* io_ctx)
3138 {
3139  int ret;
3140  ulint retries = 0;
3141 
3142 retry:
3143  memset(io_ctx, 0x0, sizeof(*io_ctx));
3144 
3145  /* Initialize the io_ctx. Tell it how many pending
3146  IO requests this context will handle. */
3147 
3148  ret = io_setup(max_events, io_ctx);
3149  if (ret == 0) {
3150 #if defined(UNIV_AIO_DEBUG)
3151  fprintf(stderr,
3152  "InnoDB: Linux native AIO:"
3153  " initialized io_ctx for segment\n");
3154 #endif
3155  /* Success. Return now. */
3156  return(TRUE);
3157  }
3158 
3159  /* If we hit EAGAIN we'll make a few attempts before failing. */
3160 
3161  switch (ret) {
3162  case -EAGAIN:
3163  if (retries == 0) {
3164  /* First time around. */
3165  ut_print_timestamp(stderr);
3166  fprintf(stderr,
3167  " InnoDB: Warning: io_setup() failed"
3168  " with EAGAIN. Will make %d attempts"
3169  " before giving up.\n",
3170  OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3171  }
3172 
3173  if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3174  ++retries;
3175  fprintf(stderr,
3176  "InnoDB: Warning: io_setup() attempt"
3177  " %lu failed.\n",
3178  retries);
3179  os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3180  goto retry;
3181  }
3182 
3183  /* Have tried enough. Better call it a day. */
3184  ut_print_timestamp(stderr);
3185  fprintf(stderr,
3186  " InnoDB: Error: io_setup() failed"
3187  " with EAGAIN after %d attempts.\n",
3188  OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
3189  break;
3190 
3191  case -ENOSYS:
3192  ut_print_timestamp(stderr);
3193  fprintf(stderr,
3194  " InnoDB: Error: Linux Native AIO interface"
3195  " is not supported on this platform. Please"
3196  " check your OS documentation and install"
3197  " appropriate binary of InnoDB.\n");
3198 
3199  break;
3200 
3201  default:
3202  ut_print_timestamp(stderr);
3203  fprintf(stderr,
3204  " InnoDB: Error: Linux Native AIO setup"
3205  " returned following error[%d]\n", -ret);
3206  break;
3207  }
3208 
3209  fprintf(stderr,
3210  "InnoDB: You can disable Linux Native AIO by"
3211  " setting innodb_native_aio = off in my.cnf\n");
3212  return(FALSE);
3213 }
3214 #endif /* LINUX_NATIVE_AIO */
3215 
3216 /******************************************************************/
3221 static
3223 os_aio_array_create(
3224 /*================*/
3225  ulint n,
3228  ulint n_segments)
3229 {
3230  os_aio_array_t* array;
3231  ulint i;
3232  os_aio_slot_t* slot;
3233 #ifdef WIN_ASYNC_IO
3234  OVERLAPPED* over;
3235 #elif defined(LINUX_NATIVE_AIO)
3236  struct io_event* aio_event = NULL;
3237 #endif
3238  ut_a(n > 0);
3239  ut_a(n_segments > 0);
3240 
3241  array = static_cast<os_aio_array_t *>(ut_malloc(sizeof(os_aio_array_t)));
3242 
3243  array->mutex = os_mutex_create();
3244  array->not_full = os_event_create(NULL);
3245  array->is_empty = os_event_create(NULL);
3246 
3247  os_event_set(array->is_empty);
3248 
3249  array->n_slots = n;
3250  array->n_segments = n_segments;
3251  array->n_reserved = 0;
3252  array->cur_seg = 0;
3253  array->slots = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
3254 #ifdef __WIN__
3255  array->handles = ut_malloc(n * sizeof(HANDLE));
3256 #endif
3257 
3258 #if defined(LINUX_NATIVE_AIO)
3259  array->aio_ctx = NULL;
3260  array->aio_events = NULL;
3261 
3262  /* If we are not using native aio interface then skip this
3263  part of initialization. */
3264  if (!srv_use_native_aio) {
3265  goto skip_native_aio;
3266  }
3267 
3268  /* Initialize the io_context array. One io_context
3269  per segment in the array. */
3270 
3271  array->aio_ctx = (io_context**) ut_malloc(n_segments *
3272  sizeof(*array->aio_ctx));
3273  for (i = 0; i < n_segments; ++i) {
3274  if (!os_aio_linux_create_io_ctx(n/n_segments,
3275  &array->aio_ctx[i])) {
3276  /* If something bad happened during aio setup
3277  we should call it a day and return right away.
3278  We don't care about any leaks because a failure
3279  to initialize the io subsystem means that the
3280  server (or atleast the innodb storage engine)
3281  is not going to startup. */
3282  return(NULL);
3283  }
3284  }
3285 
3286  /* Initialize the event array. One event per slot. */
3287  aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
3288  memset(aio_event, 0x0, sizeof(io_event) * n);
3289  array->aio_events = aio_event;
3290 
3291 skip_native_aio:
3292 #endif /* LINUX_NATIVE_AIO */
3293  for (i = 0; i < n; i++) {
3294  slot = os_aio_array_get_nth_slot(array, i);
3295 
3296  slot->pos = i;
3297  slot->reserved = FALSE;
3298 #ifdef WIN_ASYNC_IO
3299  slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3300 
3301  over = &(slot->control);
3302 
3303  over->hEvent = slot->handle;
3304 
3305  *((array->handles) + i) = over->hEvent;
3306 
3307 #elif defined(LINUX_NATIVE_AIO)
3308 
3309  memset(&slot->control, 0x0, sizeof(slot->control));
3310  slot->n_bytes = 0;
3311  slot->ret = 0;
3312 #endif
3313  }
3314 
3315  return(array);
3316 }
3317 
3318 /************************************************************************/
3320 static
3321 void
3322 os_aio_array_free(
3323 /*==============*/
3324  os_aio_array_t* array)
3325 {
3326 #ifdef WIN_ASYNC_IO
3327  ulint i;
3328 
3329  for (i = 0; i < array->n_slots; i++) {
3330  os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3331  CloseHandle(slot->handle);
3332  }
3333 #endif /* WIN_ASYNC_IO */
3334 
3335 #ifdef __WIN__
3336  ut_free(array->handles);
3337 #endif /* __WIN__ */
3338  os_mutex_free(array->mutex);
3339  os_event_free(array->not_full);
3340  os_event_free(array->is_empty);
3341 
3342 #if defined(LINUX_NATIVE_AIO)
3343  if (srv_use_native_aio) {
3344  ut_free(array->aio_events);
3345  ut_free(array->aio_ctx);
3346  }
3347 #endif /* LINUX_NATIVE_AIO */
3348 
3349  ut_free(array->slots);
3350  ut_free(array);
3351 }
3352 
3353 /***********************************************************************
3354 Initializes the asynchronous io system. Creates one array each for ibuf
3355 and log i/o. Also creates one array each for read and write where each
3356 array is divided logically into n_read_segs and n_write_segs
3357 respectively. The caller must create an i/o handler thread for each
3358 segment in these arrays. This function also creates the sync array.
3359 No i/o handler thread needs to be created for that */
3360 UNIV_INTERN
3361 ibool
3362 os_aio_init(
3363 /*========*/
3364  ulint n_per_seg, /*<! in: maximum number of pending aio
3365  operations allowed per segment */
3366  ulint n_read_segs, /*<! in: number of reader threads */
3367  ulint n_write_segs, /*<! in: number of writer threads */
3368  ulint n_slots_sync) /*<! in: number of slots in the sync aio
3369  array */
3370 {
3371  ulint i;
3372  ulint n_segments = 2 + n_read_segs + n_write_segs;
3373 
3374  ut_ad(n_segments >= 4);
3375 
3377 
3378  for (i = 0; i < n_segments; i++) {
3379  srv_set_io_thread_op_info(i, "not started yet");
3380  }
3381 
3382 
3383  /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3384 
3385  os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3386  if (os_aio_ibuf_array == NULL) {
3387  goto err_exit;
3388  }
3389 
3390  srv_io_thread_function[0] = "insert buffer thread";
3391 
3392  os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3393  if (os_aio_log_array == NULL) {
3394  goto err_exit;
3395  }
3396 
3397  srv_io_thread_function[1] = "log thread";
3398 
3399  os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3400  n_read_segs);
3401  if (os_aio_read_array == NULL) {
3402  goto err_exit;
3403  }
3404 
3405  for (i = 2; i < 2 + n_read_segs; i++) {
3406  ut_a(i < SRV_MAX_N_IO_THREADS);
3407  srv_io_thread_function[i] = "read thread";
3408  }
3409 
3410  os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3411  n_write_segs);
3412  if (os_aio_write_array == NULL) {
3413  goto err_exit;
3414  }
3415 
3416  for (i = 2 + n_read_segs; i < n_segments; i++) {
3417  ut_a(i < SRV_MAX_N_IO_THREADS);
3418  srv_io_thread_function[i] = "write thread";
3419  }
3420 
3421  os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3422  if (os_aio_sync_array == NULL) {
3423  goto err_exit;
3424  }
3425 
3426 
3427  os_aio_n_segments = n_segments;
3428 
3429  os_aio_validate();
3430 
3431  os_aio_segment_wait_events = static_cast<os_event_t *>(ut_malloc(n_segments * sizeof(void*)));
3432 
3433  for (i = 0; i < n_segments; i++) {
3434  os_aio_segment_wait_events[i] = os_event_create(NULL);
3435  }
3436 
3437  os_last_printout = time(NULL);
3438 
3439  return(TRUE);
3440 
3441 err_exit:
3442  return(FALSE);
3443 
3444 }
3445 
3446 /***********************************************************************
3447 Frees the asynchronous io system. */
3448 UNIV_INTERN
3449 void
3450 os_aio_free(void)
3451 /*=============*/
3452 {
3453  ulint i;
3454 
3455  os_aio_array_free(os_aio_ibuf_array);
3456  os_aio_ibuf_array = NULL;
3457  os_aio_array_free(os_aio_log_array);
3458  os_aio_log_array = NULL;
3459  os_aio_array_free(os_aio_read_array);
3460  os_aio_read_array = NULL;
3461  os_aio_array_free(os_aio_write_array);
3462  os_aio_write_array = NULL;
3463  os_aio_array_free(os_aio_sync_array);
3464  os_aio_sync_array = NULL;
3465 
3466  for (i = 0; i < os_aio_n_segments; i++) {
3467  os_event_free(os_aio_segment_wait_events[i]);
3468  }
3469 
3470  ut_free(os_aio_segment_wait_events);
3471  os_aio_segment_wait_events = 0;
3472  os_aio_n_segments = 0;
3473 }
3474 
3475 #ifdef WIN_ASYNC_IO
3476 /************************************************************************/
3479 static
3480 void
3481 os_aio_array_wake_win_aio_at_shutdown(
3482 /*==================================*/
3483  os_aio_array_t* array)
3484 {
3485  ulint i;
3486 
3487  for (i = 0; i < array->n_slots; i++) {
3488 
3489  SetEvent((array->slots + i)->handle);
3490  }
3491 }
3492 #endif
3493 
3494 /************************************************************************/
3497 UNIV_INTERN
3498 void
3500 /*=====================================*/
3501 {
3502  ulint i;
3503 
3504 #ifdef WIN_ASYNC_IO
3505  /* This code wakes up all ai/o threads in Windows native aio */
3506  os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3507  os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3508  os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3509  os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3510 
3511 #elif defined(LINUX_NATIVE_AIO)
3512 
3513  /* When using native AIO interface the io helper threads
3514  wait on io_getevents with a timeout value of 500ms. At
3515  each wake up these threads check the server status.
3516  No need to do anything to wake them up. */
3517 
3518  if (srv_use_native_aio) {
3519  return;
3520  }
3521  /* Fall through to simulated AIO handler wakeup if we are
3522  not using native AIO. */
3523 #endif
3524  /* This loop wakes up all simulated ai/o threads */
3525 
3526  for (i = 0; i < os_aio_n_segments; i++) {
3527 
3528  os_event_set(os_aio_segment_wait_events[i]);
3529  }
3530 }
3531 
3532 /************************************************************************/
3535 UNIV_INTERN
3536 void
3538 /*=====================================*/
3539 {
3540  os_event_wait(os_aio_write_array->is_empty);
3541 }
3542 
3543 /**********************************************************************/
3547 static
3548 ulint
3549 os_aio_get_segment_no_from_slot(
3550 /*============================*/
3551  os_aio_array_t* array,
3552  os_aio_slot_t* slot)
3553 {
3554  ulint segment;
3555  ulint seg_len;
3556 
3557  if (array == os_aio_ibuf_array) {
3558  segment = 0;
3559 
3560  } else if (array == os_aio_log_array) {
3561  segment = 1;
3562 
3563  } else if (array == os_aio_read_array) {
3564  seg_len = os_aio_read_array->n_slots
3565  / os_aio_read_array->n_segments;
3566 
3567  segment = 2 + slot->pos / seg_len;
3568  } else {
3569  ut_a(array == os_aio_write_array);
3570  seg_len = os_aio_write_array->n_slots
3571  / os_aio_write_array->n_segments;
3572 
3573  segment = os_aio_read_array->n_segments + 2
3574  + slot->pos / seg_len;
3575  }
3576 
3577  return(segment);
3578 }
3579 
3580 /**********************************************************************/
3583 static
3584 ulint
3585 os_aio_get_array_and_local_segment(
3586 /*===============================*/
3587  os_aio_array_t** array,
3588  ulint global_segment)
3589 {
3590  ulint segment;
3591 
3592  ut_a(global_segment < os_aio_n_segments);
3593 
3594  if (global_segment == 0) {
3595  *array = os_aio_ibuf_array;
3596  segment = 0;
3597 
3598  } else if (global_segment == 1) {
3599  *array = os_aio_log_array;
3600  segment = 0;
3601 
3602  } else if (global_segment < os_aio_read_array->n_segments + 2) {
3603  *array = os_aio_read_array;
3604 
3605  segment = global_segment - 2;
3606  } else {
3607  *array = os_aio_write_array;
3608 
3609  segment = global_segment - (os_aio_read_array->n_segments + 2);
3610  }
3611 
3612  return(segment);
3613 }
3614 
3615 /*******************************************************************/
/* Reserves a free slot of an aio array for one i/o request and records
the request in it. While every slot is reserved the caller waits on
array->not_full and then tries again. The search for a free slot starts
in a preferred local segment derived from offset and wraps around the
whole array. The array mutex is held while the slot is picked and
filled in.
NOTE(review): the line carrying the return type is absent from this
listing; the function returns `slot`, which is an os_aio_slot_t* -
confirm against the original source.
Returns a pointer to the reserved slot. */
3619 static
3621 os_aio_array_reserve_slot(
3622 /*======================*/
3623  ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
3624  os_aio_array_t* array, /*!< in: aio array to take the slot from */
3625  fil_node_t* message1, /*!< in: stored unchanged in slot->message1 */
3627  void* message2, /*!< in: stored unchanged in slot->message2 */
3629  os_file_t file, /*!< in: file handle of the request */
3630  const char* name, /*!< in: stored in slot->name */
3632  void* buf, /*!< in: buffer to read into or write from */
3634  ulint offset, /*!< in: low part of the file offset */
3636  ulint offset_high, /*!< in: high part of the file offset */
3638  ulint len) /*!< in: length of the request in bytes */
3639 {
3640  os_aio_slot_t* slot = NULL;
3641 #ifdef WIN_ASYNC_IO
3642  OVERLAPPED* control;
3643
3644 #elif defined(LINUX_NATIVE_AIO)
3645
3646  struct iocb* iocb;
3647  off_t aio_offset;
3648
3649 #endif
3650  ulint i;
3651  ulint counter;
3652  ulint slots_per_seg;
3653  ulint local_seg;
3654
3655 #ifdef WIN_ASYNC_IO
 /* The length must fit in 32 bits for Windows async i/o. */
3656  ut_a((len & 0xFFFFFFFFUL) == len);
3657 #endif
3658
3659  /* No need of a mutex. Only reading constant fields */
3660  slots_per_seg = array->n_slots / array->n_segments;
3661
3662  /* We attempt to keep adjacent blocks in the same local
3663  segment. This can help in merging IO requests when we are
3664  doing simulated AIO */
3665  local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3666  % array->n_segments;
3667
3668 loop:
3669  os_mutex_enter(array->mutex);
3670
 /* Array full: release the mutex, wait for a slot to be freed and
 start over. */
3671  if (array->n_reserved == array->n_slots) {
3672  os_mutex_exit(array->mutex);
3673
3674  if (!srv_use_native_aio) {
3675  /* If the handler threads are suspended, wake them
3676  so that we get more slots */
3677
 /* NOTE(review): this branch has no statement in this listing, whose
 own numbering skips a line here; the comment above implies the
 simulated aio handler threads are woken at this point - confirm
 against the original source. */
3679  }
3680
3681  os_event_wait(array->not_full);
3682
3683  goto loop;
3684  }
3685
3686  /* We start our search for an available slot from our preferred
3687  local segment and do a full scan of the array. We are
3688  guaranteed to find a slot in full scan. */
3689  for (i = local_seg * slots_per_seg, counter = 0;
3690  counter < array->n_slots; i++, counter++) {
3691
 /* Wrap around at the end of the array. */
3692  i %= array->n_slots;
3693  slot = os_aio_array_get_nth_slot(array, i);
3694
3695  if (slot->reserved == FALSE) {
3696  goto found;
3697  }
3698  }
3699
3700  /* We MUST always be able to get hold of a reserved slot. */
3701  ut_error;
3702
3703 found:
3704  ut_a(slot->reserved == FALSE);
3705  array->n_reserved++;
3706
 /* First reservation: the array is no longer empty. */
3707  if (array->n_reserved == 1) {
3708  os_event_reset(array->is_empty);
3709  }
3710
 /* Last free slot taken: later callers must wait on not_full. */
3711  if (array->n_reserved == array->n_slots) {
3712  os_event_reset(array->not_full);
3713  }
3714
3715  slot->reserved = TRUE;
3716  slot->reservation_time = time(NULL);
3717  slot->message1 = message1;
3718  slot->message2 = message2;
3719  slot->file = file;
3720  slot->name = name;
3721  slot->len = len;
3722  slot->type = type;
3723  slot->buf = static_cast<unsigned char *>(buf);
3724  slot->offset = offset;
3725  slot->offset_high = offset_high;
3726  slot->io_already_done = FALSE;
3727
3728 #ifdef WIN_ASYNC_IO
 /* Store the file position in the slot's OVERLAPPED block and clear
 the slot's event. */
3729  control = &(slot->control);
3730  control->Offset = (DWORD)offset;
3731  control->OffsetHigh = (DWORD)offset_high;
3732  ResetEvent(slot->handle);
3733
3734 #elif defined(LINUX_NATIVE_AIO)
3735
3736  /* If we are not using native AIO skip this part. */
3737  if (!srv_use_native_aio) {
3738  goto skip_native_aio;
3739  }
3740
3741  /* Check if we are dealing with 64 bit arch.
3742  If not then make sure that offset fits in 32 bits. */
3743  if (sizeof(aio_offset) == 8) {
3744  aio_offset = offset_high;
3745  aio_offset <<= 32;
3746  aio_offset += offset;
3747  } else {
3748  ut_a(offset_high == 0);
3749  aio_offset = offset;
3750  }
3751
3752  iocb = &slot->control;
3753
 /* Prepare the iocb stored in the slot for a read or a write. */
3754  if (type == OS_FILE_READ) {
3755  io_prep_pread(iocb, file, buf, len, aio_offset);
3756  } else {
3757  ut_a(type == OS_FILE_WRITE);
3758  io_prep_pwrite(iocb, file, buf, len, aio_offset);
3759  }
3760
 /* Back-pointer from the iocb to its slot. */
3761  iocb->data = (void*)slot;
3762  slot->n_bytes = 0;
3763  slot->ret = 0;
3764  /*fprintf(stderr, "Filled up Linux native iocb.\n");*/
3765
3766
3767 skip_native_aio:
3768 #endif /* LINUX_NATIVE_AIO */
3769  os_mutex_exit(array->mutex);
3770
3771  return(slot);
3772 }
3773 
3774 /*******************************************************************/
3776 static
3777 void
3778 os_aio_array_free_slot(
3779 /*===================*/
3780  os_aio_array_t* array,
3781  os_aio_slot_t* slot)
3782 {
3783  ut_ad(array);
3784  ut_ad(slot);
3785 
3786  os_mutex_enter(array->mutex);
3787 
3788  ut_ad(slot->reserved);
3789 
3790  slot->reserved = FALSE;
3791 
3792  array->n_reserved--;
3793 
3794  if (array->n_reserved == array->n_slots - 1) {
3795  os_event_set(array->not_full);
3796  }
3797 
3798  if (array->n_reserved == 0) {
3799  os_event_set(array->is_empty);
3800  }
3801 
3802 #ifdef WIN_ASYNC_IO
3803 
3804  ResetEvent(slot->handle);
3805 
3806 #elif defined(LINUX_NATIVE_AIO)
3807 
3808  if (srv_use_native_aio) {
3809  memset(&slot->control, 0x0, sizeof(slot->control));
3810  slot->n_bytes = 0;
3811  slot->ret = 0;
3812  /*fprintf(stderr, "Freed up Linux native slot.\n");*/
3813  } else {
3814  /* These fields should not be used if we are not
3815  using native AIO. */
3816  ut_ad(slot->n_bytes == 0);
3817  ut_ad(slot->ret == 0);
3818  }
3819 
3820 #endif
3821  os_mutex_exit(array->mutex);
3822 }
3823 
3824 /**********************************************************************/
3826 static
3827 void
3828 os_aio_simulated_wake_handler_thread(
3829 /*=================================*/
3830  ulint global_segment)
3832 {
3833  os_aio_array_t* array;
3834  os_aio_slot_t* slot;
3835  ulint segment;
3836  ulint n;
3837  ulint i;
3838 
3839  ut_ad(!srv_use_native_aio);
3840 
3841  segment = os_aio_get_array_and_local_segment(&array, global_segment);
3842 
3843  n = array->n_slots / array->n_segments;
3844 
3845  /* Look through n slots after the segment * n'th slot */
3846 
3847  os_mutex_enter(array->mutex);
3848 
3849  for (i = 0; i < n; i++) {
3850  slot = os_aio_array_get_nth_slot(array, i + segment * n);
3851 
3852  if (slot->reserved) {
3853  /* Found an i/o request */
3854 
3855  break;
3856  }
3857  }
3858 
3859  os_mutex_exit(array->mutex);
3860 
3861  if (i < n) {
3862  os_event_set(os_aio_segment_wait_events[global_segment]);
3863  }
3864 }
3865 
3866 /**********************************************************************/
3868 UNIV_INTERN
3869 void
3871 /*=======================================*/
3872 {
3873  ulint i;
3874 
3875  if (srv_use_native_aio) {
3876  /* We do not use simulated aio: do nothing */
3877 
3878  return;
3879  }
3880 
3881  os_aio_recommend_sleep_for_read_threads = FALSE;
3882 
3883  for (i = 0; i < os_aio_n_segments; i++) {
3884  os_aio_simulated_wake_handler_thread(i);
3885  }
3886 }
3887 
3888 /**********************************************************************/
3893 UNIV_INTERN
3894 void
3896 /*============================================*/
3897 {
3898 
3899 /* The idea of putting background IO threads to sleep is only for
3900 Windows when using simulated AIO. Windows XP seems to schedule
3901 background threads too eagerly to allow for coalescing during
3902 readahead requests. */
3903 #ifdef __WIN__
3904  os_aio_array_t* array;
3905  ulint g;
3906 
3907  if (srv_use_native_aio) {
3908  /* We do not use simulated aio: do nothing */
3909 
3910  return;
3911  }
3912 
3913  os_aio_recommend_sleep_for_read_threads = TRUE;
3914 
3915  for (g = 0; g < os_aio_n_segments; g++) {
3916  os_aio_get_array_and_local_segment(&array, g);
3917 
3918  if (array == os_aio_read_array) {
3919 
3920  os_event_reset(os_aio_segment_wait_events[g]);
3921  }
3922  }
3923 #endif /* __WIN__ */
3924 }
3925 
3926 #if defined(LINUX_NATIVE_AIO)
3927 /*******************************************************************/
3930 static
3931 ibool
3932 os_aio_linux_dispatch(
3933 /*==================*/
3934  os_aio_array_t* array,
3935  os_aio_slot_t* slot)
3936 {
3937  int ret;
3938  ulint io_ctx_index;
3939  struct iocb* iocb;
3940 
3941  ut_ad(slot != NULL);
3942  ut_ad(array);
3943 
3944  ut_a(slot->reserved);
3945 
3946  /* Find out what we are going to work with.
3947  The iocb struct is directly in the slot.
3948  The io_context is one per segment. */
3949 
3950  iocb = &slot->control;
3951  io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
3952 
3953  ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
3954 
3955 #if defined(UNIV_AIO_DEBUG)
3956  fprintf(stderr,
3957  "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
3958  (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
3959  array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
3960 #endif
3961 
3962  /* io_submit returns number of successfully
3963  queued requests or -errno. */
3964  if (UNIV_UNLIKELY(ret != 1)) {
3965  errno = -ret;
3966  return(FALSE);
3967  }
3968 
3969  return(TRUE);
3970 }
3971 #endif /* LINUX_NATIVE_AIO */
3972 
3973 
3974 /*******************************************************************/
/* Requests a read or write of n bytes on a file. Depending on mode the
operation is either executed synchronously in the calling thread
(os_file_read() / os_file_write()) or a slot is reserved in one of the aio
arrays and the request is handed to native aio or to a simulated aio
handler thread.
NOTE(review): the line carrying the function name is absent from this
listing (nothing appears between the return type and the parameter list);
restore it from the upstream file.
Returns: for the plain synchronous path, whatever os_file_read() or
os_file_write() returns; TRUE once a request has been queued; the result of
os_aio_windows_handle() for OS_AIO_SYNC under Windows native aio; FALSE when
submission failed and os_file_handle_error() did not ask for a retry. */
3978 UNIV_INTERN
3979 ibool
3981 /*========*/
3982  ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE; any other value reaches ut_error */
3983  ulint mode, /* in: OS_AIO_NORMAL, OS_AIO_IBUF, OS_AIO_LOG or OS_AIO_SYNC, optionally OR-ed with OS_AIO_SIMULATED_WAKE_LATER */
3996  const char* name, /* in: file name; forwarded to os_file_write(), the slot and os_file_handle_error() */
3998  os_file_t file, /* in: handle of the file to operate on */
3999  void* buf, /* in/out: buffer handed to the read or write call */
4001  ulint offset, /* in: file offset, asserted to be a multiple of OS_FILE_LOG_BLOCK_SIZE; presumably the low-order part -- confirm */
4003  ulint offset_high, /* in: presumably the high-order part of the file offset -- confirm */
4005  ulint n, /* in: number of bytes; asserted > 0 and a multiple of OS_FILE_LOG_BLOCK_SIZE */
4006  fil_node_t* message1, /* in: stored in the slot; the completion handlers return it to their caller */
4010  void* message2) /* in: stored in the slot; the completion handlers return it to their caller */
4014 {
4015  os_aio_array_t* array;
4016  os_aio_slot_t* slot;
4017 #ifdef WIN_ASYNC_IO
4018  ibool retval;
4019  BOOL ret = TRUE;
4020  DWORD len = (DWORD) n;
4021  struct fil_node_struct * dummy_mess1;
4022  void* dummy_mess2;
4023  ulint dummy_type;
4024 #endif /* WIN_ASYNC_IO */
4025 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4026  ibool retry;
4027 #endif
4028  ulint wake_later;
4029 
4030  ut_ad(file);
4031  ut_ad(buf);
4032  ut_ad(n > 0);
4033  ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4034  ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4036 #ifdef WIN_ASYNC_IO
 /* The Windows calls below take the length as a DWORD, so it must
 fit in 32 bits. */
4037  ut_ad((n & 0xFFFFFFFFUL) == n);
4038 #endif
4039 
 /* Split the wake-later flag off the mode so that the comparisons
 below see the bare mode value. */
4040  wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4041  mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4042 
4043  if (mode == OS_AIO_SYNC
4044 #ifdef WIN_ASYNC_IO
4045  && !srv_use_native_aio
4046 #endif /* WIN_ASYNC_IO */
4047  ) {
4048  /* This is actually an ordinary synchronous read or write:
4049  no need to use an i/o-handler thread. NOTE that if we use
4050  Windows async i/o, Windows does not allow us to use
4051  ordinary synchronous os_file_read etc. on the same file,
4052  therefore we have built a special mechanism for synchronous
4053  wait in the Windows case. */
4054 
4055  if (type == OS_FILE_READ) {
4056  return(os_file_read(file, buf, offset,
4057  offset_high, n));
4058  }
4059 
4060  ut_a(type == OS_FILE_WRITE);
4061 
4062  return(os_file_write(name, file, buf, offset, offset_high, n));
4063  }
4064 
4065 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
 /* Re-entered from err_exit when os_file_handle_error() asks for a
 retry; the slot has been freed by then, so a new one is reserved. */
4066 try_again:
4067 #endif
 /* Pick the aio array that corresponds to the mode (and, for
 OS_AIO_NORMAL, to the direction of the transfer). */
4068  if (mode == OS_AIO_NORMAL) {
4069  if (type == OS_FILE_READ) {
4070  array = os_aio_read_array;
4071  } else {
4072  array = os_aio_write_array;
4073  }
4074  } else if (mode == OS_AIO_IBUF) {
4075  ut_ad(type == OS_FILE_READ);
4076  /* Reduce probability of deadlock bugs in connection with ibuf:
4077  do not let the ibuf i/o handler sleep */
4078 
4079  wake_later = FALSE;
4080 
4081  array = os_aio_ibuf_array;
4082  } else if (mode == OS_AIO_LOG) {
4083 
4084  array = os_aio_log_array;
4085  } else if (mode == OS_AIO_SYNC) {
4086  array = os_aio_sync_array;
4087 
4088 #if defined(LINUX_NATIVE_AIO)
4089  /* In Linux native AIO we don't use sync IO array. */
4090  ut_a(!srv_use_native_aio);
4091 #endif /* LINUX_NATIVE_AIO */
4092  } else {
4093  array = NULL; /* Eliminate compiler warning */
4094  ut_error;
4095  }
4096 
4097  slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4098  name, buf, offset, offset_high, n);
 /* With native aio the request is submitted right here; otherwise the
 simulated aio handler thread of the slot's segment is woken, unless
 the caller asked to be woken later. */
4099  if (type == OS_FILE_READ) {
4100  if (srv_use_native_aio) {
4101  os_n_file_reads++;
4102  os_bytes_read_since_printout += n;
4103 #ifdef WIN_ASYNC_IO
4104  ret = ReadFile(file, buf, (DWORD)n, &len,
4105  &(slot->control));
4106 
4107 #elif defined(LINUX_NATIVE_AIO)
4108  if (!os_aio_linux_dispatch(array, slot)) {
4109  goto err_exit;
4110  }
4111 #endif
4112  } else {
4113  if (!wake_later) {
4114  os_aio_simulated_wake_handler_thread(
4115  os_aio_get_segment_no_from_slot(
4116  array, slot));
4117  }
4118  }
4119  } else if (type == OS_FILE_WRITE) {
4120  if (srv_use_native_aio) {
4121  os_n_file_writes++;
4122 #ifdef WIN_ASYNC_IO
4123  ret = WriteFile(file, buf, (DWORD)n, &len,
4124  &(slot->control));
4125 
4126 #elif defined(LINUX_NATIVE_AIO)
4127  if (!os_aio_linux_dispatch(array, slot)) {
4128  goto err_exit;
4129  }
4130 #endif
4131  } else {
4132  if (!wake_later) {
4133  os_aio_simulated_wake_handler_thread(
4134  os_aio_get_segment_no_from_slot(
4135  array, slot));
4136  }
4137  }
4138  } else {
4139  ut_error;
4140  }
4141 
4142 #ifdef WIN_ASYNC_IO
4143  if (srv_use_native_aio) {
 /* Success is either an immediate full transfer or a pending
 overlapped operation. */
4144  if ((ret && len == n)
4145  || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4146  /* aio was queued successfully! */
4147 
4148  if (mode == OS_AIO_SYNC) {
4149  /* We want a synchronous i/o operation on a
4150  file where we also use async i/o: in Windows
4151  we must use the same wait mechanism as for
4152  async i/o */
4153 
4154  retval = os_aio_windows_handle(ULINT_UNDEFINED,
4155  slot->pos,
4156  &dummy_mess1,
4157  &dummy_mess2,
4158  &dummy_type);
4159 
4160  return(retval);
4161  }
4162 
4163  return(TRUE);
4164  }
4165 
4166  goto err_exit;
4167  }
4168 #endif /* WIN_ASYNC_IO */
4169  /* aio was queued successfully! */
4170  return(TRUE);
4171 
4172 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
 /* Submission failed: give the slot back first, then let
 os_file_handle_error() decide whether the whole request is retried. */
4173 err_exit:
4174  os_aio_array_free_slot(array, slot);
4175 
4176  retry = os_file_handle_error(name,
4177  type == OS_FILE_READ
4178  ? "aio read" : "aio write");
4179  if (retry) {
4180 
4181  goto try_again;
4182  }
4183 
4184  return(FALSE);
4185 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4186 }
4187 
4188 #ifdef WIN_ASYNC_IO
4189 /**********************************************************************/
/* Waits for the completion of a Windows overlapped i/o request in one
segment of an aio array (or of one specific slot of the sync array),
fetches its result with GetOverlappedResult(), copies the slot's messages
and type to the caller and frees the slot. If the request failed and
os_file_handle_error() asks for a retry, the same operation is re-issued
from this function and waited for before returning.
Returns TRUE if the request (or its retry) transferred exactly slot->len
bytes, FALSE otherwise. */
4197 UNIV_INTERN
4198 ibool
4199 os_aio_windows_handle(
4200 /*==================*/
4201  ulint segment, /* in: global segment number, or ULINT_UNDEFINED to wait on the sync array */
4209  ulint pos, /* in: slot index to wait for; only used when the sync array is selected */
4211  fil_node_t**message1, /* out: message1 stored in the completed slot */
4216  void** message2, /* out: message2 stored in the completed slot */
4217  ulint* type) /* out: type stored in the completed slot */
4218 {
4219  ulint orig_seg = segment;
4220  os_aio_array_t* array;
4221  os_aio_slot_t* slot;
4222  ulint n;
4223  ulint i;
4224  ibool ret_val;
4225  BOOL ret;
4226  DWORD len;
4227  BOOL retry = FALSE;
4228 
4229  if (segment == ULINT_UNDEFINED) {
4230  array = os_aio_sync_array;
4231  segment = 0;
4232  } else {
4233  segment = os_aio_get_array_and_local_segment(&array, segment);
4234  }
4235 
4236  /* NOTE! We only access constant fields in os_aio_array. Therefore
4237  we do not have to acquire the protecting mutex yet */
4238 
4240  ut_ad(segment < array->n_segments);
4241 
 /* Number of slots per segment. */
4242  n = array->n_slots / array->n_segments;
4243 
4244  if (array == os_aio_sync_array) {
4245  WaitForSingleObject(
4246  os_aio_array_get_nth_slot(array, pos)->handle,
4247  INFINITE);
4248  i = pos;
4249  } else {
4250  srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4251  i = WaitForMultipleObjects((DWORD) n,
4252  array->handles + segment * n,
4253  FALSE,
4254  INFINITE);
4255  }
4256 
 /* NOTE(review): the closing brace two lines below has no matching
 opener in this listing; the condition that guards os_thread_exit()
 appears to have been lost -- restore it from the upstream file. */
4258  os_thread_exit(NULL);
4259  }
4260 
4261  os_mutex_enter(array->mutex);
4262 
4263  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4264 
4265  ut_a(slot->reserved);
4266 
4267  if (orig_seg != ULINT_UNDEFINED) {
4268  srv_set_io_thread_op_info(orig_seg,
4269  "get windows aio return value");
4270  }
4271 
4272  ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
4273 
4274  *message1 = slot->message1;
4275  *message2 = slot->message2;
4276 
4277  *type = slot->type;
4278 
4279  if (ret && len == slot->len) {
4280  ret_val = TRUE;
4281 
4282 #ifdef UNIV_DO_FLUSH
4283  if (slot->type == OS_FILE_WRITE
4284  && !os_do_not_call_flush_at_each_write) {
4285  if (!os_file_flush(slot->file)) {
4286  ut_error;
4287  }
4288  }
4289 #endif /* UNIV_DO_FLUSH */
4290  } else if (os_file_handle_error(slot->name, "Windows aio")) {
4291 
 /* ret_val is assigned in the retry block further down. */
4292  retry = TRUE;
4293  } else {
4294 
4295  ret_val = FALSE;
4296  }
4297 
4298  os_mutex_exit(array->mutex);
4299 
4300  if (retry) {
4301  /* retry failed read/write operation synchronously.
4302  No need to hold array->mutex. */
4303 
4304 #ifdef UNIV_PFS_IO
4305  /* This read/write does not go through os_file_read
4306  and os_file_write APIs, need to register with
4307  performance schema explicitly here. */
4308  struct PSI_file_locker* locker = NULL;
4309  register_pfs_file_io_begin(locker, slot->file, slot->len,
4310  (slot->type == OS_FILE_WRITE)
4311  ? PSI_FILE_WRITE
4312  : PSI_FILE_READ,
4313  __FILE__, __LINE__);
4314 #endif
4315 
 /* The length is cast to DWORD below, so it must fit in 32 bits. */
4316  ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4317 
4318  switch (slot->type) {
4319  case OS_FILE_WRITE:
4320  ret = WriteFile(slot->file, slot->buf,
4321  (DWORD) slot->len, &len,
4322  &(slot->control));
4323 
4324  break;
4325  case OS_FILE_READ:
4326  ret = ReadFile(slot->file, slot->buf,
4327  (DWORD) slot->len, &len,
4328  &(slot->control));
4329 
4330  break;
4331  default:
4332  ut_error;
4333  }
4334 
4335 #ifdef UNIV_PFS_IO
4336  register_pfs_file_io_end(locker, len);
4337 #endif
4338 
4339  if (!ret && GetLastError() == ERROR_IO_PENDING) {
4340  /* aio was queued successfully!
4341  We want a synchronous i/o operation on a
4342  file where we also use async i/o: in Windows
4343  we must use the same wait mechanism as for
4344  async i/o */
4345 
4346  ret = GetOverlappedResult(slot->file,
4347  &(slot->control),
4348  &len, TRUE);
4349  }
4350 
4351  ret_val = ret && len == slot->len;
4352  }
4353 
 /* The slot is released on every path, successful or not. */
4354  os_aio_array_free_slot(array, slot);
4355 
4356  return(ret_val);
4357 }
4358 #endif
4359 
4360 #if defined(LINUX_NATIVE_AIO)
4361 /******************************************************************/
/* Collects completed Linux native aio requests for one segment of an aio
array. io_getevents() is called on the segment's io context with a timeout
of OS_AIO_REAP_TIMEOUT nanoseconds and is retried for as long as it returns
-EAGAIN, -EINTR or 0; any other negative return value is printed and
trapped with ut_error. For every event returned, the owning slot (reached
through the iocb's data pointer) gets n_bytes, ret and io_already_done set
while array->mutex is held. The function therefore returns only after at
least one request has been marked completed. */
4372 static
4373 void
4374 os_aio_linux_collect(
4375 /*=================*/
4376  os_aio_array_t* array, /* in: aio array to collect from */
4377  ulint segment, /* in: segment number local to the array; asserted < array->n_segments */
4378  ulint seg_size) /* in: number of slots per segment; asserted > 0 */
4379 {
4380  int i;
4381  int ret;
4382  ulint start_pos;
4383  ulint end_pos;
4384  struct timespec timeout;
4385  struct io_event* events;
4386  struct io_context* io_ctx;
4387 
4388  /* sanity checks. */
4389  ut_ad(array != NULL);
4390  ut_ad(seg_size > 0);
4391  ut_ad(segment < array->n_segments);
4392 
4393  /* Which part of event array we are going to work on. */
4394  events = &array->aio_events[segment * seg_size];
4395 
4396  /* Which io_context we are going to use. */
4397  io_ctx = array->aio_ctx[segment];
4398 
4399  /* Starting point of the segment we will be working on. */
4400  start_pos = segment * seg_size;
4401 
4402  /* End point. */
4403  end_pos = start_pos + seg_size;
4404 
4405 retry:
4406 
4407  /* Go down if we are in shutdown mode.
4408  In case of srv_fast_shutdown == 2, there may be pending
4409  IO requests but that should be OK as we essentially treat
4410  that as a crash of InnoDB. */
 /* NOTE(review): the closing brace two lines below has no matching
 opener in this listing; the shutdown condition that guards
 os_thread_exit() appears to have been lost -- restore it from the
 upstream file. */
4412  os_thread_exit(NULL);
4413  }
4414 
4415  /* Initialize the events. The timeout value is arbitrary.
4416  We probably need to experiment with it a little. */
4417  memset(events, 0, sizeof(*events) * seg_size);
4418  timeout.tv_sec = 0;
4419  timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4420 
 /* Ask for at least 1 and at most seg_size completed events. */
4421  ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4422 
4423  /* This error handling is for any error in collecting the
4424  IO requests. The errors, if any, for any particular IO
4425  request are simply passed on to the calling routine. */
4426 
4427  /* Not enough resources! Try again. */
4428  if (ret == -EAGAIN) {
4429  goto retry;
4430  }
4431 
4432  /* Interrupted! I have tested the behaviour in case of an
4433  interrupt. If we have some completed IOs available then
4434  the return code will be the number of IOs. We get EINTR only
4435  if there are no completed IOs and we have been interrupted. */
4436  if (ret == -EINTR) {
4437  goto retry;
4438  }
4439 
4440  /* No pending request! Go back and check again. */
4441  if (ret == 0) {
4442  goto retry;
4443  }
4444 
4445  /* All other errors! should cause a trap for now. */
4446  if (UNIV_UNLIKELY(ret < 0)) {
4447  ut_print_timestamp(stderr);
4448  fprintf(stderr,
4449  " InnoDB: unexpected ret_code[%d] from"
4450  " io_getevents()!\n", ret);
4451  ut_error;
4452  }
4453 
4454  ut_a(ret > 0);
4455 
 /* ret is now the number of events filled in at the start of
 events[]. */
4456  for (i = 0; i < ret; i++) {
4457  os_aio_slot_t* slot;
4458  struct iocb* control;
4459 
4460  control = (struct iocb *)events[i].obj;
4461  ut_a(control != NULL);
4462 
 /* The iocb's data pointer leads back to the slot that owns it. */
4463  slot = (os_aio_slot_t *) control->data;
4464 
4465  /* Some sanity checks. */
4466  ut_a(slot != NULL);
4467  ut_a(slot->reserved);
4468 
4469 #if defined(UNIV_AIO_DEBUG)
4470  fprintf(stderr,
4471  "io_getevents[%c]: slot[%p] ctx[%p]"
4472  " seg[%lu]\n",
4473  (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4474  slot, io_ctx, segment);
4475 #endif
4476 
4477  /* We are not scribbling previous segment. */
4478  ut_a(slot->pos >= start_pos);
4479 
4480  /* We have not overstepped to next segment. */
4481  ut_a(slot->pos < end_pos);
4482 
4483  /* Mark this request as completed. The error handling
4484  will be done in the calling function. */
4485  os_mutex_enter(array->mutex);
4486  slot->n_bytes = events[i].res;
4487  slot->ret = events[i].res2;
4488  slot->io_already_done = TRUE;
4489  os_mutex_exit(array->mutex);
4490  }
4491 
4492  return;
4493 }
4494 
4495 /**********************************************************************/
4503 UNIV_INTERN
4504 ibool
4505 os_aio_linux_handle(
4506 /*================*/
4507  ulint global_seg,
4513  fil_node_t**message1,
4514  void** message2,
4518  ulint* type)
4519 {
4520  ulint segment;
4521  os_aio_array_t* array;
4522  os_aio_slot_t* slot;
4523  ulint n;
4524  ulint i;
4525  ibool ret = FALSE;
4526 
4527  /* Should never be doing Sync IO here. */
4528  ut_a(global_seg != ULINT_UNDEFINED);
4529 
4530  /* Find the array and the local segment. */
4531  segment = os_aio_get_array_and_local_segment(&array, global_seg);
4532  n = array->n_slots / array->n_segments;
4533 
4534  /* Loop until we have found a completed request. */
4535  for (;;) {
4536  os_mutex_enter(array->mutex);
4537  for (i = 0; i < n; ++i) {
4538  slot = os_aio_array_get_nth_slot(
4539  array, i + segment * n);
4540  if (slot->reserved && slot->io_already_done) {
4541  /* Something for us to work on. */
4542  goto found;
4543  }
4544  }
4545 
4546  os_mutex_exit(array->mutex);
4547 
4548  /* We don't have any completed request.
4549  Wait for some request. Note that we return
4550  from wait iff we have found a request. */
4551 
4552  srv_set_io_thread_op_info(global_seg,
4553  "waiting for completed aio requests");
4554  os_aio_linux_collect(array, segment, n);
4555  }
4556 
4557 found:
4558  /* Note that it may be that there are more then one completed
4559  IO requests. We process them one at a time. We may have a case
4560  here to improve the performance slightly by dealing with all
4561  requests in one sweep. */
4562  srv_set_io_thread_op_info(global_seg,
4563  "processing completed aio requests");
4564 
4565  /* Ensure that we are scribbling only our segment. */
4566  ut_a(i < n);
4567 
4568  ut_ad(slot != NULL);
4569  ut_ad(slot->reserved);
4570  ut_ad(slot->io_already_done);
4571 
4572  *message1 = slot->message1;
4573  *message2 = slot->message2;
4574 
4575  *type = slot->type;
4576 
4577  if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4578  ret = TRUE;
4579 
4580 #ifdef UNIV_DO_FLUSH
4581  if (slot->type == OS_FILE_WRITE
4582  && !os_do_not_call_flush_at_each_write)
4583  && !os_file_flush(slot->file) {
4584  ut_error;
4585  }
4586 #endif /* UNIV_DO_FLUSH */
4587  } else {
4588  errno = -slot->ret;
4589 
4590  /* os_file_handle_error does tell us if we should retry
4591  this IO. As it stands now, we don't do this retry when
4592  reaping requests from a different context than
4593  the dispatcher. This non-retry logic is the same for
4594  windows and linux native AIO.
4595  We should probably look into this to transparently
4596  re-submit the IO. */
4597  os_file_handle_error(slot->name, "Linux aio");
4598 
4599  ret = FALSE;
4600  }
4601 
4602  os_mutex_exit(array->mutex);
4603 
4604  os_aio_array_free_slot(array, slot);
4605 
4606  return(ret);
4607 }
4608 #endif /* LINUX_NATIVE_AIO */
4609 
4610 /**********************************************************************/
/* Serves one segment of a simulated-aio array using ordinary synchronous
i/o. In order, the function:
 (1) returns a slot of the segment whose i/o is already marked done, if
     there is one;
 (2) otherwise picks a reserved request -- the oldest one if any has
     waited at least 2 seconds, else the one at the lowest offset -- and
     chains up to OS_AIO_MERGE_N_CONSECUTIVE requests that follow it
     contiguously in the same file with the same type;
 (3) performs the chain as a single os_file_read() / os_file_write(),
     marks all chained slots done and returns the first one;
 (4) if nothing is reserved, waits on the segment's event and starts over.
The returned slot's messages and type are copied to the caller and the
slot is freed.
NOTE(review): the line carrying the function name is absent from this
listing (nothing appears between the return type and the parameter list);
restore it from the upstream file.
Returns TRUE on every return path visible here: the i/o result is asserted
with ut_a(ret) and the already-done path sets ret = TRUE. */
4614 UNIV_INTERN
4615 ibool
4617 /*====================*/
4618  ulint global_segment, /* in: global segment number of the calling i/o handler thread */
4623  fil_node_t**message1, /* out: message1 stored in the returned slot */
4628  void** message2, /* out: message2 stored in the returned slot */
4629  ulint* type) /* out: type stored in the returned slot */
4630 {
4631  os_aio_array_t* array;
4632  ulint segment;
4633  os_aio_slot_t* slot;
4634  os_aio_slot_t* slot2;
4635  os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
4636  ulint n_consecutive;
4637  ulint total_len;
4638  ulint offs;
4639  ulint lowest_offset;
4640  ulint biggest_age;
4641  ulint age;
4642  byte* combined_buf;
4643  byte* combined_buf2;
4644  ibool ret;
4645  ulint n;
4646  ulint i;
4647 
4648  /* Fix compiler warning */
4649  *consecutive_ios = NULL;
4650 
4651  memset(consecutive_ios, 0, sizeof(os_aio_slot_t*) * OS_AIO_MERGE_N_CONSECUTIVE);
4652  segment = os_aio_get_array_and_local_segment(&array, global_segment);
4653 
4654 restart:
4655  /* NOTE! We only access constant fields in os_aio_array. Therefore
4656  we do not have to acquire the protecting mutex yet */
4657 
4658  srv_set_io_thread_op_info(global_segment,
4659  "looking for i/o requests (a)");
4661  ut_ad(segment < array->n_segments);
4662 
 /* Number of slots per segment. */
4663  n = array->n_slots / array->n_segments;
4664 
4665  /* Look through n slots after the segment * n'th slot */
4666 
4667  if (array == os_aio_read_array
4668  && os_aio_recommend_sleep_for_read_threads) {
4669 
4670  /* Give other threads chance to add several i/os to the array
4671  at once. */
4672 
4673  goto recommended_sleep;
4674  }
4675 
4676  os_mutex_enter(array->mutex);
4677 
4678  srv_set_io_thread_op_info(global_segment,
4679  "looking for i/o requests (b)");
4680 
4681  /* Check if there is a slot for which the i/o has already been
4682  done */
4683 
4684  for (i = 0; i < n; i++) {
4685  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4686 
4687  if (slot->reserved && slot->io_already_done) {
4688 
4689  if (os_aio_print_debug) {
4690  fprintf(stderr,
4691  "InnoDB: i/o for slot %lu"
4692  " already done, returning\n",
4693  (ulong) i);
4694  }
4695 
4696  ret = TRUE;
4697 
 /* array->mutex is still held when jumping to slot_io_done. */
4698  goto slot_io_done;
4699  }
4700  }
4701 
4702  n_consecutive = 0;
4703 
4704  /* If there are at least 2 seconds old requests, then pick the oldest
4705  one to prevent starvation. If several requests have the same age,
4706  then pick the one at the lowest offset. */
4707 
4708  biggest_age = 0;
4709  lowest_offset = ULINT_MAX;
4710 
4711  for (i = 0; i < n; i++) {
4712  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4713 
4714  if (slot->reserved) {
4715  age = (ulint)difftime(time(NULL),
4716  slot->reservation_time);
4717 
4718  if ((age >= 2 && age > biggest_age)
4719  || (age >= 2 && age == biggest_age
4720  && slot->offset < lowest_offset)) {
4721 
4722  /* Found an i/o request */
4723  consecutive_ios[0] = slot;
4724 
4725  n_consecutive = 1;
4726 
4727  biggest_age = age;
4728  lowest_offset = slot->offset;
4729  }
4730  }
4731  }
4732 
4733  if (n_consecutive == 0) {
4734  /* There were no old requests. Look for an i/o request at the
4735  lowest offset in the array (we ignore the high 32 bits of the
4736  offset in these heuristics) */
4737 
4738  lowest_offset = ULINT_MAX;
4739 
4740  for (i = 0; i < n; i++) {
4741  slot = os_aio_array_get_nth_slot(array,
4742  i + segment * n);
4743 
4744  if (slot->reserved && slot->offset < lowest_offset) {
4745 
4746  /* Found an i/o request */
4747  consecutive_ios[0] = slot;
4748 
4749  n_consecutive = 1;
4750 
4751  lowest_offset = slot->offset;
4752  }
4753  }
4754  }
4755 
4756  if (n_consecutive == 0) {
4757 
4758  /* No i/o requested at the moment */
4759 
4760  goto wait_for_io;
4761  }
4762 
4763  /* if n_consecutive != 0, then we have assigned
4764  something valid to consecutive_ios[0] */
4765  ut_ad(n_consecutive != 0);
4766  ut_ad(consecutive_ios[0] != NULL);
4767 
4768  slot = consecutive_ios[0];
4769 
4770  /* Check if there are several consecutive blocks to read or write */
4771 
 /* Each time a follower is found, slot advances to it and the scan of
 the segment restarts from the first slot, until no follower exists
 or the chain holds OS_AIO_MERGE_N_CONSECUTIVE entries. */
4772 consecutive_loop:
4773  for (i = 0; i < n; i++) {
4774  slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4775 
4776  if (slot2->reserved && slot2 != slot
4777  && slot2->offset == slot->offset + slot->len
4778  /* check that sum does not wrap over */
4779  && slot->offset + slot->len > slot->offset
4780  && slot2->offset_high == slot->offset_high
4781  && slot2->type == slot->type
4782  && slot2->file == slot->file) {
4783 
4784  /* Found a consecutive i/o request */
4785 
4786  consecutive_ios[n_consecutive] = slot2;
4787  n_consecutive++;
4788 
4789  slot = slot2;
4790 
4791  if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4792 
4793  goto consecutive_loop;
4794  } else {
4795  break;
4796  }
4797  }
4798  }
4799 
4800  srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4801 
4802  /* We have now collected n_consecutive i/o requests in the array;
4803  allocate a single buffer which can hold all data, and perform the
4804  i/o */
4805 
4806  total_len = 0;
4807  slot = consecutive_ios[0];
4808 
4809  for (i = 0; i < n_consecutive; i++) {
4810  total_len += consecutive_ios[i]->len;
4811  }
4812 
4813  if (n_consecutive == 1) {
4814  /* We can use the buffer of the i/o request */
4815  combined_buf = slot->buf;
4816  combined_buf2 = NULL;
4817  } else {
 /* Over-allocate by UNIV_PAGE_SIZE so that the pointer can be
 aligned with ut_align() below; combined_buf2 keeps the raw
 pointer for ut_free(). */
4818  combined_buf2 = static_cast<unsigned char *>(ut_malloc(total_len + UNIV_PAGE_SIZE));
4819 
4820  ut_a(combined_buf2);
4821 
4822  combined_buf = static_cast<unsigned char *>(ut_align(combined_buf2, UNIV_PAGE_SIZE));
4823  }
4824 
4825  /* We release the array mutex for the time of the i/o: NOTE that
4826  this assumes that there is just one i/o-handler thread serving
4827  a single segment of slots! */
4828 
4829  os_mutex_exit(array->mutex);
4830 
4831  if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4832  /* Copy the buffers to the combined buffer */
4833  offs = 0;
4834 
4835  for (i = 0; i < n_consecutive; i++) {
4836 
4837  ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4838  consecutive_ios[i]->len);
4839  offs += consecutive_ios[i]->len;
4840  }
4841  }
4842 
4843  srv_set_io_thread_op_info(global_segment, "doing file i/o");
4844 
4845  if (os_aio_print_debug) {
4846  fprintf(stderr,
4847  "InnoDB: doing i/o of type %lu at offset %lu %lu,"
4848  " length %lu\n",
4849  (ulong) slot->type, (ulong) slot->offset_high,
4850  (ulong) slot->offset, (ulong) total_len);
4851  }
4852 
4853  /* Do the i/o with ordinary, synchronous i/o functions: */
4854  if (slot->type == OS_FILE_WRITE) {
4855  ret = os_file_write(slot->name, slot->file, combined_buf,
4856  slot->offset, slot->offset_high,
4857  total_len);
4858  } else {
4859  ret = os_file_read(slot->file, combined_buf,
4860  slot->offset, slot->offset_high, total_len);
4861  }
4862 
 /* A failed synchronous read or write is treated as fatal here. */
4863  ut_a(ret);
4864  srv_set_io_thread_op_info(global_segment, "file i/o done");
4865 
4866 #if 0
4867  fprintf(stderr,
4868  "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4869  n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4870 #endif
4871 
4872  if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4873  /* Copy the combined buffer to individual buffers */
4874  offs = 0;
4875 
4876  for (i = 0; i < n_consecutive; i++) {
4877 
4878  ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4879  consecutive_ios[i]->len);
4880  offs += consecutive_ios[i]->len;
4881  }
4882  }
4883 
4884  if (combined_buf2) {
4885  ut_free(combined_buf2);
4886  }
4887 
4888  os_mutex_enter(array->mutex);
4889 
4890  /* Mark the i/os done in slots */
4891 
4892  for (i = 0; i < n_consecutive; i++) {
4893  consecutive_ios[i]->io_already_done = TRUE;
4894  }
4895 
4896  /* We return the messages for the first slot now, and if there were
4897  several slots, the messages will be returned with subsequent calls
4898  of this function */
4899 
 /* Reached with array->mutex held, either by falling through from
 above or by the jump from the already-done scan. */
4900 slot_io_done:
4901 
4902  ut_a(slot->reserved);
4903 
4904  *message1 = slot->message1;
4905  *message2 = slot->message2;
4906 
4907  *type = slot->type;
4908 
4909  os_mutex_exit(array->mutex);
4910 
4911  os_aio_array_free_slot(array, slot);
4912 
4913  return(ret);
4914 
4915 wait_for_io:
4916  srv_set_io_thread_op_info(global_segment, "resetting wait event");
4917 
4918  /* We wait here until there again can be i/os in the segment
4919  of this thread */
4920 
 /* The event is reset while array->mutex is still held, before the
 mutex is released and the wait begins. */
4921  os_event_reset(os_aio_segment_wait_events[global_segment]);
4922 
4923  os_mutex_exit(array->mutex);
4924 
 /* Entered without holding array->mutex, from above or from the early
 jump near the top of the function. */
4925 recommended_sleep:
4926  srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4927 
4928  os_event_wait(os_aio_segment_wait_events[global_segment]);
4929 
4930  if (os_aio_print_debug) {
4931  fprintf(stderr,
4932  "InnoDB: i/o handler thread for i/o"
4933  " segment %lu wakes up\n",
4934  (ulong) global_segment);
4935  }
4936 
4937  goto restart;
4938 }
4939 
4940 /**********************************************************************/
4943 static
4944 ibool
4945 os_aio_array_validate(
4946 /*==================*/
4947  os_aio_array_t* array)
4948 {
4949  os_aio_slot_t* slot;
4950  ulint n_reserved = 0;
4951  ulint i;
4952 
4953  ut_a(array);
4954 
4955  os_mutex_enter(array->mutex);
4956 
4957  ut_a(array->n_slots > 0);
4958  ut_a(array->n_segments > 0);
4959 
4960  for (i = 0; i < array->n_slots; i++) {
4961  slot = os_aio_array_get_nth_slot(array, i);
4962 
4963  if (slot->reserved) {
4964  n_reserved++;
4965  ut_a(slot->len > 0);
4966  }
4967  }
4968 
4969  ut_a(array->n_reserved == n_reserved);
4970 
4971  os_mutex_exit(array->mutex);
4972 
4973  return(TRUE);
4974 }
4975 
4976 /**********************************************************************/
/* Runs os_aio_array_validate() on each of the five aio arrays (read,
write, ibuf, log, sync). The individual return values are not inspected.
NOTE(review): the line carrying the function name and parameter list is
absent from this listing (nothing appears between the return type and the
opening brace); restore it from the upstream file.
Returns TRUE unconditionally. */
4979 UNIV_INTERN
4980 ibool
4982 /*=================*/
4983 {
4984  os_aio_array_validate(os_aio_read_array);
4985  os_aio_array_validate(os_aio_write_array);
4986  os_aio_array_validate(os_aio_ibuf_array);
4987  os_aio_array_validate(os_aio_log_array);
4988  os_aio_array_validate(os_aio_sync_array);
4989 
4990  return(TRUE);
4991 }
4992 
4993 /**********************************************************************/
4998 static
4999 void
5000 os_aio_print_segment_info(
5001 /*======================*/
5002  FILE* file,
5003  ulint* n_seg,
5004  os_aio_array_t* array)
5005 {
5006  ulint i;
5007 
5008  ut_ad(array);
5009  ut_ad(n_seg);
5010  ut_ad(array->n_segments > 0);
5011 
5012  if (array->n_segments == 1) {
5013  return;
5014  }
5015 
5016  fprintf(file, " [");
5017  for (i = 0; i < array->n_segments; i++) {
5018  if (i != 0) {
5019  fprintf(file, ", ");
5020  }
5021 
5022  fprintf(file, "%lu", n_seg[i]);
5023  }
5024  fprintf(file, "] ");
5025 }
5026 
5027 /**********************************************************************/
/* Prints aio and file i/o statistics to a stream: one state line per i/o
thread, the number of reserved slots in each of the five aio arrays (with
a per-segment breakdown for arrays that have more than one segment),
pending flush counts, cumulative read/write/fsync counts and per-second
rates since the previous printout. The "old" counters,
os_bytes_read_since_printout and os_last_printout are reset at the end, so
each call reports rates relative to the previous one.
NOTE(review): the line carrying the function name is absent from this
listing (nothing appears between the return type and the parameter list);
restore it from the upstream file. */
5029 UNIV_INTERN
5030 void
5032 /*=========*/
5033  FILE* file) /* in: stream to print to */
5034 {
5035  os_aio_array_t* array;
5036  os_aio_slot_t* slot;
5037  ulint n_reserved;
5038  ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5039  time_t current_time;
5040  double time_elapsed;
5041  double avg_bytes_read;
5042  ulint i;
5043 
5044  for (i = 0; i < srv_n_file_io_threads; i++) {
5045  fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
5046  srv_io_thread_op_info[i],
5047  srv_io_thread_function[i]);
5048 
5049 #ifndef __WIN__
5050  if (os_aio_segment_wait_events[i]->is_set) {
5051  fprintf(file, " ev set");
5052  }
5053 #endif
5054 
5055  fprintf(file, "\n");
5056  }
5057 
5058  fputs("Pending normal aio reads:", file);
5059 
5060  array = os_aio_read_array;
 /* The section from here to the chain of "goto loop" statements is
 executed once per array, in the order read, write, ibuf, log, sync. */
5061 loop:
5062  ut_a(array);
5063 
5064  os_mutex_enter(array->mutex);
5065 
5066  ut_a(array->n_slots > 0);
5067  ut_a(array->n_segments > 0);
5068 
5069  n_reserved = 0;
5070 
5071  memset(n_res_seg, 0x0, sizeof(n_res_seg));
5072 
5073  for (i = 0; i < array->n_slots; i++) {
5074  ulint seg_no;
5075 
5076  slot = os_aio_array_get_nth_slot(array, i);
5077 
 /* Segment that slot i falls into when the slots are divided
 evenly over the segments. */
5078  seg_no = (i * array->n_segments) / array->n_slots;
5079  if (slot->reserved) {
5080  n_reserved++;
5081  n_res_seg[seg_no]++;
5082 #if 0
5083  fprintf(stderr, "Reserved slot, messages %p %p\n",
5084  (void*) slot->message1,
5085  (void*) slot->message2);
5086 #endif
5087  ut_a(slot->len > 0);
5088  }
5089  }
5090 
5091  ut_a(array->n_reserved == n_reserved);
5092 
5093  fprintf(file, " %lu", (ulong) n_reserved);
5094 
5095  os_aio_print_segment_info(file, n_res_seg, array);
5096 
5097  os_mutex_exit(array->mutex);
5098 
5099  if (array == os_aio_read_array) {
5100  fputs(", aio writes:", file);
5101 
5102  array = os_aio_write_array;
5103 
5104  goto loop;
5105  }
5106 
5107  if (array == os_aio_write_array) {
5108  fputs(",\n ibuf aio reads:", file);
5109  array = os_aio_ibuf_array;
5110 
5111  goto loop;
5112  }
5113 
5114  if (array == os_aio_ibuf_array) {
5115  fputs(", log i/o's:", file);
5116  array = os_aio_log_array;
5117 
5118  goto loop;
5119  }
5120 
5121  if (array == os_aio_log_array) {
5122  fputs(", sync i/o's:", file);
5123  array = os_aio_sync_array;
5124 
5125  goto loop;
5126  }
5127 
5128  putc('\n', file);
5129  current_time = time(NULL);
 /* The added 0.001 keeps the divisions below away from zero when two
 printouts happen within the same second. */
5130  time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5131 
5132  fprintf(file,
5133  "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5134  "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5135  (ulong) fil_n_pending_log_flushes,
5136  (ulong) fil_n_pending_tablespace_flushes,
5137  (ulong) os_n_file_reads, (ulong) os_n_file_writes,
5138  (ulong) os_n_fsyncs);
5139 
 /* NOTE(review): the closing brace after the fprintf() below has no
 matching opener in this listing; the condition that guards this
 printout appears to have been lost -- restore it from the upstream
 file. */
5141  fprintf(file,
5142  "%lu pending preads, %lu pending pwrites\n",
5143  (ulong) os_file_n_pending_preads,
5144  (ulong) os_file_n_pending_pwrites);
5145  }
5146 
 /* Avoid dividing by zero when no reads happened since the previous
 printout. */
5147  if (os_n_file_reads == os_n_file_reads_old) {
5148  avg_bytes_read = 0.0;
5149  } else {
5150  avg_bytes_read = (double) os_bytes_read_since_printout
5151  / (os_n_file_reads - os_n_file_reads_old);
5152  }
5153 
5154  fprintf(file,
5155  "%.2f reads/s, %lu avg bytes/read,"
5156  " %.2f writes/s, %.2f fsyncs/s\n",
5157  (os_n_file_reads - os_n_file_reads_old)
5158  / time_elapsed,
5159  (ulong)avg_bytes_read,
5160  (os_n_file_writes - os_n_file_writes_old)
5161  / time_elapsed,
5162  (os_n_fsyncs - os_n_fsyncs_old)
5163  / time_elapsed);
5164 
 /* Remember the current totals as the baseline for the next printout. */
5165  os_n_file_reads_old = os_n_file_reads;
5166  os_n_file_writes_old = os_n_file_writes;
5167  os_n_fsyncs_old = os_n_fsyncs;
5168  os_bytes_read_since_printout = 0;
5169 
5170  os_last_printout = current_time;
5171 }
5172 
5173 /**********************************************************************/
/* Resets the baseline used for per-interval i/o statistics: copies the
current read, write and fsync totals into their "_old" counterparts, zeroes
os_bytes_read_since_printout and sets os_last_printout to the current
time.
NOTE(review): the line carrying the function name and parameter list is
absent from this listing (nothing appears between the return type and the
opening brace); restore it from the upstream file. */
5175 UNIV_INTERN
5176 void
5178 /*======================*/
5179 {
5180  os_n_file_reads_old = os_n_file_reads;
5181  os_n_file_writes_old = os_n_file_writes;
5182  os_n_fsyncs_old = os_n_fsyncs;
5183  os_bytes_read_since_printout = 0;
5184 
5185  os_last_printout = time(NULL);
5186 }
5187 
5188 #ifdef UNIV_DEBUG
5189 /**********************************************************************/
5193 UNIV_INTERN
5194 ibool
5195 os_aio_all_slots_free(void)
5196 /*=======================*/
5197 {
5198  os_aio_array_t* array;
5199  ulint n_res = 0;
5200 
5201  array = os_aio_read_array;
5202 
5203  os_mutex_enter(array->mutex);
5204 
5205  n_res += array->n_reserved;
5206 
5207  os_mutex_exit(array->mutex);
5208 
5209  array = os_aio_write_array;
5210 
5211  os_mutex_enter(array->mutex);
5212 
5213  n_res += array->n_reserved;
5214 
5215  os_mutex_exit(array->mutex);
5216 
5217  array = os_aio_ibuf_array;
5218 
5219  os_mutex_enter(array->mutex);
5220 
5221  n_res += array->n_reserved;
5222 
5223  os_mutex_exit(array->mutex);
5224 
5225  array = os_aio_log_array;
5226 
5227  os_mutex_enter(array->mutex);
5228 
5229  n_res += array->n_reserved;
5230 
5231  os_mutex_exit(array->mutex);
5232 
5233  array = os_aio_sync_array;
5234 
5235  os_mutex_enter(array->mutex);
5236 
5237  n_res += array->n_reserved;
5238 
5239  os_mutex_exit(array->mutex);
5240 
5241  if (n_res == 0) {
5242 
5243  return(TRUE);
5244  }
5245 
5246  return(FALSE);
5247 }
5248 #endif /* UNIV_DEBUG */
5249 
5250 #endif /* !UNIV_HOTBACKUP */