libstdc++
balanced_quicksort.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 
3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
9 // version.
10 
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file parallel/balanced_quicksort.h
26  * @brief Implementation of a dynamically load-balanced parallel quicksort.
27  *
28  * It works in-place and needs only logarithmic extra memory.
29  * The algorithm is similar to the one proposed in
30  *
31  * P. Tsigas and Y. Zhang.
32  * A simple, fast parallel implementation of quicksort and
33  * its performance evaluation on SUN enterprise 10000.
34  * In 11th Euromicro Conference on Parallel, Distributed and
35  * Network-Based Processing, page 372, 2003.
36  *
37  * This file is a GNU parallel extension to the Standard C++ Library.
38  */
39 
40 // Written by Johannes Singler.
41 
42 #ifndef _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H
43 #define _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H 1
44 
46 #include <bits/stl_algo.h>
47 
48 #include <parallel/settings.h>
49 #include <parallel/partition.h>
50 #include <parallel/random_number.h>
51 #include <parallel/queue.h>
52 #include <functional>
53 
54 #if _GLIBCXX_ASSERTIONS
55 #include <parallel/checkers.h>
56 #endif
57 
58 namespace __gnu_parallel
59 {
60 /** @brief Information local to one thread in the parallel quicksort run. */
61 template<typename RandomAccessIterator>
63  {
65  typedef typename traits_type::difference_type difference_type;
66 
67  /** @brief Continuous part of the sequence, described by an
68  iterator pair. */
70 
71  /** @brief Initial piece to work on. */
73 
74  /** @brief Work-stealing queue. */
76 
77  /** @brief Number of threads involved in this algorithm. */
79 
80  /** @brief Pointer to a counter of elements left over to sort. */
81  volatile difference_type* elements_leftover;
82 
83  /** @brief The complete sequence to sort. */
85 
86  /** @brief Constructor.
87  * @param queue_size Size of the work-stealing queue. */
88  QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
89  };
90 
91 /** @brief Balanced quicksort divide step.
92  * @param begin Begin iterator of subsequence.
93  * @param end End iterator of subsequence.
94  * @param comp Comparator.
95  * @param num_threads Number of threads that are allowed to work on
96  * this part.
97  * @pre @c (end-begin)>=1 */
98 template<typename RandomAccessIterator, typename Comparator>
99  typename std::iterator_traits<RandomAccessIterator>::difference_type
100  qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
101  Comparator comp, thread_index_t num_threads)
102  {
103  _GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
104 
105  typedef std::iterator_traits<RandomAccessIterator> traits_type;
106  typedef typename traits_type::value_type value_type;
107  typedef typename traits_type::difference_type difference_type;
108 
109  RandomAccessIterator pivot_pos =
110  median_of_three_iterators(begin, begin + (end - begin) / 2,
111  end - 1, comp);
112 
113 #if defined(_GLIBCXX_ASSERTIONS)
114  // Must be in between somewhere.
115  difference_type n = end - begin;
116 
117  _GLIBCXX_PARALLEL_ASSERT(
118  (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
119  || (!comp(*pivot_pos, *begin) && !comp(*(end - 1), *pivot_pos))
120  || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
121  || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*(end - 1), *pivot_pos))
122  || (!comp(*pivot_pos, *(end - 1)) && !comp(*begin, *pivot_pos))
123  || (!comp(*pivot_pos, *(end - 1)) && !comp(*(begin + n / 2), *pivot_pos)));
124 #endif
125 
126  // Swap pivot value to end.
127  if (pivot_pos != (end - 1))
128  std::swap(*pivot_pos, *(end - 1));
129  pivot_pos = end - 1;
130 
132  pred(comp, *pivot_pos);
133 
134  // Divide, returning end - begin - 1 in the worst case.
135  difference_type split_pos = parallel_partition(
136  begin, end - 1, pred, num_threads);
137 
138  // Swap back pivot to middle.
139  std::swap(*(begin + split_pos), *pivot_pos);
140  pivot_pos = begin + split_pos;
141 
142 #if _GLIBCXX_ASSERTIONS
143  RandomAccessIterator r;
144  for (r = begin; r != pivot_pos; ++r)
145  _GLIBCXX_PARALLEL_ASSERT(comp(*r, *pivot_pos));
146  for (; r != end; ++r)
147  _GLIBCXX_PARALLEL_ASSERT(!comp(*r, *pivot_pos));
148 #endif
149 
150  return split_pos;
151  }
152 
153 /** @brief Quicksort conquer step.
154  * @param tls Array of thread-local storages.
155  * @param begin Begin iterator of subsequence.
156  * @param end End iterator of subsequence.
157  * @param comp Comparator.
158  * @param iam Number of the thread processing this function.
159  * @param num_threads
160  * Number of threads that are allowed to work on this part. */
161 template<typename RandomAccessIterator, typename Comparator>
162  void
164  RandomAccessIterator begin, RandomAccessIterator end,
165  Comparator comp,
166  thread_index_t iam, thread_index_t num_threads,
167  bool parent_wait)
168  {
169  typedef std::iterator_traits<RandomAccessIterator> traits_type;
170  typedef typename traits_type::value_type value_type;
171  typedef typename traits_type::difference_type difference_type;
172 
173  difference_type n = end - begin;
174 
175  if (num_threads <= 1 || n <= 1)
176  {
177  tls[iam]->initial.first = begin;
178  tls[iam]->initial.second = end;
179 
180  qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
181 
182  return;
183  }
184 
185  // Divide step.
186  difference_type split_pos = qsb_divide(begin, end, comp, num_threads);
187 
188 #if _GLIBCXX_ASSERTIONS
189  _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
190 #endif
191 
192  thread_index_t num_threads_leftside =
193  std::max<thread_index_t>(1, std::min<thread_index_t>(
194  num_threads - 1, split_pos * num_threads / n));
195 
196 # pragma omp atomic
197  *tls[iam]->elements_leftover -= (difference_type)1;
198 
199  // Conquer step.
200 # pragma omp parallel num_threads(2)
201  {
202  bool wait;
203  if(omp_get_num_threads() < 2)
204  wait = false;
205  else
206  wait = parent_wait;
207 
208 # pragma omp sections
209  {
210 # pragma omp section
211  {
212  qsb_conquer(tls, begin, begin + split_pos, comp,
213  iam,
214  num_threads_leftside,
215  wait);
216  wait = parent_wait;
217  }
218  // The pivot_pos is left in place, to ensure termination.
219 # pragma omp section
220  {
221  qsb_conquer(tls, begin + split_pos + 1, end, comp,
222  iam + num_threads_leftside,
223  num_threads - num_threads_leftside,
224  wait);
225  wait = parent_wait;
226  }
227  }
228  }
229  }
230 
231 /**
232  * @brief Quicksort step doing load-balanced local sort.
233  * @param tls Array of thread-local storages.
234  * @param comp Comparator.
235  * @param iam Number of the thread processing this function.
236  */
237 template<typename RandomAccessIterator, typename Comparator>
238  void
240  Comparator& comp, int iam, bool wait)
241  {
242  typedef std::iterator_traits<RandomAccessIterator> traits_type;
243  typedef typename traits_type::value_type value_type;
244  typedef typename traits_type::difference_type difference_type;
246 
247  QSBThreadLocal<RandomAccessIterator>& tl = *tls[iam];
248 
249  difference_type base_case_n =
251  if (base_case_n < 2)
252  base_case_n = 2;
253  thread_index_t num_threads = tl.num_threads;
254 
255  // Every thread has its own random number generator.
256  random_number rng(iam + 1);
257 
258  Piece current = tl.initial;
259 
260  difference_type elements_done = 0;
261 #if _GLIBCXX_ASSERTIONS
262  difference_type total_elements_done = 0;
263 #endif
264 
265  for (;;)
266  {
267  // Invariant: current must be a valid (maybe empty) range.
268  RandomAccessIterator begin = current.first, end = current.second;
269  difference_type n = end - begin;
270 
271  if (n > base_case_n)
272  {
273  // Divide.
274  RandomAccessIterator pivot_pos = begin + rng(n);
275 
276  // Swap pivot_pos value to end.
277  if (pivot_pos != (end - 1))
278  std::swap(*pivot_pos, *(end - 1));
279  pivot_pos = end - 1;
280 
282  <Comparator, value_type, value_type, bool>
283  pred(comp, *pivot_pos);
284 
285  // Divide, leave pivot unchanged in last place.
286  RandomAccessIterator split_pos1, split_pos2;
287  split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
288 
289  // Left side: < pivot_pos; right side: >= pivot_pos.
290 #if _GLIBCXX_ASSERTIONS
291  _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
292 #endif
293  // Swap pivot back to middle.
294  if (split_pos1 != pivot_pos)
295  std::swap(*split_pos1, *pivot_pos);
296  pivot_pos = split_pos1;
297 
298  // In case all elements are equal, split_pos1 == 0.
299  if ((split_pos1 + 1 - begin) < (n >> 7)
300  || (end - split_pos1) < (n >> 7))
301  {
302  // Very unequal split, one part smaller than one 128th
303  // elements not strictly larger than the pivot.
305  <Comparator, value_type, value_type, bool>, value_type>
307  <Comparator, value_type, value_type, bool>(comp,
308  *pivot_pos));
309 
310  // Find other end of pivot-equal range.
311  split_pos2 = __gnu_sequential::partition(split_pos1 + 1,
312  end, pred);
313  }
314  else
315  // Only skip the pivot.
316  split_pos2 = split_pos1 + 1;
317 
318  // Elements equal to pivot are done.
319  elements_done += (split_pos2 - split_pos1);
320 #if _GLIBCXX_ASSERTIONS
321  total_elements_done += (split_pos2 - split_pos1);
322 #endif
323  // Always push larger part onto stack.
324  if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
325  {
326  // Right side larger.
327  if ((split_pos2) != end)
328  tl.leftover_parts.push_front(std::make_pair(split_pos2,
329  end));
330 
331  //current.first = begin; //already set anyway
332  current.second = split_pos1;
333  continue;
334  }
335  else
336  {
337  // Left side larger.
338  if (begin != split_pos1)
339  tl.leftover_parts.push_front(std::make_pair(begin,
340  split_pos1));
341 
342  current.first = split_pos2;
343  //current.second = end; //already set anyway
344  continue;
345  }
346  }
347  else
348  {
349  __gnu_sequential::sort(begin, end, comp);
350  elements_done += n;
351 #if _GLIBCXX_ASSERTIONS
352  total_elements_done += n;
353 #endif
354 
355  // Prefer own stack, small pieces.
356  if (tl.leftover_parts.pop_front(current))
357  continue;
358 
359 # pragma omp atomic
360  *tl.elements_leftover -= elements_done;
361 
362  elements_done = 0;
363 
364 #if _GLIBCXX_ASSERTIONS
365  double search_start = omp_get_wtime();
366 #endif
367 
368  // Look for new work.
369  bool successfully_stolen = false;
370  while (wait && *tl.elements_leftover > 0 && !successfully_stolen
372  // Possible dead-lock.
373  && (omp_get_wtime() < (search_start + 1.0))
374 #endif
375  )
376  {
377  thread_index_t victim;
378  victim = rng(num_threads);
379 
380  // Large pieces.
381  successfully_stolen = (victim != iam)
382  && tls[victim]->leftover_parts.pop_back(current);
383  if (!successfully_stolen)
384  yield();
385 #if !defined(__ICC) && !defined(__ECC)
386 # pragma omp flush
387 #endif
388  }
389 
390 #if _GLIBCXX_ASSERTIONS
391  if (omp_get_wtime() >= (search_start + 1.0))
392  {
393  sleep(1);
394  _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime()
395  < (search_start + 1.0));
396  }
397 #endif
398  if (!successfully_stolen)
399  {
400 #if _GLIBCXX_ASSERTIONS
401  _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
402 #endif
403  return;
404  }
405  }
406  }
407  }
408 
409 /** @brief Top-level quicksort routine.
410  * @param begin Begin iterator of sequence.
411  * @param end End iterator of sequence.
412  * @param comp Comparator.
413  * @param num_threads Number of threads that are allowed to work on
414  * this part.
415  */
416 template<typename RandomAccessIterator, typename Comparator>
417  void
418  parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
419  Comparator comp,
420  thread_index_t num_threads)
421  {
422  _GLIBCXX_CALL(end - begin)
423 
424  typedef std::iterator_traits<RandomAccessIterator> traits_type;
425  typedef typename traits_type::value_type value_type;
426  typedef typename traits_type::difference_type difference_type;
428 
429  typedef QSBThreadLocal<RandomAccessIterator> tls_type;
430 
431  difference_type n = end - begin;
432 
433  if (n <= 1)
434  return;
435 
436  // At least one element per processor.
437  if (num_threads > n)
438  num_threads = static_cast<thread_index_t>(n);
439 
440  // Initialize thread local storage
441  tls_type** tls = new tls_type*[num_threads];
442  difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
443  for (thread_index_t t = 0; t < num_threads; ++t)
444  tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
445 
446  // There can never be more than ceil(log2(n)) ranges on the stack, because
447  // 1. Only one processor pushes onto the stack
448  // 2. The largest range has at most length n
449  // 3. Each range is larger than half of the range remaining
450  volatile difference_type elements_leftover = n;
451  for (int i = 0; i < num_threads; ++i)
452  {
453  tls[i]->elements_leftover = &elements_leftover;
454  tls[i]->num_threads = num_threads;
455  tls[i]->global = std::make_pair(begin, end);
456 
457  // Just in case nothing is left to assign.
458  tls[i]->initial = std::make_pair(end, end);
459  }
460 
461  // Main recursion call.
462  qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
463 
464 #if _GLIBCXX_ASSERTIONS
465  // All stack must be empty.
466  Piece dummy;
467  for (int i = 1; i < num_threads; ++i)
468  _GLIBCXX_PARALLEL_ASSERT(!tls[i]->leftover_parts.pop_back(dummy));
469 #endif
470 
471  for (int i = 0; i < num_threads; ++i)
472  delete tls[i];
473  delete[] tls;
474  }
475 } // namespace __gnu_parallel
476 
477 #endif /* _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H */