18 #ifndef __deal2__vectorization_h
19 #define __deal2__vectorization_h
21 #include <deal.II/base/config.h>
22 #include <deal.II/base/std_cxx1x/type_traits.h>
24 #include <deal.II/base/memory_consumption.h>
25 #include <deal.II/base/parallel.h>
43 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL == 2 // AVX
44 #include <immintrin.h>
45 #include <mm_malloc.h>
46 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2
47 #include <emmintrin.h>
48 #include <mm_malloc.h>
61 sqrt(const ::VectorizedArray<Number> &);
63 abs(const ::VectorizedArray<Number> &);
65 max(const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
67 min (const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
77 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL == 2 && defined(__AVX__)
99 data = _mm256_set1_pd(x);
110 return *(
reinterpret_cast<double *
>(&
data)+comp);
120 return *(
reinterpret_cast<const double *
>(&
data)+comp);
134 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
148 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
161 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
175 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
199 res.
data = _mm256_sqrt_pd(data);
215 __m256d mask = _mm256_set1_pd (-0.);
217 res.
data = _mm256_andnot_pd(mask, data);
230 res.
data = _mm256_max_pd (data, other.
data);
243 res.
data = _mm256_min_pd (data, other.
data);
282 data = _mm256_set1_ps(x);
293 return *(
reinterpret_cast<float *
>(&
data)+comp);
303 return *(
reinterpret_cast<const float *
>(&
data)+comp);
312 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
326 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
339 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
353 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
378 res.
data = _mm256_sqrt_ps(data);
393 __m256 mask = _mm256_set1_ps (-0.f);
395 res.
data = _mm256_andnot_ps(mask, data);
408 res.
data = _mm256_max_ps (data, other.
data);
421 res.
data = _mm256_min_ps (data, other.
data);
443 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
468 data = _mm_set1_pd(x);
479 return *(
reinterpret_cast<double *
>(&
data)+comp);
489 return *(
reinterpret_cast<const double *
>(&
data)+comp);
498 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
512 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
525 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
539 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
563 res.
data = _mm_sqrt_pd(data);
579 __m128d mask = _mm_set1_pd (-0.);
581 res.
data = _mm_andnot_pd(mask, data);
594 res.
data = _mm_max_pd (data, other.
data);
607 res.
data = _mm_min_pd (data, other.
data);
647 data = _mm_set1_ps(x);
658 return *(
reinterpret_cast<float *
>(&
data)+comp);
668 return *(
reinterpret_cast<const float *
>(&
data)+comp);
677 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
691 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
704 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
718 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
742 res.
data = _mm_sqrt_ps(data);
758 __m128 mask = _mm_set1_ps (-0.f);
760 res.
data = _mm_andnot_ps(mask, data);
773 res.
data = _mm_max_ps (data, other.
data);
786 res.
data = _mm_min_ps (data, other.
data);
804 #endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
856 template <
typename Number>
961 res.
data = std::sqrt(data);
974 res.
data = std::fabs(data);
987 res.
data = std::max (data, other.
data);
1000 res.
data = std::min (data, other.
data);
1023 template <
typename Number>
1038 template <
typename Number>
1053 template <
typename Number>
1068 template <
typename Number>
1083 template <
typename Number>
1099 template <
typename Number>
1134 template <
typename Number>
1165 template <
typename Number>
1200 template <
typename Number>
1235 template <
typename Number>
1270 template <
typename Number>
1301 template <
typename Number>
1336 template <
typename Number>
1370 template <
typename Number>
1383 template <
typename Number>
1417 template <
typename T>
1420 static const std::size_t minimum_parallel_grain_size = 160000/
sizeof(T)+1;
1434 bool copy_only =
false)
1436 source_ (source_begin),
1437 destination_ (destination),
1438 copy_only_ (copy_only)
1441 const std::size_t size = source_end - source_begin;
1442 if (size < minimum_parallel_grain_size)
1454 const std::size_t end)
const
1458 if (std_cxx1x::is_trivial<T>::value ==
true)
1459 std::memcpy (destination_+begin, source_+begin, (end-begin)*
sizeof(T));
1460 else if (copy_only_ ==
false)
1461 for (std::size_t i=begin; i<end; ++i)
1464 new (&destination_[i]) T;
1465 destination_[i] = source_[i];
1469 for (std::size_t i=begin; i<end; ++i)
1471 new (&destination_[i]) T;
1472 destination_[i] = source_[i];
1479 const bool copy_only_;
1487 template <
typename T>
1490 static const std::size_t minimum_parallel_grain_size = 160000/
sizeof(T)+1;
1502 destination_ (destination),
1503 trivial_element (false)
1508 if (std_cxx1x::is_trivial<T>::value ==
true)
1510 const unsigned char zero [
sizeof(T)] = {};
1511 if (std::memcmp(zero, &element,
sizeof(T)) == 0)
1512 trivial_element =
true;
1514 if (size < minimum_parallel_grain_size)
1527 const std::size_t end)
const
1531 if (std_cxx1x::is_trivial<T>::value ==
true && trivial_element)
1532 std::memset (destination_+begin, 0, (end-begin)*
sizeof(T));
1534 for (std::size_t i=begin; i<end; ++i)
1537 new (&destination_[i]) T;
1538 destination_[i] = element_;
1543 mutable T *destination_;
1544 bool trivial_element;
1562 template <
class T >
1579 typedef std::size_t size_type;
1661 const T &init = T())
1663 const size_type old_size =
size();
1664 if (std_cxx1x::is_trivial<T>::value ==
false && size_in < old_size)
1674 if (size_in > old_size)
1695 if (size_alloc > allocated_size)
1701 size_type new_size = size_alloc;
1702 if (size_alloc < (2 * allocated_size))
1703 new_size = 2 * allocated_size;
1705 const size_type size_actual_allocate = new_size *
sizeof(T);
1707 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
1712 T *new_data =
static_cast<T *
>(_mm_malloc (size_actual_allocate,
1715 T *new_data =
static_cast<T *
>(malloc (size_actual_allocate));
1718 throw std::bad_alloc();
1724 std::swap (_data, new_data);
1731 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
1738 else if (size_alloc == 0)
1752 if (std_cxx1x::is_trivial<T>::value ==
false)
1756 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
1779 if (std_cxx1x::is_trivial<T>::value ==
false)
1810 template <
typename ForwardIterator>
1812 ForwardIterator
end)
1814 const unsigned int old_size =
size();
1815 reserve (old_size + (end-begin));
1818 if (std_cxx1x::is_trivial<T>::value ==
false)
1862 return _data[index];
1872 return _data[index];
1919 size_type memory =
sizeof(
this);
1943 DEAL_II_NAMESPACE_CLOSE
1961 template <
typename Number>
1963 ::VectorizedArray<Number>
1964 sin (const ::VectorizedArray<Number> &x)
1967 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
1968 sin_val[i] = std::sin(x[i]);
1981 template <
typename Number>
1983 ::VectorizedArray<Number>
1984 tan (const ::VectorizedArray<Number> &x)
1987 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
1988 tan_val[i] = std::tan(x[i]);
2000 template <
typename Number>
2002 ::VectorizedArray<Number>
2003 cos (const ::VectorizedArray<Number> &x)
2006 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2007 cos_val[i] = std::cos(x[i]);
2019 template <
typename Number>
2021 ::VectorizedArray<Number>
2022 exp (const ::VectorizedArray<Number> &x)
2025 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2026 exp_val[i] = std::exp(x[i]);
2038 template <
typename Number>
2040 ::VectorizedArray<Number>
2041 log (const ::VectorizedArray<Number> &x)
2044 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2045 log_val[i] = std::log(x[i]);
2058 template <
typename Number>
2060 ::VectorizedArray<Number>
2061 sqrt (const ::VectorizedArray<Number> &x)
2063 return x.get_sqrt();
2075 template <
typename Number>
2077 ::VectorizedArray<Number>
2078 abs (const ::VectorizedArray<Number> &x)
2092 template <
typename Number>
2094 ::VectorizedArray<Number>
2095 max (const ::VectorizedArray<Number> &x,
2096 const ::VectorizedArray<Number> &y)
2098 return x.get_max(y);
2110 template <
typename Number>
2112 ::VectorizedArray<Number>
2113 min (const ::VectorizedArray<Number> &x,
2114 const ::VectorizedArray<Number> &y)
2116 return x.get_min(y);
VectorizedArray< Number > operator/(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
VectorizedArray< Number > operator+(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
void apply_parallel(const std::size_t begin, const std::size_t end, const std::size_t minimum_parallel_grain_size) const
VectorizedArray< Number > operator-(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
VectorizedArray< Number > log(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > operator*(const VectorizedArray< Number > &u, const VectorizedArray< Number > &v)
virtual void apply_to_subrange(const std::size_t begin, const std::size_t end) const
size_type capacity() const
VectorizedArray< Number > make_vectorized_array(const Number &u)
VectorizedArray< Number > tan(const ::VectorizedArray< Number > &x)
#define AssertIndexRange(index, range)
size_type memory_consumption() const
AlignedVector & operator=(const AlignedVector< T > &vec)
void reserve(const size_type size_alloc)
VectorizedArray< Number > exp(const ::VectorizedArray< Number > &x)
VectorizedArray get_sqrt() const
void push_back(const T in_data)
virtual void apply_to_subrange(const std::size_t begin, const std::size_t end) const
AlignedVector(const size_type size)
reference operator[](const size_type index)
VectorizedArray & operator+=(const VectorizedArray< Number > &vec)
VectorizedArray get_min(const VectorizedArray &other) const
Number & operator[](const unsigned int comp)
VectorizedArray get_abs() const
static const unsigned int n_array_elements
const_reference back() const
VectorizedArray< Number > min(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
#define Assert(cond, exc)
void resize(const size_type size_in, const T &init=T())
void insert_back(ForwardIterator begin, ForwardIterator end)
VectorizedArray & operator-=(const VectorizedArray< Number > &vec)
void swap(AlignedVector< T > &vec)
VectorizedArray< Number > sqrt(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > sin(const ::VectorizedArray< Number > &x)
void resize_fast(const size_type size)
VectorizedArray get_max(const VectorizedArray &other) const
VectorizedArray & operator/=(const VectorizedArray< Number > &vec)
const_iterator end() const
AlignedVectorMove(T *source_begin, T *source_end, T *destination, bool copy_only=false)
const_iterator begin() const
VectorizedArray & operator*=(const VectorizedArray< Number > &vec)
AlignedVector(const AlignedVector< T > &vec)
VectorizedArray & operator=(const Number scalar)
::ExceptionBase & ExcInternalError()
VectorizedArray< Number > abs(const ::VectorizedArray< Number > &x)
AlignedVectorSet(const std::size_t size, const T &element, T *destination)
VectorizedArray< Number > max(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
VectorizedArray< Number > cos(const ::VectorizedArray< Number > &x)