Eigen  3.2.91
TriangularMatrixMatrix.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
11 #define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
12 
13 namespace Eigen {
14 
15 namespace internal {
16 
17 // template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
18 // struct gemm_pack_lhs_triangular
19 // {
20 // Matrix<Scalar,mr,mr,
21 // void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
22 // {
23 // conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
24 // const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
25 // int count = 0;
26 // const int peeled_mc = (rows/mr)*mr;
27 // for(int i=0; i<peeled_mc; i+=mr)
28 // {
29 // for(int k=0; k<depth; k++)
30 // for(int w=0; w<mr; w++)
31 // blockA[count++] = cj(lhs(i+w, k));
32 // }
33 // for(int i=peeled_mc; i<rows; i++)
34 // {
35 // for(int k=0; k<depth; k++)
36 // blockA[count++] = cj(lhs(i, k));
37 // }
38 // }
39 // };
40 
41 /* Optimized triangular matrix * matrix (_TRMM++) product built on top of
42  * the general matrix matrix product.
43  */
44 template <typename Scalar, typename Index,
45  int Mode, bool LhsIsTriangular,
46  int LhsStorageOrder, bool ConjugateLhs,
47  int RhsStorageOrder, bool ConjugateRhs,
48  int ResStorageOrder, int Version = Specialized>
49 struct product_triangular_matrix_matrix;
50 
51 template <typename Scalar, typename Index,
52  int Mode, bool LhsIsTriangular,
53  int LhsStorageOrder, bool ConjugateLhs,
54  int RhsStorageOrder, bool ConjugateRhs, int Version>
55 struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
56  LhsStorageOrder,ConjugateLhs,
57  RhsStorageOrder,ConjugateRhs,RowMajor,Version>
58 {
59  static EIGEN_STRONG_INLINE void run(
60  Index rows, Index cols, Index depth,
61  const Scalar* lhs, Index lhsStride,
62  const Scalar* rhs, Index rhsStride,
63  Scalar* res, Index resStride,
64  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
65  {
66  product_triangular_matrix_matrix<Scalar, Index,
67  (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
68  (!LhsIsTriangular),
69  RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
70  ConjugateRhs,
71  LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
72  ConjugateLhs,
73  ColMajor>
74  ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
75  }
76 };
77 
78 // implements col-major += alpha * op(triangular) * op(general)
79 template <typename Scalar, typename Index, int Mode,
80  int LhsStorageOrder, bool ConjugateLhs,
81  int RhsStorageOrder, bool ConjugateRhs, int Version>
82 struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
83  LhsStorageOrder,ConjugateLhs,
84  RhsStorageOrder,ConjugateRhs,ColMajor,Version>
85 {
86 
87  typedef gebp_traits<Scalar,Scalar> Traits;
88  enum {
89  SmallPanelWidth = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
90  IsLower = (Mode&Lower) == Lower,
91  SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
92  };
93 
94  static EIGEN_DONT_INLINE void run(
95  Index _rows, Index _cols, Index _depth,
96  const Scalar* _lhs, Index lhsStride,
97  const Scalar* _rhs, Index rhsStride,
98  Scalar* res, Index resStride,
99  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
100 };
101 
102 template <typename Scalar, typename Index, int Mode,
103  int LhsStorageOrder, bool ConjugateLhs,
104  int RhsStorageOrder, bool ConjugateRhs, int Version>
105 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
106  LhsStorageOrder,ConjugateLhs,
107  RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
108  Index _rows, Index _cols, Index _depth,
109  const Scalar* _lhs, Index lhsStride,
110  const Scalar* _rhs, Index rhsStride,
111  Scalar* _res, Index resStride,
112  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
113  {
114  // strip zeros
115  Index diagSize = (std::min)(_rows,_depth);
116  Index rows = IsLower ? _rows : diagSize;
117  Index depth = IsLower ? diagSize : _depth;
118  Index cols = _cols;
119 
120  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
121  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
122  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
123  LhsMapper lhs(_lhs,lhsStride);
124  RhsMapper rhs(_rhs,rhsStride);
125  ResMapper res(_res, resStride);
126 
127  Index kc = blocking.kc(); // cache block size along the K direction
128  Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
129 
130  std::size_t sizeA = kc*mc;
131  std::size_t sizeB = kc*cols;
132 
133  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
134  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
135 
136  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
137  triangularBuffer.setZero();
138  if((Mode&ZeroDiag)==ZeroDiag)
139  triangularBuffer.diagonal().setZero();
140  else
141  triangularBuffer.diagonal().setOnes();
142 
143  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
144  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
145  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
146 
147  for(Index k2=IsLower ? depth : 0;
148  IsLower ? k2>0 : k2<depth;
149  IsLower ? k2-=kc : k2+=kc)
150  {
151  Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
152  Index actual_k2 = IsLower ? k2-actual_kc : k2;
153 
154  // align blocks with the end of the triangular part for trapezoidal lhs
155  if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
156  {
157  actual_kc = rows-k2;
158  k2 = k2+actual_kc-kc;
159  }
160 
161  pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
162 
163  // the selected lhs's panel has to be split in three different parts:
164  // 1 - the part which is zero => skip it
165  // 2 - the diagonal block => special kernel
166  // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
167 
168  // the block diagonal, if any:
169  if(IsLower || actual_k2<rows)
170  {
171  // for each small vertical panels of lhs
172  for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
173  {
174  Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
175  Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
176  Index startBlock = actual_k2+k1;
177  Index blockBOffset = k1;
178 
179  // => GEBP with the micro triangular block
180  // The trick is to pack this micro block while filling the opposite triangular part with zeros.
181  // To this end we do an extra triangular copy to a small temporary buffer
182  for (Index k=0;k<actualPanelWidth;++k)
183  {
184  if (SetDiag)
185  triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
186  for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
187  triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
188  }
189  pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
190 
191  gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
192  actualPanelWidth, actualPanelWidth, cols, alpha,
193  actualPanelWidth, actual_kc, 0, blockBOffset);
194 
195  // GEBP with remaining micro panel
196  if (lengthTarget>0)
197  {
198  Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
199 
200  pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
201 
202  gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
203  lengthTarget, actualPanelWidth, cols, alpha,
204  actualPanelWidth, actual_kc, 0, blockBOffset);
205  }
206  }
207  }
208  // the part below (lower case) or above (upper case) the diagonal => GEPP
209  {
210  Index start = IsLower ? k2 : 0;
211  Index end = IsLower ? rows : (std::min)(actual_k2,rows);
212  for(Index i2=start; i2<end; i2+=mc)
213  {
214  const Index actual_mc = (std::min)(i2+mc,end)-i2;
215  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
216  (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
217 
218  gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
219  actual_kc, cols, alpha, -1, -1, 0, 0);
220  }
221  }
222  }
223  }
224 
225 // implements col-major += alpha * op(general) * op(triangular)
226 template <typename Scalar, typename Index, int Mode,
227  int LhsStorageOrder, bool ConjugateLhs,
228  int RhsStorageOrder, bool ConjugateRhs, int Version>
229 struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
230  LhsStorageOrder,ConjugateLhs,
231  RhsStorageOrder,ConjugateRhs,ColMajor,Version>
232 {
233  typedef gebp_traits<Scalar,Scalar> Traits;
234  enum {
235  SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
236  IsLower = (Mode&Lower) == Lower,
237  SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
238  };
239 
240  static EIGEN_DONT_INLINE void run(
241  Index _rows, Index _cols, Index _depth,
242  const Scalar* _lhs, Index lhsStride,
243  const Scalar* _rhs, Index rhsStride,
244  Scalar* res, Index resStride,
245  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
246 };
247 
248 template <typename Scalar, typename Index, int Mode,
249  int LhsStorageOrder, bool ConjugateLhs,
250  int RhsStorageOrder, bool ConjugateRhs, int Version>
251 EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
252  LhsStorageOrder,ConjugateLhs,
253  RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
254  Index _rows, Index _cols, Index _depth,
255  const Scalar* _lhs, Index lhsStride,
256  const Scalar* _rhs, Index rhsStride,
257  Scalar* _res, Index resStride,
258  const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
259  {
260  const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
261  // strip zeros
262  Index diagSize = (std::min)(_cols,_depth);
263  Index rows = _rows;
264  Index depth = IsLower ? _depth : diagSize;
265  Index cols = IsLower ? diagSize : _cols;
266 
267  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
268  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
269  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
270  LhsMapper lhs(_lhs,lhsStride);
271  RhsMapper rhs(_rhs,rhsStride);
272  ResMapper res(_res, resStride);
273 
274  Index kc = blocking.kc(); // cache block size along the K direction
275  Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
276 
277  std::size_t sizeA = kc*mc;
278  std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
279 
280  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
281  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
282 
283  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
284  triangularBuffer.setZero();
285  if((Mode&ZeroDiag)==ZeroDiag)
286  triangularBuffer.diagonal().setZero();
287  else
288  triangularBuffer.diagonal().setOnes();
289 
290  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
291  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
292  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
293  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
294 
295  for(Index k2=IsLower ? 0 : depth;
296  IsLower ? k2<depth : k2>0;
297  IsLower ? k2+=kc : k2-=kc)
298  {
299  Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
300  Index actual_k2 = IsLower ? k2 : k2-actual_kc;
301 
302  // align blocks with the end of the triangular part for trapezoidal rhs
303  if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
304  {
305  actual_kc = cols-k2;
306  k2 = actual_k2 + actual_kc - kc;
307  }
308 
309  // remaining size
310  Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
311  // size of the triangular part
312  Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
313 
314  Scalar* geb = blockB+ts*ts;
315  geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
316 
317  pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
318 
319  // pack the triangular part of the rhs padding the unrolled blocks with zeros
320  if(ts>0)
321  {
322  for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
323  {
324  Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
325  Index actual_j2 = actual_k2 + j2;
326  Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
327  Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
328  // general part
329  pack_rhs_panel(blockB+j2*actual_kc,
330  rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
331  panelLength, actualPanelWidth,
332  actual_kc, panelOffset);
333 
334  // append the triangular part via a temporary buffer
335  for (Index j=0;j<actualPanelWidth;++j)
336  {
337  if (SetDiag)
338  triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
339  for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
340  triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
341  }
342 
343  pack_rhs_panel(blockB+j2*actual_kc,
344  RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
345  actualPanelWidth, actualPanelWidth,
346  actual_kc, j2);
347  }
348  }
349 
350  for (Index i2=0; i2<rows; i2+=mc)
351  {
352  const Index actual_mc = (std::min)(mc,rows-i2);
353  pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
354 
355  // triangular kernel
356  if(ts>0)
357  {
358  for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
359  {
360  Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
361  Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
362  Index blockOffset = IsLower ? j2 : 0;
363 
364  gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
365  blockA, blockB+j2*actual_kc,
366  actual_mc, panelLength, actualPanelWidth,
367  alpha,
368  actual_kc, actual_kc, // strides
369  blockOffset, blockOffset);// offsets
370  }
371  }
372  gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
373  blockA, geb, actual_mc, actual_kc, rs,
374  alpha,
375  -1, -1, 0, 0);
376  }
377  }
378  }
379 
380 /***************************************************************************
381 * Wrapper to product_triangular_matrix_matrix
382 ***************************************************************************/
383 
384 } // end namespace internal
385 
386 namespace internal {
387 template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
388 struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
389 {
390  template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
391  {
392  typedef typename Dest::Scalar Scalar;
393 
394  typedef internal::blas_traits<Lhs> LhsBlasTraits;
395  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
396  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
397  typedef internal::blas_traits<Rhs> RhsBlasTraits;
398  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
399  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
400 
401  typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
402  typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
403 
404  Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
405  * RhsBlasTraits::extractScalarFactor(a_rhs);
406 
407  typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
408  Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
409 
410  enum { IsLower = (Mode&Lower) == Lower };
411  Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
412  Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
413  Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
414  : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
415 
416  BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
417 
418  internal::product_triangular_matrix_matrix<Scalar, Index,
419  Mode, LhsIsTriangular,
420  (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
421  (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
422  (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
423  ::run(
424  stripedRows, stripedCols, stripedDepth, // sizes
425  &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
426  &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
427  &dst.coeffRef(0,0), dst.outerStride(), // result info
428  actualAlpha, blocking
429  );
430  }
431 };
432 
433 } // end namespace internal
434 
435 } // end namespace Eigen
436 
437 #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
Definition: Constants.h:314
Definition: Constants.h:196
Definition: Constants.h:202
Definition: LDLT.h:16
Definition: StdDeque.h:58
const unsigned int RowMajorBit
Definition: Constants.h:53
Definition: Constants.h:198
Definition: Constants.h:200
Definition: Eigen_Colamd.h:54
Definition: Constants.h:312