1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
19 class gebp_traits;
20 
21 
// Returns the detected cache size 'a' when it is known (i.e. positive), otherwise the compile-time default 'b'.
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
24 {
25  return a<=0 ? b : a;
26 }
27 
28 #if EIGEN_ARCH_i386_OR_x86_64
29 const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30 const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31 const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
32 #else
33 const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34 const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35 const std::ptrdiff_t defaultL3CacheSize = 512*1024;
36 #endif
37 
// Returns (GetAction) or overrides (SetAction) the L1/L2/L3 cache sizes used by the blocking heuristics below.
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
40 {
41  static bool m_cache_sizes_initialized = false;
42  static std::ptrdiff_t m_l1CacheSize = 0;
43  static std::ptrdiff_t m_l2CacheSize = 0;
44  static std::ptrdiff_t m_l3CacheSize = 0;
45 
46  if(!m_cache_sizes_initialized)
47  {
48  int l1CacheSize, l2CacheSize, l3CacheSize;
49  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
50  m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
51  m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
52  m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
53  m_cache_sizes_initialized = true;
54  }
55 
56  if(action==SetAction)
57  {
    // set the cpu cache sizes (in bytes) from which all block sizes are computed
59  eigen_internal_assert(l1!=0 && l2!=0);
60  m_l1CacheSize = *l1;
61  m_l2CacheSize = *l2;
62  m_l3CacheSize = *l3;
63  }
64  else if(action==GetAction)
65  {
66  eigen_internal_assert(l1!=0 && l2!=0);
67  *l1 = m_l1CacheSize;
68  *l2 = m_l2CacheSize;
69  *l3 = m_l3CacheSize;
70  }
71  else
72  {
73  eigen_internal_assert(false);
74  }
75 }
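// Illustrative sketch (not part of Eigen's public API): reading back the cache sizes
// used by the blocking heuristics below. The values fall back to the defaultL*CacheSize
// constants above whenever queryCacheSizes() cannot detect them on the current CPU.
//   std::ptrdiff_t l1, l2, l3;
//   manage_caching_sizes(GetAction, &l1, &l2, &l3);
//   // l1, l2 and l3 now hold the per-level cache sizes in bytes.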
76 
/* Helper for computeProductBlockingSizes.
 *
 * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depend on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register-level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */
88 
89 template<typename LhsScalar, typename RhsScalar, int KcFactor>
90 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
91 {
92  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
93 
  // Explanations:
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
  // in mr x kc small horizontal panels, where mr is the blocking size along the m dimension
  // at the register level. Each such small horizontal panel has to stay within the L1 cache.
99  std::ptrdiff_t l1, l2, l3;
100  manage_caching_sizes(GetAction, &l1, &l2, &l3);
101 
102  if (num_threads > 1) {
103  typedef typename Traits::ResScalar ResScalar;
104  enum {
105  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
106  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
107  k_mask = -8,
108 
109  mr = Traits::mr,
110  mr_mask = -mr,
111 
112  nr = Traits::nr,
113  nr_mask = -nr
114  };
115  // Increasing k gives us more time to prefetch the content of the "C"
116  // registers. However once the latency is hidden there is no point in
117  // increasing the value of k, so we'll cap it at 320 (value determined
118  // experimentally).
119  const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
120  if (k_cache < k) {
121  k = k_cache & k_mask;
122  eigen_internal_assert(k > 0);
123  }
124 
125  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
126  const Index n_per_thread = numext::div_ceil(n, num_threads);
127  if (n_cache <= n_per_thread) {
128  // Don't exceed the capacity of the l2 cache.
129  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
130  n = n_cache & nr_mask;
131  eigen_internal_assert(n > 0);
132  } else {
133  n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
134  }
135 
136  if (l3 > l2) {
137  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
138  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
139  const Index m_per_thread = numext::div_ceil(m, num_threads);
140  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
141  m = m_cache & mr_mask;
142  eigen_internal_assert(m > 0);
143  } else {
144  m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
145  }
146  }
147  }
148  else {
  // In unit tests we do not want to use extra large matrices,
  // so we reduce the cache sizes to check that the blocking strategy is not flawed
151 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
152  l1 = 9*1024;
153  l2 = 32*1024;
154  l3 = 512*1024;
155 #endif
156 
  // Early return for small problems because the computations below are time consuming for them.
  // Perhaps it would make more sense to consider k*n*m??
  // Note that for very tiny problems, this function should be bypassed anyway
  // because we use the coefficient-based implementation for them.
161  if((std::max)(k,(std::max)(m,n))<48)
162  return;
163 
164  typedef typename Traits::ResScalar ResScalar;
165  enum {
166  k_peeling = 8,
167  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
168  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
169  };
170 
171  // ---- 1st level of blocking on L1, yields kc ----
172 
  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
  // We also include a register-level block of the result (mr x nr).
  // (In an ideal world only the lhs panel would stay in L1.)
  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
178  const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
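  // Illustrative sketch with hypothetical numbers: for float scalars with mr=12, nr=4,
  // KcFactor=1 and a 32 KB L1, we get k_sub = 12*4*4 = 192 bytes and k_div = (12*4 + 4*4) = 64,
  // hence max_kc = ((32768-192)/64) & ~7 = 509 & ~7 = 504, i.e. kc is capped at 504.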
179  const Index old_k = k;
180  if(k>max_kc)
181  {
182  // We are really blocking on the third dimension:
183  // -> reduce blocking size to make sure the last block is as large as possible
184  // while keeping the same number of sweeps over the result.
185  k = (k%max_kc)==0 ? max_kc
186  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
187 
188  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
189  }
190 
191  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
192 
  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
  //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
  // The number below is quite conservative: it is better to underestimate the cache size rather than to overestimate it.
  // For instance, it corresponds to 6 MB of L3 shared among 4 cores.
197  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
198  const Index actual_l2 = l3;
199  #else
200  const Index actual_l2 = 1572864; // == 1.5 MB
201  #endif
202 
203 
204 
  // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
  // The second half is implicitly reserved to access the result and lhs coefficients.
  // When k<max_kc, then nc can grow arbitrarily. In practice, it seems fruitful
  // to limit this growth: we bound nc to grow by a factor of at most 1.5.
  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
211  Index max_nc;
212  const Index lhs_bytes = m * k * sizeof(LhsScalar);
213  const Index remaining_l1 = l1- k_sub - lhs_bytes;
214  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
215  {
216  // L1 blocking
217  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
218  }
219  else
220  {
221  // L2 blocking
222  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
223  }
224  // WARNING Below, we assume that Traits::nr is a power of two.
225  Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
226  if(n>nc)
227  {
228  // We are really blocking over the columns:
229  // -> reduce blocking size to make sure the last block is as large as possible
230  // while keeping the same number of sweeps over the packed lhs.
231  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
232  n = (n%nc)==0 ? nc
233  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
234  }
235  else if(old_k==k)
236  {
237  // So far, no blocking at all, i.e., kc==k, and nc==n.
238  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
239  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
240  Index problem_size = k*n*sizeof(LhsScalar);
241  Index actual_lm = actual_l2;
242  Index max_mc = m;
243  if(problem_size<=1024)
244  {
      // the problem is small enough to be kept in L1
      // Let's choose m such that the lhs block fits in 1/3 of L1
247  actual_lm = l1;
248  }
249  else if(l3!=0 && problem_size<=32768)
250  {
      // we have both L2 and L3, and the problem is small enough to be kept in L2
      // Let's choose m such that the lhs block fits in 1/3 of L2
253  actual_lm = l2;
254  max_mc = 576;
255  }
256  Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
257  if (mc > Traits::mr) mc -= mc % Traits::mr;
258  else if (mc==0) return;
259  m = (m%mc)==0 ? mc
260  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
261  }
262  }
263 }
264 
265 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
266 {
267 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
268  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
269  k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
270  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
271  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
272  return true;
273  }
274 #else
275  EIGEN_UNUSED_VARIABLE(k)
276  EIGEN_UNUSED_VARIABLE(m)
277  EIGEN_UNUSED_VARIABLE(n)
278 #endif
279  return false;
280 }
281 
298 template<typename LhsScalar, typename RhsScalar, int KcFactor>
299 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
300 {
301  if (!useSpecificBlockingSizes(k, m, n)) {
302  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
303  }
304 
305  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
306  enum {
307  kr = 8,
308  mr = Traits::mr,
309  nr = Traits::nr
310  };
311  if (k > kr) k -= k % kr;
312  if (m > mr) m -= m % mr;
313  if (n > nr) n -= n % nr;
314 }
315 
316 template<typename LhsScalar, typename RhsScalar>
317 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
318 {
319  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
320 }
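// Illustrative sketch (hypothetical sizes): shrinking the blocking parameters for a
// single-threaded float product with m = n = k = 1024.
//   Index kc = 1024, mc = 1024, nc = 1024;
//   computeProductBlockingSizes<float,float>(kc, mc, nc, 1);
//   // kc, mc and nc now hold cache-friendly block sizes; each one that exceeds the
//   // corresponding register blocking size (8, Traits::mr, Traits::nr) has been
//   // rounded down to a multiple of it.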
321 
322 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
323  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
324 #else
325 
326  // FIXME (a bit overkill maybe ?)
327 
328  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
329  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
330  {
331  c = cj.pmadd(a,b,c);
332  }
333  };
334 
335  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
336  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
337  {
338  t = b; t = cj.pmul(a,t); c = padd(c,t);
339  }
340  };
341 
342  template<typename CJ, typename A, typename B, typename C, typename T>
343  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
344  {
345  gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
346  }
347 
348  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
349 // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
350 #endif
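// Illustrative sketch: on every path above, CJMADD(CJ,A,B,C,T) performs the multiply-accumulate
//   C = CJ.pmadd(A,B,C),   i.e. C += A*B with the conjugations encoded in the conj_helper CJ;
// T is only used as a scratch register in the specialization where all four operand types
// coincide, to help the compiler with register allocation.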
351 
352 /* Vectorization logic
353  * real*real: unpack rhs to constant packets, ...
354  *
355  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
356  * storing each res packet into two packets (2x2),
357  * at the end combine them: swap the second and addsub them
358  * cf*cf : same but with 2x4 blocks
359  * cplx*real : unpack rhs to constant packets, ...
360  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
361  */
362 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
363 class gebp_traits
364 {
365 public:
366  typedef _LhsScalar LhsScalar;
367  typedef _RhsScalar RhsScalar;
368  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
369 
370  enum {
371  ConjLhs = _ConjLhs,
372  ConjRhs = _ConjRhs,
373  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
374  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
375  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
376  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
377 
378  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
379 
380  // register block size along the N direction must be 1 or 4
381  nr = 4,
382 
383  // register block size along the M direction (currently, this one cannot be modified)
384  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
385 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
386  // we assume 16 registers
    // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
389  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
390 #else
391  mr = default_mr,
392 #endif
393 
394  LhsProgress = LhsPacketSize,
395  RhsProgress = 1
396  };
397 
398  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
399  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
400  typedef typename packet_traits<ResScalar>::type _ResPacket;
401 
402  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
403  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
404  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
405 
406  typedef ResPacket AccPacket;
407 
408  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
409  {
410  p = pset1<ResPacket>(ResScalar(0));
411  }
412 
413  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
414  {
415  pbroadcast4(b, b0, b1, b2, b3);
416  }
417 
418 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
419 // {
420 // pbroadcast2(b, b0, b1);
421 // }
422 
423  template<typename RhsPacketType>
424  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
425  {
426  dest = pset1<RhsPacketType>(*b);
427  }
428 
429  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
430  {
431  dest = ploadquad<RhsPacket>(b);
432  }
433 
434  template<typename LhsPacketType>
435  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
436  {
437  dest = pload<LhsPacketType>(a);
438  }
439 
440  template<typename LhsPacketType>
441  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
442  {
443  dest = ploadu<LhsPacketType>(a);
444  }
445 
446  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
447  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
448  {
    // It would be a lot cleaner to call pmadd all the time. Unfortunately, if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA), gcc fails to figure out how to avoid
    // spilling registers.
453 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
454  EIGEN_UNUSED_VARIABLE(tmp);
455  c = pmadd(a,b,c);
456 #else
457  tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
458 #endif
459  }
460 
461  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
462  {
463  r = pmadd(c,alpha,r);
464  }
465 
466  template<typename ResPacketHalf>
467  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
468  {
469  r = pmadd(c,alpha,r);
470  }
471 
472 protected:
473 // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
474 // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
475 };
476 
477 template<typename RealScalar, bool _ConjLhs>
478 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
479 {
480 public:
481  typedef std::complex<RealScalar> LhsScalar;
482  typedef RealScalar RhsScalar;
483  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
484 
485  enum {
486  ConjLhs = _ConjLhs,
487  ConjRhs = false,
488  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
489  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
490  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
491  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
492 
493  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
494  nr = 4,
495 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
496  // we assume 16 registers
497  mr = 3*LhsPacketSize,
498 #else
499  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
500 #endif
501 
502  LhsProgress = LhsPacketSize,
503  RhsProgress = 1
504  };
505 
506  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
507  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
508  typedef typename packet_traits<ResScalar>::type _ResPacket;
509 
510  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
511  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
512  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
513 
514  typedef ResPacket AccPacket;
515 
516  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
517  {
518  p = pset1<ResPacket>(ResScalar(0));
519  }
520 
521  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
522  {
523  dest = pset1<RhsPacket>(*b);
524  }
525 
526  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
527  {
528  dest = pset1<RhsPacket>(*b);
529  }
530 
531  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
532  {
533  dest = pload<LhsPacket>(a);
534  }
535 
536  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
537  {
538  dest = ploadu<LhsPacket>(a);
539  }
540 
541  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
542  {
543  pbroadcast4(b, b0, b1, b2, b3);
544  }
545 
546 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
547 // {
548 // pbroadcast2(b, b0, b1);
549 // }
550 
551  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
552  {
553  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
554  }
555 
556  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
557  {
558 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
559  EIGEN_UNUSED_VARIABLE(tmp);
560  c.v = pmadd(a.v,b,c.v);
561 #else
562  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
563 #endif
564  }
565 
566  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
567  {
568  c += a * b;
569  }
570 
571  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
572  {
573  r = cj.pmadd(c,alpha,r);
574  }
575 
576 protected:
577  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
578 };
579 
580 template<typename Packet>
581 struct DoublePacket
582 {
583  Packet first;
584  Packet second;
585 };
586 
587 template<typename Packet>
588 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
589 {
590  DoublePacket<Packet> res;
591  res.first = padd(a.first, b.first);
592  res.second = padd(a.second,b.second);
593  return res;
594 }
595 
596 template<typename Packet>
597 const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
598 {
599  return a;
600 }
601 
602 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
603 // template<typename Packet>
604 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
605 // {
606 // DoublePacket<Packet> res;
607 // res.first = padd(a.first, b.first);
608 // res.second = padd(a.second,b.second);
609 // return res;
610 // }
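// Illustrative sketch of the cd*cd accumulation performed with DoublePacket (no conjugation):
// loadRhs() broadcasts a rhs coefficient b as (b_r,b_r,...) into .first and (b_i,b_i,...) into
// .second, while the lhs is loaded as an interleaved real packet (a0_r,a0_i,a1_r,a1_i,...).
// madd() then accumulates
//   c.first  += a * b_r   and   c.second += a * b_i,
// and acc() recombines them per complex element as
//   res = c.first + pcplxflip(pconj(c.second)) = (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i) = a*b.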
611 
612 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
613 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
614 {
615 public:
616  typedef std::complex<RealScalar> Scalar;
617  typedef std::complex<RealScalar> LhsScalar;
618  typedef std::complex<RealScalar> RhsScalar;
619  typedef std::complex<RealScalar> ResScalar;
620 
621  enum {
622  ConjLhs = _ConjLhs,
623  ConjRhs = _ConjRhs,
624  Vectorizable = packet_traits<RealScalar>::Vectorizable
625  && packet_traits<Scalar>::Vectorizable,
626  RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
627  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
628  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
629  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
630 
631  // FIXME: should depend on NumberOfRegisters
632  nr = 4,
633  mr = ResPacketSize,
634 
635  LhsProgress = ResPacketSize,
636  RhsProgress = 1
637  };
638 
639  typedef typename packet_traits<RealScalar>::type RealPacket;
640  typedef typename packet_traits<Scalar>::type ScalarPacket;
641  typedef DoublePacket<RealPacket> DoublePacketType;
642 
643  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
644  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
645  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
646  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
647 
648  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
649 
650  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
651  {
652  p.first = pset1<RealPacket>(RealScalar(0));
653  p.second = pset1<RealPacket>(RealScalar(0));
654  }
655 
656  // Scalar path
657  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
658  {
659  dest = pset1<ResPacket>(*b);
660  }
661 
662  // Vectorized path
663  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
664  {
665  dest.first = pset1<RealPacket>(real(*b));
666  dest.second = pset1<RealPacket>(imag(*b));
667  }
668 
669  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
670  {
671  loadRhs(b,dest);
672  }
673  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
674  {
675  eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
676  loadRhs(b,dest);
677  }
678 
679  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
680  {
681  // FIXME not sure that's the best way to implement it!
682  loadRhs(b+0, b0);
683  loadRhs(b+1, b1);
684  loadRhs(b+2, b2);
685  loadRhs(b+3, b3);
686  }
687 
688  // Vectorized path
689  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
690  {
691  // FIXME not sure that's the best way to implement it!
692  loadRhs(b+0, b0);
693  loadRhs(b+1, b1);
694  }
695 
696  // Scalar path
697  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
698  {
699  // FIXME not sure that's the best way to implement it!
700  loadRhs(b+0, b0);
701  loadRhs(b+1, b1);
702  }
703 
704  // nothing special here
705  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
706  {
707  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
708  }
709 
710  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
711  {
712  dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
713  }
714 
715  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
716  {
717  c.first = padd(pmul(a,b.first), c.first);
718  c.second = padd(pmul(a,b.second),c.second);
719  }
720 
721  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
722  {
723  c = cj.pmadd(a,b,c);
724  }
725 
726  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
727 
728  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
729  {
730  // assemble c
731  ResPacket tmp;
732  if((!ConjLhs)&&(!ConjRhs))
733  {
734  tmp = pcplxflip(pconj(ResPacket(c.second)));
735  tmp = padd(ResPacket(c.first),tmp);
736  }
737  else if((!ConjLhs)&&(ConjRhs))
738  {
739  tmp = pconj(pcplxflip(ResPacket(c.second)));
740  tmp = padd(ResPacket(c.first),tmp);
741  }
742  else if((ConjLhs)&&(!ConjRhs))
743  {
744  tmp = pcplxflip(ResPacket(c.second));
745  tmp = padd(pconj(ResPacket(c.first)),tmp);
746  }
747  else if((ConjLhs)&&(ConjRhs))
748  {
749  tmp = pcplxflip(ResPacket(c.second));
750  tmp = psub(pconj(ResPacket(c.first)),tmp);
751  }
752 
753  r = pmadd(tmp,alpha,r);
754  }
755 
756 protected:
757  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
758 };
759 
760 template<typename RealScalar, bool _ConjRhs>
761 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
762 {
763 public:
764  typedef std::complex<RealScalar> Scalar;
765  typedef RealScalar LhsScalar;
766  typedef Scalar RhsScalar;
767  typedef Scalar ResScalar;
768 
769  enum {
770  ConjLhs = false,
771  ConjRhs = _ConjRhs,
772  Vectorizable = packet_traits<RealScalar>::Vectorizable
773  && packet_traits<Scalar>::Vectorizable,
774  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
775  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
776  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
777 
778  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
779  // FIXME: should depend on NumberOfRegisters
780  nr = 4,
781  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
782 
783  LhsProgress = ResPacketSize,
784  RhsProgress = 1
785  };
786 
787  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
788  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
789  typedef typename packet_traits<ResScalar>::type _ResPacket;
790 
791  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
792  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
793  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
794 
795  typedef ResPacket AccPacket;
796 
797  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
798  {
799  p = pset1<ResPacket>(ResScalar(0));
800  }
801 
802  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
803  {
804  dest = pset1<RhsPacket>(*b);
805  }
806 
807  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
808  {
809  pbroadcast4(b, b0, b1, b2, b3);
810  }
811 
812 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
813 // {
814 // // FIXME not sure that's the best way to implement it!
815 // b0 = pload1<RhsPacket>(b+0);
816 // b1 = pload1<RhsPacket>(b+1);
817 // }
818 
819  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
820  {
821  dest = ploaddup<LhsPacket>(a);
822  }
823 
824  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
825  {
826  eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
827  loadRhs(b,dest);
828  }
829 
830  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
831  {
832  dest = ploaddup<LhsPacket>(a);
833  }
834 
835  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
836  {
837  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
838  }
839 
840  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
841  {
842 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
843  EIGEN_UNUSED_VARIABLE(tmp);
844  c.v = pmadd(a,b.v,c.v);
845 #else
846  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
847 #endif
848 
849  }
850 
851  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
852  {
853  c += a * b;
854  }
855 
856  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
857  {
858  r = cj.pmadd(alpha,c,r);
859  }
860 
861 protected:
862  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
863 };
864 
865 // helper for the rotating kernel below
866 template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
867 struct PossiblyRotatingKernelHelper
868 {
869  // default implementation, not rotating
870 
871  typedef typename GebpKernel::Traits Traits;
872  typedef typename Traits::RhsScalar RhsScalar;
873  typedef typename Traits::RhsPacket RhsPacket;
874  typedef typename Traits::AccPacket AccPacket;
875 
876  const Traits& traits;
877  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
878 
879 
880  template <size_t K, size_t Index>
881  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
882  {
883  traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
884  }
885 
886  void unrotateResult(AccPacket&,
887  AccPacket&,
888  AccPacket&,
889  AccPacket&)
890  {
891  }
892 };
893 
894 // rotating implementation
895 template <typename GebpKernel>
896 struct PossiblyRotatingKernelHelper<GebpKernel, true>
897 {
898  typedef typename GebpKernel::Traits Traits;
899  typedef typename Traits::RhsScalar RhsScalar;
900  typedef typename Traits::RhsPacket RhsPacket;
901  typedef typename Traits::AccPacket AccPacket;
902 
903  const Traits& traits;
904  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
905 
906  template <size_t K, size_t Index>
907  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
908  {
909  if (Index == 0) {
910  to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
911  } else {
912  EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
913  to = protate<1>(to);
914  }
915  }
916 
917  void unrotateResult(AccPacket& res0,
918  AccPacket& res1,
919  AccPacket& res2,
920  AccPacket& res3)
921  {
922  PacketBlock<AccPacket> resblock;
923  resblock.packet[0] = res0;
924  resblock.packet[1] = res1;
925  resblock.packet[2] = res2;
926  resblock.packet[3] = res3;
927  ptranspose(resblock);
928  resblock.packet[3] = protate<1>(resblock.packet[3]);
929  resblock.packet[2] = protate<2>(resblock.packet[2]);
930  resblock.packet[1] = protate<3>(resblock.packet[1]);
931  ptranspose(resblock);
932  res0 = resblock.packet[0];
933  res1 = resblock.packet[1];
934  res2 = resblock.packet[2];
935  res3 = resblock.packet[3];
936  }
937 };
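// Illustrative sketch of the rotating scheme: in the 3pX4 micro kernel below, instead of
// broadcasting four different rhs coefficients per step, a whole rhs packet {b0,b1,b2,b3}
// is loaded once (Index==0) and then rotated by one lane between the madd calls:
//   to = pload<RhsPacket>(...);   // {b0,b1,b2,b3}
//   ... madd ...
//   to = protate<1>(to);          // same coefficients, rotated by one lane
// As a result each accumulator holds lane-rotated partial sums, which unrotateResult()
// straightens out with two ptranspose() passes and per-row protate() calls before the
// final scaling by alpha and the stores.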
938 
/* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  |real |cplx | no vectorization yet, would require packing A with duplication
 *  |cplx |real | easy vectorization
 */
946 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
947 struct gebp_kernel
948 {
949  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
950  typedef typename Traits::ResScalar ResScalar;
951  typedef typename Traits::LhsPacket LhsPacket;
952  typedef typename Traits::RhsPacket RhsPacket;
953  typedef typename Traits::ResPacket ResPacket;
954  typedef typename Traits::AccPacket AccPacket;
955 
956  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
957  typedef typename SwappedTraits::ResScalar SResScalar;
958  typedef typename SwappedTraits::LhsPacket SLhsPacket;
959  typedef typename SwappedTraits::RhsPacket SRhsPacket;
960  typedef typename SwappedTraits::ResPacket SResPacket;
961  typedef typename SwappedTraits::AccPacket SAccPacket;
962 
963  typedef typename DataMapper::LinearMapper LinearMapper;
964 
965  enum {
966  Vectorizable = Traits::Vectorizable,
967  LhsProgress = Traits::LhsProgress,
968  RhsProgress = Traits::RhsProgress,
969  ResPacketSize = Traits::ResPacketSize
970  };
971 
972 
973  static const bool UseRotatingKernel =
974  EIGEN_ARCH_ARM &&
975  internal::is_same<LhsScalar, float>::value &&
976  internal::is_same<RhsScalar, float>::value &&
977  internal::is_same<ResScalar, float>::value &&
978  Traits::LhsPacketSize == 4 &&
979  Traits::RhsPacketSize == 4 &&
980  Traits::ResPacketSize == 4;
981 
982  EIGEN_DONT_INLINE
983  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
984  Index rows, Index depth, Index cols, ResScalar alpha,
985  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
986 };
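// Illustrative call pattern (simplified, as driven by the higher-level GEMM loops): the kernel
// multiplies a packed rows x depth block of the lhs by a packed depth x cols panel of the rhs
// and accumulates alpha times the result into the destination through the DataMapper.
//   gebp_kernel<float,float,Index,DataMapper,Traits::mr,Traits::nr,false,false> gebp;
//   gebp(resMapper, blockA, blockB, mc, kc, nc, alpha);
// Here resMapper, blockA and blockB are hypothetical names for the destination mapper and for
// buffers filled by the lhs/rhs packing routines; mc, kc, nc come from computeProductBlockingSizes.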
987 
988 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
989 EIGEN_DONT_INLINE
990 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
991  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
992  Index rows, Index depth, Index cols, ResScalar alpha,
993  Index strideA, Index strideB, Index offsetA, Index offsetB)
994  {
995  Traits traits;
996  SwappedTraits straits;
997 
998  if(strideA==-1) strideA = depth;
999  if(strideB==-1) strideB = depth;
1000  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1001  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1002  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1003  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1004  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
1005  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1006  const Index peeled_kc = depth & ~(pk-1);
1007  const Index prefetch_res_offset = 32/sizeof(ResScalar);
1008 // const Index depth2 = depth & ~1;
1009 
1010  //---------- Process 3 * LhsProgress rows at once ----------
  // This corresponds to 3*LhsProgress x nr register blocks.
  // Usually, this only makes sense with FMA.
1013  if(mr>=3*Traits::LhsProgress)
1014  {
1015  PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
1016 
1017  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1018  // and on each largest micro vertical panel of the rhs (depth * nr).
    // Blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
    // However, if depth is too small, we can extend the number of rows of these horizontal panels.
    // This actual number of rows is computed as follows:
1022  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1023  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1024  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1025  // or because we are testing specific blocking sizes.
1026  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
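    // Illustrative sketch with hypothetical numbers: for floats with LhsProgress=8, mr=24, nr=4,
    // depth=64 and l1=32768, the formula gives
    //   actual_panel_rows = 24 * max(1, (32768 - 4*24*4 - 64*4*4) / (64*4*3*8)) = 24 * 5 = 120,
    // i.e. five 3*LhsProgress row strips reuse the same packed rhs panel while it stays in L1.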
1027  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1028  {
1029  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1030  for(Index j2=0; j2<packet_cols4; j2+=nr)
1031  {
1032  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1033  {
1034 
1035  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1036  // stored into 3 x nr registers.
1037 
1038  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1039  prefetch(&blA[0]);
1040 
1041  // gets res block as register
1042  AccPacket C0, C1, C2, C3,
1043  C4, C5, C6, C7,
1044  C8, C9, C10, C11;
1045  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1046  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1047  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1048 
1049  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1050  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1051  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1052  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1053 
1054  r0.prefetch(0);
1055  r1.prefetch(0);
1056  r2.prefetch(0);
1057  r3.prefetch(0);
1058 
1059  // performs "inner" products
1060  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1061  prefetch(&blB[0]);
1062  LhsPacket A0, A1;
1063 
1064  for(Index k=0; k<peeled_kc; k+=pk)
1065  {
1066  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1067  RhsPacket B_0, T0;
1068  LhsPacket A2;
1069 
1070 #define EIGEN_GEBP_ONESTEP(K) \
1071  do { \
1072  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1073  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1074  internal::prefetch(blA+(3*K+16)*LhsProgress); \
1075  if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
1076  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1077  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1078  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1079  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
1080  traits.madd(A0, B_0, C0, T0); \
1081  traits.madd(A1, B_0, C4, T0); \
1082  traits.madd(A2, B_0, C8, B_0); \
1083  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
1084  traits.madd(A0, B_0, C1, T0); \
1085  traits.madd(A1, B_0, C5, T0); \
1086  traits.madd(A2, B_0, C9, B_0); \
1087  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
1088  traits.madd(A0, B_0, C2, T0); \
1089  traits.madd(A1, B_0, C6, T0); \
1090  traits.madd(A2, B_0, C10, B_0); \
1091  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
1092  traits.madd(A0, B_0, C3 , T0); \
1093  traits.madd(A1, B_0, C7, T0); \
1094  traits.madd(A2, B_0, C11, B_0); \
1095  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1096  } while(false)
1097 
1098  internal::prefetch(blB);
1099  EIGEN_GEBP_ONESTEP(0);
1100  EIGEN_GEBP_ONESTEP(1);
1101  EIGEN_GEBP_ONESTEP(2);
1102  EIGEN_GEBP_ONESTEP(3);
1103  EIGEN_GEBP_ONESTEP(4);
1104  EIGEN_GEBP_ONESTEP(5);
1105  EIGEN_GEBP_ONESTEP(6);
1106  EIGEN_GEBP_ONESTEP(7);
1107 
1108  blB += pk*4*RhsProgress;
1109  blA += pk*3*Traits::LhsProgress;
1110 
1111  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1112  }
1113  // process remaining peeled loop
1114  for(Index k=peeled_kc; k<depth; k++)
1115  {
1116  RhsPacket B_0, T0;
1117  LhsPacket A2;
1118  EIGEN_GEBP_ONESTEP(0);
1119  blB += 4*RhsProgress;
1120  blA += 3*Traits::LhsProgress;
1121  }
1122 
1123 #undef EIGEN_GEBP_ONESTEP
1124 
1125  possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
1126  possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
1127  possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
1128 
1129  ResPacket R0, R1, R2;
1130  ResPacket alphav = pset1<ResPacket>(alpha);
1131 
1132  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1133  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1134  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1135  traits.acc(C0, alphav, R0);
1136  traits.acc(C4, alphav, R1);
1137  traits.acc(C8, alphav, R2);
1138  r0.storePacket(0 * Traits::ResPacketSize, R0);
1139  r0.storePacket(1 * Traits::ResPacketSize, R1);
1140  r0.storePacket(2 * Traits::ResPacketSize, R2);
1141 
1142  R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1143  R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1144  R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1145  traits.acc(C1, alphav, R0);
1146  traits.acc(C5, alphav, R1);
1147  traits.acc(C9, alphav, R2);
1148  r1.storePacket(0 * Traits::ResPacketSize, R0);
1149  r1.storePacket(1 * Traits::ResPacketSize, R1);
1150  r1.storePacket(2 * Traits::ResPacketSize, R2);
1151 
1152  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1153  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1154  R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1155  traits.acc(C2, alphav, R0);
1156  traits.acc(C6, alphav, R1);
1157  traits.acc(C10, alphav, R2);
1158  r2.storePacket(0 * Traits::ResPacketSize, R0);
1159  r2.storePacket(1 * Traits::ResPacketSize, R1);
1160  r2.storePacket(2 * Traits::ResPacketSize, R2);
1161 
1162  R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1163  R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1164  R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1165  traits.acc(C3, alphav, R0);
1166  traits.acc(C7, alphav, R1);
1167  traits.acc(C11, alphav, R2);
1168  r3.storePacket(0 * Traits::ResPacketSize, R0);
1169  r3.storePacket(1 * Traits::ResPacketSize, R1);
1170  r3.storePacket(2 * Traits::ResPacketSize, R2);
1171  }
1172  }
1173 
1174  // Deal with remaining columns of the rhs
1175  for(Index j2=packet_cols4; j2<cols; j2++)
1176  {
1177  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1178  {
1179  // One column at a time
1180  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1181  prefetch(&blA[0]);
1182 
1183  // gets res block as register
1184  AccPacket C0, C4, C8;
1185  traits.initAcc(C0);
1186  traits.initAcc(C4);
1187  traits.initAcc(C8);
1188 
1189  LinearMapper r0 = res.getLinearMapper(i, j2);
1190  r0.prefetch(0);
1191 
1192  // performs "inner" products
1193  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1194  LhsPacket A0, A1, A2;
1195 
1196  for(Index k=0; k<peeled_kc; k+=pk)
1197  {
1198  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1199  RhsPacket B_0;
1200 #define EIGEN_GEBGP_ONESTEP(K) \
1201  do { \
1202  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1203  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1204  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1205  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1206  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1207  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1208  traits.madd(A0, B_0, C0, B_0); \
1209  traits.madd(A1, B_0, C4, B_0); \
1210  traits.madd(A2, B_0, C8, B_0); \
1211  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1212  } while(false)
1213 
1214  EIGEN_GEBGP_ONESTEP(0);
1215  EIGEN_GEBGP_ONESTEP(1);
1216  EIGEN_GEBGP_ONESTEP(2);
1217  EIGEN_GEBGP_ONESTEP(3);
1218  EIGEN_GEBGP_ONESTEP(4);
1219  EIGEN_GEBGP_ONESTEP(5);
1220  EIGEN_GEBGP_ONESTEP(6);
1221  EIGEN_GEBGP_ONESTEP(7);
1222 
1223  blB += pk*RhsProgress;
1224  blA += pk*3*Traits::LhsProgress;
1225 
1226  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1227  }
1228 
1229  // process remaining peeled loop
1230  for(Index k=peeled_kc; k<depth; k++)
1231  {
1232  RhsPacket B_0;
1233  EIGEN_GEBGP_ONESTEP(0);
1234  blB += RhsProgress;
1235  blA += 3*Traits::LhsProgress;
1236  }
1237 #undef EIGEN_GEBGP_ONESTEP
1238  ResPacket R0, R1, R2;
1239  ResPacket alphav = pset1<ResPacket>(alpha);
1240 
1241  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1242  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1243  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1244  traits.acc(C0, alphav, R0);
1245  traits.acc(C4, alphav, R1);
1246  traits.acc(C8, alphav, R2);
1247  r0.storePacket(0 * Traits::ResPacketSize, R0);
1248  r0.storePacket(1 * Traits::ResPacketSize, R1);
1249  r0.storePacket(2 * Traits::ResPacketSize, R2);
1250  }
1251  }
1252  }
1253  }
1254 
1255  //---------- Process 2 * LhsProgress rows at once ----------
1256  if(mr>=2*Traits::LhsProgress)
1257  {
1258  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1259  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1260  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1261  // or because we are testing specific blocking sizes.
1262  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1263 
1264  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1265  {
1266  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1267  for(Index j2=0; j2<packet_cols4; j2+=nr)
1268  {
1269  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1270  {
1271 
1272  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1273  // stored into 2 x nr registers.
1274 
1275  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1276  prefetch(&blA[0]);
1277 
1278  // gets res block as register
1279  AccPacket C0, C1, C2, C3,
1280  C4, C5, C6, C7;
1281  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1282  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1283 
1284  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1285  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1286  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1287  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1288 
1289  r0.prefetch(prefetch_res_offset);
1290  r1.prefetch(prefetch_res_offset);
1291  r2.prefetch(prefetch_res_offset);
1292  r3.prefetch(prefetch_res_offset);
1293 
1294  // performs "inner" products
1295  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1296  prefetch(&blB[0]);
1297  LhsPacket A0, A1;
1298 
1299  for(Index k=0; k<peeled_kc; k+=pk)
1300  {
1301  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1302  RhsPacket B_0, B1, B2, B3, T0;
1303 
1304  #define EIGEN_GEBGP_ONESTEP(K) \
1305  do { \
1306  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1307  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1308  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1309  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1310  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1311  traits.madd(A0, B_0, C0, T0); \
1312  traits.madd(A1, B_0, C4, B_0); \
1313  traits.madd(A0, B1, C1, T0); \
1314  traits.madd(A1, B1, C5, B1); \
1315  traits.madd(A0, B2, C2, T0); \
1316  traits.madd(A1, B2, C6, B2); \
1317  traits.madd(A0, B3, C3, T0); \
1318  traits.madd(A1, B3, C7, B3); \
1319  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1320  } while(false)
1321 
1322  internal::prefetch(blB+(48+0));
1323  EIGEN_GEBGP_ONESTEP(0);
1324  EIGEN_GEBGP_ONESTEP(1);
1325  EIGEN_GEBGP_ONESTEP(2);
1326  EIGEN_GEBGP_ONESTEP(3);
1327  internal::prefetch(blB+(48+16));
1328  EIGEN_GEBGP_ONESTEP(4);
1329  EIGEN_GEBGP_ONESTEP(5);
1330  EIGEN_GEBGP_ONESTEP(6);
1331  EIGEN_GEBGP_ONESTEP(7);
1332 
1333  blB += pk*4*RhsProgress;
1334  blA += pk*(2*Traits::LhsProgress);
1335 
1336  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1337  }
1338  // process remaining peeled loop
1339  for(Index k=peeled_kc; k<depth; k++)
1340  {
1341  RhsPacket B_0, B1, B2, B3, T0;
1342  EIGEN_GEBGP_ONESTEP(0);
1343  blB += 4*RhsProgress;
1344  blA += 2*Traits::LhsProgress;
1345  }
1346 #undef EIGEN_GEBGP_ONESTEP
1347 
1348  ResPacket R0, R1, R2, R3;
1349  ResPacket alphav = pset1<ResPacket>(alpha);
1350 
1351  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1352  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1353  R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1354  R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1355  traits.acc(C0, alphav, R0);
1356  traits.acc(C4, alphav, R1);
1357  traits.acc(C1, alphav, R2);
1358  traits.acc(C5, alphav, R3);
1359  r0.storePacket(0 * Traits::ResPacketSize, R0);
1360  r0.storePacket(1 * Traits::ResPacketSize, R1);
1361  r1.storePacket(0 * Traits::ResPacketSize, R2);
1362  r1.storePacket(1 * Traits::ResPacketSize, R3);
1363 
1364  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1365  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1366  R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1367  R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1368  traits.acc(C2, alphav, R0);
1369  traits.acc(C6, alphav, R1);
1370  traits.acc(C3, alphav, R2);
1371  traits.acc(C7, alphav, R3);
1372  r2.storePacket(0 * Traits::ResPacketSize, R0);
1373  r2.storePacket(1 * Traits::ResPacketSize, R1);
1374  r3.storePacket(0 * Traits::ResPacketSize, R2);
1375  r3.storePacket(1 * Traits::ResPacketSize, R3);
1376  }
1377  }
1378 
1379  // Deal with remaining columns of the rhs
1380  for(Index j2=packet_cols4; j2<cols; j2++)
1381  {
1382  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1383  {
1384  // One column at a time
1385  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1386  prefetch(&blA[0]);
1387 
1388  // gets res block as register
1389  AccPacket C0, C4;
1390  traits.initAcc(C0);
1391  traits.initAcc(C4);
1392 
1393  LinearMapper r0 = res.getLinearMapper(i, j2);
1394  r0.prefetch(prefetch_res_offset);
1395 
1396  // performs "inner" products
1397  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1398  LhsPacket A0, A1;
1399 
1400  for(Index k=0; k<peeled_kc; k+=pk)
1401  {
1402  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1403  RhsPacket B_0, B1;
1404 
1405 #define EIGEN_GEBGP_ONESTEP(K) \
1406  do { \
1407  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1408  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1409  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1410  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1411  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1412  traits.madd(A0, B_0, C0, B1); \
1413  traits.madd(A1, B_0, C4, B_0); \
1414  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1415  } while(false)
1416 
1417  EIGEN_GEBGP_ONESTEP(0);
1418  EIGEN_GEBGP_ONESTEP(1);
1419  EIGEN_GEBGP_ONESTEP(2);
1420  EIGEN_GEBGP_ONESTEP(3);
1421  EIGEN_GEBGP_ONESTEP(4);
1422  EIGEN_GEBGP_ONESTEP(5);
1423  EIGEN_GEBGP_ONESTEP(6);
1424  EIGEN_GEBGP_ONESTEP(7);
1425 
1426  blB += pk*RhsProgress;
1427  blA += pk*2*Traits::LhsProgress;
1428 
1429  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1430  }
1431 
1432  // process remaining peeled loop
1433  for(Index k=peeled_kc; k<depth; k++)
1434  {
1435  RhsPacket B_0, B1;
1436  EIGEN_GEBGP_ONESTEP(0);
1437  blB += RhsProgress;
1438  blA += 2*Traits::LhsProgress;
1439  }
1440 #undef EIGEN_GEBGP_ONESTEP
1441  ResPacket R0, R1;
1442  ResPacket alphav = pset1<ResPacket>(alpha);
1443 
1444  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1445  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1446  traits.acc(C0, alphav, R0);
1447  traits.acc(C4, alphav, R1);
1448  r0.storePacket(0 * Traits::ResPacketSize, R0);
1449  r0.storePacket(1 * Traits::ResPacketSize, R1);
1450  }
1451  }
1452  }
1453  }
1454  //---------- Process 1 * LhsProgress rows at once ----------
1455  if(mr>=1*Traits::LhsProgress)
1456  {
1457  // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1458  for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1459  {
1460  // loops on each largest micro vertical panel of rhs (depth * nr)
1461  for(Index j2=0; j2<packet_cols4; j2+=nr)
1462  {
1463  // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1464  // stored into 1 x nr registers.
1465 
1466  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1467  prefetch(&blA[0]);
1468 
1469  // gets res block as register
1470  AccPacket C0, C1, C2, C3;
1471  traits.initAcc(C0);
1472  traits.initAcc(C1);
1473  traits.initAcc(C2);
1474  traits.initAcc(C3);
1475 
1476  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1477  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1478  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1479  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1480 
1481  r0.prefetch(prefetch_res_offset);
1482  r1.prefetch(prefetch_res_offset);
1483  r2.prefetch(prefetch_res_offset);
1484  r3.prefetch(prefetch_res_offset);
1485 
1486  // performs "inner" products
1487  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1488  prefetch(&blB[0]);
1489  LhsPacket A0;
1490 
1491  for(Index k=0; k<peeled_kc; k+=pk)
1492  {
1493  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1494  RhsPacket B_0, B1, B2, B3;
1495 
1496 #define EIGEN_GEBGP_ONESTEP(K) \
1497  do { \
1498  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1499  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1500  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1501  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1502  traits.madd(A0, B_0, C0, B_0); \
1503  traits.madd(A0, B1, C1, B1); \
1504  traits.madd(A0, B2, C2, B2); \
1505  traits.madd(A0, B3, C3, B3); \
1506  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1507  } while(false)
1508 
1509  internal::prefetch(blB+(48+0));
1510  EIGEN_GEBGP_ONESTEP(0);
1511  EIGEN_GEBGP_ONESTEP(1);
1512  EIGEN_GEBGP_ONESTEP(2);
1513  EIGEN_GEBGP_ONESTEP(3);
1514  internal::prefetch(blB+(48+16));
1515  EIGEN_GEBGP_ONESTEP(4);
1516  EIGEN_GEBGP_ONESTEP(5);
1517  EIGEN_GEBGP_ONESTEP(6);
1518  EIGEN_GEBGP_ONESTEP(7);
1519 
1520  blB += pk*4*RhsProgress;
1521  blA += pk*1*LhsProgress;
1522 
1523  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1524  }
1525  // process remaining peeled loop
1526  for(Index k=peeled_kc; k<depth; k++)
1527  {
1528  RhsPacket B_0, B1, B2, B3;
1529  EIGEN_GEBGP_ONESTEP(0);
1530  blB += 4*RhsProgress;
1531  blA += 1*LhsProgress;
1532  }
1533 #undef EIGEN_GEBGP_ONESTEP
1534 
1535  ResPacket R0, R1;
1536  ResPacket alphav = pset1<ResPacket>(alpha);
1537 
1538  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1539  R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1540  traits.acc(C0, alphav, R0);
1541  traits.acc(C1, alphav, R1);
1542  r0.storePacket(0 * Traits::ResPacketSize, R0);
1543  r1.storePacket(0 * Traits::ResPacketSize, R1);
1544 
1545  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1546  R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1547  traits.acc(C2, alphav, R0);
1548  traits.acc(C3, alphav, R1);
1549  r2.storePacket(0 * Traits::ResPacketSize, R0);
1550  r3.storePacket(0 * Traits::ResPacketSize, R1);
1551  }
1552 
1553  // Deal with remaining columns of the rhs
1554  for(Index j2=packet_cols4; j2<cols; j2++)
1555  {
1556  // One column at a time
1557  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1558  prefetch(&blA[0]);
1559 
1560  // get the res block into a register
1561  AccPacket C0;
1562  traits.initAcc(C0);
1563 
1564  LinearMapper r0 = res.getLinearMapper(i, j2);
1565 
1566  // performs "inner" products
1567  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1568  LhsPacket A0;
1569 
1570  for(Index k=0; k<peeled_kc; k+=pk)
1571  {
1572  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1573  RhsPacket B_0;
1574 
1575 #define EIGEN_GEBGP_ONESTEP(K) \
1576  do { \
1577  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1578  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1579  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1580  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1581  traits.madd(A0, B_0, C0, B_0); \
1582  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1583  } while(false)
1584 
1585  EIGEN_GEBGP_ONESTEP(0);
1586  EIGEN_GEBGP_ONESTEP(1);
1587  EIGEN_GEBGP_ONESTEP(2);
1588  EIGEN_GEBGP_ONESTEP(3);
1589  EIGEN_GEBGP_ONESTEP(4);
1590  EIGEN_GEBGP_ONESTEP(5);
1591  EIGEN_GEBGP_ONESTEP(6);
1592  EIGEN_GEBGP_ONESTEP(7);
1593 
1594  blB += pk*RhsProgress;
1595  blA += pk*1*Traits::LhsProgress;
1596 
1597  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1598  }
1599 
1600  // process the remaining, non-peeled k iterations
1601  for(Index k=peeled_kc; k<depth; k++)
1602  {
1603  RhsPacket B_0;
1604  EIGEN_GEBGP_ONESTEP(0);
1605  blB += RhsProgress;
1606  blA += 1*Traits::LhsProgress;
1607  }
1608 #undef EIGEN_GEBGP_ONESTEP
1609  ResPacket R0;
1610  ResPacket alphav = pset1<ResPacket>(alpha);
1611  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1612  traits.acc(C0, alphav, R0);
1613  r0.storePacket(0 * Traits::ResPacketSize, R0);
1614  }
1615  }
1616  }
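  // Illustrative sketch (for exposition only, not used by the kernel): with float and
  // 4-wide packets (LhsProgress == RhsProgress == 4, nr == 4), one EIGEN_GEBGP_ONESTEP
  // of the 1pX4 kernel above behaves like the following scalar pseudo-loop over
  // hypothetical arrays lhs[4] (one packet of the packed lhs) and rhs[4] (the four rhs
  // values broadcast into B_0..B3):
  //
  //   for(int p=0; p<4; ++p) {        // one SIMD lane per p
  //     C0[p] += lhs[p] * rhs[0];
  //     C1[p] += lhs[p] * rhs[1];
  //     C2[p] += lhs[p] * rhs[2];
  //     C3[p] += lhs[p] * rhs[3];
  //   }
  //
  // i.e. a rank-1 (outer product) update of a 4 x 4 block of res kept in the
  // accumulator registers C0..C3.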
1617  //---------- Process remaining rows, one at a time ----------
1618  if(peeled_mc1<rows)
1619  {
1620  // loop over each panel of the rhs
1621  for(Index j2=0; j2<packet_cols4; j2+=nr)
1622  {
1623  // loop over each row of the lhs (1 x depth)
1624  for(Index i=peeled_mc1; i<rows; i+=1)
1625  {
1626  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1627  prefetch(&blA[0]);
1628  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1629 
1630  if( (SwappedTraits::LhsProgress % 4)==0 )
1631  {
1632  // NOTE: the following piece of code won't work for 512-bit registers
1633  SAccPacket C0, C1, C2, C3;
1634  straits.initAcc(C0);
1635  straits.initAcc(C1);
1636  straits.initAcc(C2);
1637  straits.initAcc(C3);
1638 
1639  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1640  const Index endk = (depth/spk)*spk;
1641  const Index endk4 = (depth/(spk*4))*(spk*4);
1642 
1643  Index k=0;
1644  for(; k<endk4; k+=4*spk)
1645  {
1646  SLhsPacket A0,A1;
1647  SRhsPacket B_0,B_1;
1648 
1649  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1650  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1651 
1652  straits.loadRhsQuad(blA+0*spk, B_0);
1653  straits.loadRhsQuad(blA+1*spk, B_1);
1654  straits.madd(A0,B_0,C0,B_0);
1655  straits.madd(A1,B_1,C1,B_1);
1656 
1657  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1658  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1659  straits.loadRhsQuad(blA+2*spk, B_0);
1660  straits.loadRhsQuad(blA+3*spk, B_1);
1661  straits.madd(A0,B_0,C2,B_0);
1662  straits.madd(A1,B_1,C3,B_1);
1663 
1664  blB += 4*SwappedTraits::LhsProgress;
1665  blA += 4*spk;
1666  }
1667  C0 = padd(padd(C0,C1),padd(C2,C3));
1668  for(; k<endk; k+=spk)
1669  {
1670  SLhsPacket A0;
1671  SRhsPacket B_0;
1672 
1673  straits.loadLhsUnaligned(blB, A0);
1674  straits.loadRhsQuad(blA, B_0);
1675  straits.madd(A0,B_0,C0,B_0);
1676 
1677  blB += SwappedTraits::LhsProgress;
1678  blA += spk;
1679  }
1680  if(SwappedTraits::LhsProgress==8)
1681  {
1682  // Special case where we have to first reduce the accumulation register C0
1683  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1684  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1685  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1686  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1687 
1688  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1689  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1690 
1691  if(depth-endk>0)
1692  {
1693  // We have to handle the last row of the rhs which corresponds to a half-packet
1694  SLhsPacketHalf a0;
1695  SRhsPacketHalf b0;
1696  straits.loadLhsUnaligned(blB, a0);
1697  straits.loadRhs(blA, b0);
1698  SAccPacketHalf c0 = predux4(C0);
1699  straits.madd(a0,b0,c0,b0);
1700  straits.acc(c0, alphav, R);
1701  }
1702  else
1703  {
1704  straits.acc(predux4(C0), alphav, R);
1705  }
1706  res.scatterPacket(i, j2, R);
1707  }
1708  else
1709  {
1710  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1711  SResPacket alphav = pset1<SResPacket>(alpha);
1712  straits.acc(C0, alphav, R);
1713  res.scatterPacket(i, j2, R);
1714  }
1715  }
1716  else // scalar path
1717  {
1718  // get a 1 x 4 res block as registers
1719  ResScalar C0(0), C1(0), C2(0), C3(0);
1720 
1721  for(Index k=0; k<depth; k++)
1722  {
1723  LhsScalar A0;
1724  RhsScalar B_0, B_1;
1725 
1726  A0 = blA[k];
1727 
1728  B_0 = blB[0];
1729  B_1 = blB[1];
1730  CJMADD(cj,A0,B_0,C0, B_0);
1731  CJMADD(cj,A0,B_1,C1, B_1);
1732 
1733  B_0 = blB[2];
1734  B_1 = blB[3];
1735  CJMADD(cj,A0,B_0,C2, B_0);
1736  CJMADD(cj,A0,B_1,C3, B_1);
1737 
1738  blB += 4;
1739  }
1740  res(i, j2 + 0) += alpha * C0;
1741  res(i, j2 + 1) += alpha * C1;
1742  res(i, j2 + 2) += alpha * C2;
1743  res(i, j2 + 3) += alpha * C3;
1744  }
1745  }
1746  }
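  // Illustrative note (for exposition only; ignores conjugation and assumes the packed
  // layouts produced by gemm_pack_lhs/gemm_pack_rhs below): in the loop above a single
  // remaining lhs row is multiplied against nr == 4 rhs columns by swapping the roles of
  // lhs and rhs: full packets are read from the packed rhs panel while the lhs values
  // are replicated, and the packet accumulators are reduced at the end. Conceptually,
  // for one row i:
  //
  //   for(Index c=0; c<4; ++c) {
  //     ResScalar acc(0);
  //     for(Index k=0; k<depth; ++k)
  //       acc += blA[k] * blB[k*4 + c];   // rhs packed with 4 columns interleaved per k
  //     res(i, j2+c) += alpha * acc;
  //   }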
1747  // remaining columns
1748  for(Index j2=packet_cols4; j2<cols; j2++)
1749  {
1750  // loop over each row of the lhs (1 x depth)
1751  for(Index i=peeled_mc1; i<rows; i+=1)
1752  {
1753  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1754  prefetch(&blA[0]);
1755  // gets a 1 x 1 res block as registers
1756  ResScalar C0(0);
1757  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1758  for(Index k=0; k<depth; k++)
1759  {
1760  LhsScalar A0 = blA[k];
1761  RhsScalar B_0 = blB[k];
1762  CJMADD(cj, A0, B_0, C0, B_0);
1763  }
1764  res(i, j2) += alpha * C0;
1765  }
1766  }
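  // Illustrative sketch (for exposition only; ignores conjugation): the scalar tail
  // just above computes, for each remaining row i and column j2, a plain dot product
  // over the packed panels:
  //
  //   ResScalar acc(0);
  //   for(Index k=0; k<depth; ++k)
  //     acc += blA[k] * blB[k];   // blA points at row i of blockA, blB at column j2 of blockB
  //   res(i, j2) += alpha * acc;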
1767  }
1768  }
1769 
1770 
1771 #undef CJMADD
1772 
1773 // pack a block of the lhs
1774 // The traversal order is as follows (mr==4):
1775 // 0 4 8 12 ...
1776 // 1 5 9 13 ...
1777 // 2 6 10 14 ...
1778 // 3 7 11 15 ...
1779 //
1780 // 16 20 24 28 ...
1781 // 17 21 25 29 ...
1782 // 18 22 26 30 ...
1783 // 19 23 27 31 ...
1784 //
1785 // 32 33 34 35 ...
1786 // 36 37 38 39 ...
1787 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1788 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1789 {
1790  typedef typename DataMapper::LinearMapper LinearMapper;
1791  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1792 };
1793 
1794 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1795 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1796  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1797 {
1798  typedef typename packet_traits<Scalar>::type Packet;
1799  enum { PacketSize = packet_traits<Scalar>::size };
1800 
1801  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1802  EIGEN_UNUSED_VARIABLE(stride);
1803  EIGEN_UNUSED_VARIABLE(offset);
1804  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1805  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
1806  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1807  Index count = 0;
1808 
1809  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1810  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1811  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1812  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1813  : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
1814 
1815  Index i=0;
1816 
1817  // Pack 3 packets
1818  if(Pack1>=3*PacketSize)
1819  {
1820  for(; i<peeled_mc3; i+=3*PacketSize)
1821  {
1822  if(PanelMode) count += (3*PacketSize) * offset;
1823 
1824  for(Index k=0; k<depth; k++)
1825  {
1826  Packet A, B, C;
1827  A = lhs.loadPacket(i+0*PacketSize, k);
1828  B = lhs.loadPacket(i+1*PacketSize, k);
1829  C = lhs.loadPacket(i+2*PacketSize, k);
1830  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1831  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1832  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1833  }
1834  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
1835  }
1836  }
1837  // Pack 2 packets
1838  if(Pack1>=2*PacketSize)
1839  {
1840  for(; i<peeled_mc2; i+=2*PacketSize)
1841  {
1842  if(PanelMode) count += (2*PacketSize) * offset;
1843 
1844  for(Index k=0; k<depth; k++)
1845  {
1846  Packet A, B;
1847  A = lhs.loadPacket(i+0*PacketSize, k);
1848  B = lhs.loadPacket(i+1*PacketSize, k);
1849  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1850  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1851  }
1852  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
1853  }
1854  }
1855  // Pack 1 packets
1856  if(Pack1>=1*PacketSize)
1857  {
1858  for(; i<peeled_mc1; i+=1*PacketSize)
1859  {
1860  if(PanelMode) count += (1*PacketSize) * offset;
1861 
1862  for(Index k=0; k<depth; k++)
1863  {
1864  Packet A;
1865  A = lhs.loadPacket(i+0*PacketSize, k);
1866  pstore(blockA+count, cj.pconj(A));
1867  count+=PacketSize;
1868  }
1869  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1870  }
1871  }
1872  // Pack scalars
1873  if(Pack2<PacketSize && Pack2>1)
1874  {
1875  for(; i<peeled_mc0; i+=Pack2)
1876  {
1877  if(PanelMode) count += Pack2 * offset;
1878 
1879  for(Index k=0; k<depth; k++)
1880  for(Index w=0; w<Pack2; w++)
1881  blockA[count++] = cj(lhs(i+w, k));
1882 
1883  if(PanelMode) count += Pack2 * (stride-offset-depth);
1884  }
1885  }
1886  for(; i<rows; i++)
1887  {
1888  if(PanelMode) count += offset;
1889  for(Index k=0; k<depth; k++)
1890  blockA[count++] = cj(lhs(i, k));
1891  if(PanelMode) count += (stride-offset-depth);
1892  }
1893 }
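// Illustrative sketch (for exposition only, not used by the packing routine): with
// PacketSize == 4, the "Pack 1 packets" stage above produces exactly the traversal
// shown in the diagram before gemm_pack_lhs, i.e. for each group of PacketSize rows
// all depth columns are stored contiguously. A scalar equivalent of that stage, for a
// hypothetical starting row i, is:
//
//   for(Index k=0; k<depth; ++k)
//     for(Index r=0; r<PacketSize; ++r)
//       blockA[count++] = cj(lhs(i+r, k));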
1894 
1895 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1896 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1897 {
1898  typedef typename DataMapper::LinearMapper LinearMapper;
1899  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1900 };
1901 
1902 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1903 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1904  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1905 {
1906  typedef typename packet_traits<Scalar>::type Packet;
1907  enum { PacketSize = packet_traits<Scalar>::size };
1908 
1909  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1910  EIGEN_UNUSED_VARIABLE(stride);
1911  EIGEN_UNUSED_VARIABLE(offset);
1912  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1913  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1914  Index count = 0;
1915 
1916 // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1917 // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1918 // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1919 
1920  int pack = Pack1;
1921  Index i = 0;
1922  while(pack>0)
1923  {
1924  Index remaining_rows = rows-i;
1925  Index peeled_mc = i+(remaining_rows/pack)*pack;
1926  for(; i<peeled_mc; i+=pack)
1927  {
1928  if(PanelMode) count += pack * offset;
1929 
1930  const Index peeled_k = (depth/PacketSize)*PacketSize;
1931  Index k=0;
1932  if(pack>=PacketSize)
1933  {
1934  for(; k<peeled_k; k+=PacketSize)
1935  {
1936  for (Index m = 0; m < pack; m += PacketSize)
1937  {
1938  PacketBlock<Packet> kernel;
1939  for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1940  ptranspose(kernel);
1941  for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
1942  }
1943  count += PacketSize*pack;
1944  }
1945  }
1946  for(; k<depth; k++)
1947  {
1948  Index w=0;
1949  for(; w<pack-3; w+=4)
1950  {
1951  Scalar a(cj(lhs(i+w+0, k))),
1952  b(cj(lhs(i+w+1, k))),
1953  c(cj(lhs(i+w+2, k))),
1954  d(cj(lhs(i+w+3, k)));
1955  blockA[count++] = a;
1956  blockA[count++] = b;
1957  blockA[count++] = c;
1958  blockA[count++] = d;
1959  }
1960  if(pack%4)
1961  for(;w<pack;++w)
1962  blockA[count++] = cj(lhs(i+w, k));
1963  }
1964 
1965  if(PanelMode) count += pack * (stride-offset-depth);
1966  }
1967 
1968  pack -= PacketSize;
1969  if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1970  pack = Pack2;
1971  }
1972 
1973  for(; i<rows; i++)
1974  {
1975  if(PanelMode) count += offset;
1976  for(Index k=0; k<depth; k++)
1977  blockA[count++] = cj(lhs(i, k));
1978  if(PanelMode) count += (stride-offset-depth);
1979  }
1980 }
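// Illustrative note (for exposition only): for a row-major lhs the packed layout must
// match the column-major case above, so the inner loop gathers a PacketSize x PacketSize
// tile and transposes it in registers via ptranspose. A scalar equivalent of one such
// tile, assuming pack == PacketSize and a tile starting at (i, k), is:
//
//   for(Index kk=0; kk<PacketSize; ++kk)     // columns of the tile
//     for(Index r=0; r<PacketSize; ++r)      // rows of the tile
//       blockA[count + kk*pack + r] = cj(lhs(i+r, k+kk));
//   count += PacketSize*pack;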
1981 
1982 // copy a complete panel of the rhs
1983 // this version is optimized for column major matrices
1984 // The traversal order is as follows (nr==4):
1985 // 0 1 2 3 12 13 14 15 24 27
1986 // 4 5 6 7 16 17 18 19 25 28
1987 // 8 9 10 11 20 21 22 23 26 29
1988 // . . . . . . . . . .
1989 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1990 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1991 {
1992  typedef typename packet_traits<Scalar>::type Packet;
1993  typedef typename DataMapper::LinearMapper LinearMapper;
1994  enum { PacketSize = packet_traits<Scalar>::size };
1995  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
1996 };
1997 
1998 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1999 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2000  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2001 {
2002  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2003  EIGEN_UNUSED_VARIABLE(stride);
2004  EIGEN_UNUSED_VARIABLE(offset);
2005  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2006  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2007  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2008  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2009  Index count = 0;
2010  const Index peeled_k = (depth/PacketSize)*PacketSize;
2011 // if(nr>=8)
2012 // {
2013 // for(Index j2=0; j2<packet_cols8; j2+=8)
2014 // {
2015 // // skip what we have before
2016 // if(PanelMode) count += 8 * offset;
2017 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2018 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2019 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2020 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2021 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2022 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2023 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2024 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2025 // Index k=0;
2026 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2027 // {
2028 // for(; k<peeled_k; k+=PacketSize) {
2029 // PacketBlock<Packet> kernel;
2030 // for (int p = 0; p < PacketSize; ++p) {
2031 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2032 // }
2033 // ptranspose(kernel);
2034 // for (int p = 0; p < PacketSize; ++p) {
2035 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2036 // count+=PacketSize;
2037 // }
2038 // }
2039 // }
2040 // for(; k<depth; k++)
2041 // {
2042 // blockB[count+0] = cj(b0[k]);
2043 // blockB[count+1] = cj(b1[k]);
2044 // blockB[count+2] = cj(b2[k]);
2045 // blockB[count+3] = cj(b3[k]);
2046 // blockB[count+4] = cj(b4[k]);
2047 // blockB[count+5] = cj(b5[k]);
2048 // blockB[count+6] = cj(b6[k]);
2049 // blockB[count+7] = cj(b7[k]);
2050 // count += 8;
2051 // }
2052 // // skip what we have after
2053 // if(PanelMode) count += 8 * (stride-offset-depth);
2054 // }
2055 // }
2056 
2057  if(nr>=4)
2058  {
2059  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2060  {
2061  // skip what we have before
2062  if(PanelMode) count += 4 * offset;
2063  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2064  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2065  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2066  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2067 
2068  Index k=0;
2069  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2070  {
2071  for(; k<peeled_k; k+=PacketSize) {
2072  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2073  kernel.packet[0] = dm0.loadPacket(k);
2074  kernel.packet[1%PacketSize] = dm1.loadPacket(k);
2075  kernel.packet[2%PacketSize] = dm2.loadPacket(k);
2076  kernel.packet[3%PacketSize] = dm3.loadPacket(k);
2077  ptranspose(kernel);
2078  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2079  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2080  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2081  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2082  count+=4*PacketSize;
2083  }
2084  }
2085  for(; k<depth; k++)
2086  {
2087  blockB[count+0] = cj(dm0(k));
2088  blockB[count+1] = cj(dm1(k));
2089  blockB[count+2] = cj(dm2(k));
2090  blockB[count+3] = cj(dm3(k));
2091  count += 4;
2092  }
2093  // skip what we have after
2094  if(PanelMode) count += 4 * (stride-offset-depth);
2095  }
2096  }
2097 
2098  // copy the remaining columns one at a time (nr==1)
2099  for(Index j2=packet_cols4; j2<cols; ++j2)
2100  {
2101  if(PanelMode) count += offset;
2102  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2103  for(Index k=0; k<depth; k++)
2104  {
2105  blockB[count] = cj(dm0(k));
2106  count += 1;
2107  }
2108  if(PanelMode) count += (stride-offset-depth);
2109  }
2110 }
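// Illustrative sketch (for exposition only): ignoring the vectorized transposition fast
// path, the nr==4 loop above packs four rhs columns interleaved row by row, which is the
// traversal shown in the diagram before gemm_pack_rhs:
//
//   for(Index k=0; k<depth; ++k)
//     for(Index c=0; c<4; ++c)
//       blockB[count++] = cj(rhs(k, j2+c));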
2111 
2112 // this version is optimized for row major matrices
2113 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2114 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2115 {
2116  typedef typename packet_traits<Scalar>::type Packet;
2117  typedef typename DataMapper::LinearMapper LinearMapper;
2118  enum { PacketSize = packet_traits<Scalar>::size };
2119  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2120 };
2121 
2122 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2123 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2124  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2125 {
2126  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2127  EIGEN_UNUSED_VARIABLE(stride);
2128  EIGEN_UNUSED_VARIABLE(offset);
2129  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2130  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2131  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2132  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2133  Index count = 0;
2134 
2135 // if(nr>=8)
2136 // {
2137 // for(Index j2=0; j2<packet_cols8; j2+=8)
2138 // {
2139 // // skip what we have before
2140 // if(PanelMode) count += 8 * offset;
2141 // for(Index k=0; k<depth; k++)
2142 // {
2143 // if (PacketSize==8) {
2144 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2145 // pstoreu(blockB+count, cj.pconj(A));
2146 // } else if (PacketSize==4) {
2147 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2148 // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2149 // pstoreu(blockB+count, cj.pconj(A));
2150 // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2151 // } else {
2152 // const Scalar* b0 = &rhs[k*rhsStride + j2];
2153 // blockB[count+0] = cj(b0[0]);
2154 // blockB[count+1] = cj(b0[1]);
2155 // blockB[count+2] = cj(b0[2]);
2156 // blockB[count+3] = cj(b0[3]);
2157 // blockB[count+4] = cj(b0[4]);
2158 // blockB[count+5] = cj(b0[5]);
2159 // blockB[count+6] = cj(b0[6]);
2160 // blockB[count+7] = cj(b0[7]);
2161 // }
2162 // count += 8;
2163 // }
2164 // // skip what we have after
2165 // if(PanelMode) count += 8 * (stride-offset-depth);
2166 // }
2167 // }
2168  if(nr>=4)
2169  {
2170  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2171  {
2172  // skip what we have before
2173  if(PanelMode) count += 4 * offset;
2174  for(Index k=0; k<depth; k++)
2175  {
2176  if (PacketSize==4) {
2177  Packet A = rhs.loadPacket(k, j2);
2178  pstoreu(blockB+count, cj.pconj(A));
2179  count += PacketSize;
2180  } else {
2181  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2182  blockB[count+0] = cj(dm0(0));
2183  blockB[count+1] = cj(dm0(1));
2184  blockB[count+2] = cj(dm0(2));
2185  blockB[count+3] = cj(dm0(3));
2186  count += 4;
2187  }
2188  }
2189  // skip what we have after
2190  if(PanelMode) count += 4 * (stride-offset-depth);
2191  }
2192  }
2193  // copy the remaining columns one at a time (nr==1)
2194  for(Index j2=packet_cols4; j2<cols; ++j2)
2195  {
2196  if(PanelMode) count += offset;
2197  for(Index k=0; k<depth; k++)
2198  {
2199  blockB[count] = cj(rhs(k, j2));
2200  count += 1;
2201  }
2202  if(PanelMode) count += stride-offset-depth;
2203  }
2204 }
2205 
2206 } // end namespace internal
2207 
2210 inline std::ptrdiff_t l1CacheSize()
2211 {
2212  std::ptrdiff_t l1, l2, l3;
2213  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2214  return l1;
2215 }
2216 
2219 inline std::ptrdiff_t l2CacheSize()
2220 {
2221  std::ptrdiff_t l1, l2, l3;
2222  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2223  return l2;
2224 }
2225 
2231 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2232 {
2233  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2234 }
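// Example usage of the cache-size API above (illustrative only):
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();   // bytes, as detected or previously set
//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
//   // Override the values used to compute the GEMM blocking sizes (all in bytes):
//   Eigen::setCpuCacheSizes(32*1024, 256*1024, 4*1024*1024);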
2235 
2236 } // end namespace Eigen
2237 
2238 #endif // EIGEN_GENERAL_BLOCK_PANEL_H