ViennaCL - The Vienna Computing Library  1.5.2
matrix_product.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
2 #define VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
27 #include <vector>
28 
30 
34 
35 #include "viennacl/forwards.h"
36 
37 #include "viennacl/tools/tools.hpp"
38 
39 namespace viennacl{
40 
41  namespace generator{
42 
44  class matrix_product : public profile_base{
45 
46  enum access_flow{
47  REGULAR,
48  STRIDED
49  };
50 
51  bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
52 
53  vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
54  vcl_size_t lmem_used = 0;
55  if(use_lhs_shared_)
56  lmem_used += (ml_ + 1) * (cache_width_ + 1) * scalartype_size;
57  if(use_rhs_shared_)
58  lmem_used += (cache_width_ + 1) * (nl_ + 1) * scalartype_size;
59  return lmem_used;
60  }
61 
62  virtual void print(std::ostream & s) const{
63  s << "{vector_type, local_size1, cache_width, local_size2, ms, ks, ns, use_lhs_shared, use_rhs_shared} = {"
64  << vector_size_ << ","
65  << local_size1_ << ", "
66  << cache_width_ << ", "
67  << local_size2_ << ", "
68  << ms_ << ", "
69  << ks_ << ", "
70  << ns_ << ", "
71  << use_lhs_shared_ << ", " << use_rhs_shared_ << "}" ;
72  }
73 
74 
75  bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const{
76  static const unsigned int alignment = 128;
77  return ml_ > alignment
78  || cache_width_ > alignment
79  || nl_ > alignment
80  || ml_ < ms_
81  || cache_width_ < ks_
82  || nl_ < ns_
83  || (ms_ % vector_size_) > 0
84  || (ks_ % vector_size_) > 0
85  || (ns_ % vector_size_) > 0;
86  }
87 
88  public:
90  matrix_product(unsigned int vectorization
91  , vcl_size_t local_size1, vcl_size_t cache_width, vcl_size_t local_size2
92  , unsigned int ms, unsigned int ks, unsigned int ns
93  , bool use_lhs_shared, bool use_rhs_shared) : profile_base(vectorization,local_size1, local_size2,1){
94  local_size1_ = local_size1;
95  local_size2_ = local_size2;
96  cache_width_=cache_width;
97  ml_= ms*local_size1;
98  nl_=ns*local_size2;
99  ms_ = ms;
100  ks_=ks;
101  ns_=ns;
102  use_lhs_shared_ = use_lhs_shared;
103  use_rhs_shared_ = use_rhs_shared;
104  }
105 
106  static std::string csv_format() {
107  return "Vec,LSize1,CacheWidth,LSize2,mS,kS,nS,NumGroups";
108  }
109 
110  std::string csv_representation() const{
111  std::ostringstream oss;
112  oss << vector_size_
113  << "," << local_size1_
114  << "," << cache_width_
115  << "," << local_size2_
116  << "," << ms_
117  << "," << ks_
118  << "," << ns_
119  << "," << use_lhs_shared_
120  << "," << use_rhs_shared_;
121  return oss.str();
122  }
123 
124  void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const {
125  //set M, N
126  scheduler::statement_node const & first_node = statements.front().second;
127  vcl_size_t M = utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun());
128  vcl_size_t N = utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun());
129 
130  //set ND range
131  configure_local_sizes(k, kernel_id);
132  k.global_work_size(0, M/ms_);
133  k.global_work_size(1, N/ns_);
134 
135  //set arguments
136  //M,N
137  k.arg(n_arg++, cl_uint(M));
138  k.arg(n_arg++, cl_uint(N));
139 
140  //K
141  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
142  scheduler::statement::container_type exprs = it->first.array();
143  for(scheduler::statement::container_type::iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
145  scheduler::statement_node const * current_node = &(*iit);
146  //The LHS of the prod is a matrix
148  {
149  k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
150  }
151  else{
152  //The LHS of the prod is a matrix expression
153  current_node = &exprs[current_node->lhs.node_index];
155  {
157  k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
158  else
159  k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
160  }
161  else{
162  assert(false && bool("unexpected expression tree"));
163  }
164  }
165  return;
166  }
167  }
168  }
169 
170  }
171 
172  static std::string size1() { return "M"; }
173  static std::string size2() { return "K"; }
174  static std::string size3() { return "N"; }
175 
176  void kernel_arguments(statements_type const & /*statements*/, std::string & arguments_string) const{
177  arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
178  arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
179  arguments_string += detail::generate_value_kernel_argument("unsigned int", "K");
180  }
181 
182  private:
183 
184  void transform_block(detail::mapped_matrix const & /*mat_infos*/, bool store_shared
185  , unsigned int & large_block_1, unsigned int & large_block_2
186  , unsigned int & small_block_1, unsigned int & small_block_2
187  , access_flow flow) const {
188  if(flow==REGULAR){
189  large_block_2/=vector_size_;
190  if(!store_shared)
191  small_block_2/=vector_size_;
192  }
193  else{
194  large_block_1/=vector_size_;
195  if(!store_shared)
196  small_block_1/=vector_size_;
197  }
198  }
199 
200 
201  std::string helper_variable(utils::kernel_generation_stream & stream
202  , bool store_in_register
203  , std::string const & type
204  , std::string const & name
205  , std::string const & expr) const {
206  if(!store_in_register)
207  return expr;
208  stream << type << " " << name << " = " << expr << ";" << std::endl;
209  return name;
210  }
211 
212  void fetch_element_to_local_mem(utils::kernel_generation_stream & stream,
213  std::string const & lmem_name,
214  vcl_size_t lmem_size2,
215  std::string const & global_ptr,
216  detail::mapped_matrix const & mat,
217  access_flow flow,
218  std::string const & i,
219  std::string const & j) const {
220 
221  if(flow==REGULAR){
222  stream << "val = *(" << global_ptr << " + " << j << " + " << mat.size2() << "*" << i << ");" << std::endl;
223  for(unsigned int a = 0 ; a < vector_size_ ; ++a)
224  if(vector_size_>1)
225  stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_<<" + " << a << "] = val.s" << a << ";" <<std::endl;
226  else
227  stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_ << "] = val" << ";" <<std::endl;
228  }
229  else{
230  stream << "val = *(" << global_ptr << "+ " << j << "*" << mat.size1() << " + " << i << ");" << std::endl;
231  for(unsigned int a = 0 ; a < vector_size_ ; ++a)
232  if(vector_size_>1)
233  stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << " + " << a*lmem_size2 << "] = val.s" << a << ";" <<std::endl;
234  else
235  stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << "] = val" << ";" <<std::endl;
236  }
237  }
238  void fetch_to_local_mem(utils::kernel_generation_stream & stream,
239  std::string const & lmem_name,
240  vcl_size_t lmem_size2,
241  std::string const & global_ptr,
242  unsigned int bound1,
243  unsigned int bound2,
244  detail::mapped_matrix const & mat,
245  access_flow flow) const {
246  std::string aligned_scalartype = mat.scalartype();
247  if(vector_size_ > 1)
248  aligned_scalartype+=utils::to_string(vector_size_);
249  stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
250  stream << "{" << std::endl;
251  stream << aligned_scalartype << " val;" << std::endl;
252  //Can unroll
253  if(bound2%local_size2_==0 && bound1%local_size1_==0){
254  for(unsigned int j = 0 ; j < bound2 ; j+=static_cast<unsigned int>(local_size2_)){
255  for(unsigned int i = 0 ; i < bound1 ; i+=static_cast<unsigned int>(local_size1_)){
256  std::string indi = "(get_local_id(0) + " + utils::to_string(i)+")";
257  std::string indj = "(get_local_id(1) + " + utils::to_string(j)+")";
258  fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,indi,indj);
259  }
260  }
261  }
262  else{
263  stream << "for(unsigned int j = get_local_id(1)" << " ; j < " << bound2 << "; j+= " << local_size2_ << "){" << std::endl;
264  stream.inc_tab();
265  stream << "for(unsigned int i = get_local_id(0)" << " ; i < " << bound1 << "; i+= " << local_size1_ << "){" << std::endl;
266  stream.inc_tab();
267  fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,"i","j");
268  stream.dec_tab();
269  stream << "}" << std::endl;
270  stream.dec_tab();
271  stream << "}" << std::endl;
272 
273  }
274  stream << "}" << std::endl;
275  stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
276 
277  }
278 
279  void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
280 
284 
285  detail::mapped_matrix const * assigned = static_cast<detail::mapped_matrix const *>(at(mapping.at(0), std::make_pair(&statements.front().second,detail::LHS_NODE_TYPE)).get());
286  detail::mapped_matrix_product* prod = NULL;
287  detail::mapped_matrix const * lhs = NULL;
288  detail::mapped_matrix const * rhs = NULL;
289 
290  bool is_lhs_transposed = false;
291  bool is_rhs_transposed = false;
292 
293  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
294  scheduler::statement::container_type const & exprs = it->first.array();
295  vcl_size_t i = std::distance(statements.begin(), it);
296  for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
298  prod = (detail::mapped_matrix_product *)at(mapping.at(i), std::make_pair(&(*iit), detail::PARENT_NODE_TYPE)).get();
299  if(iit->lhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
300  is_lhs_transposed = true;
301  lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->lhs.node_index],detail::LHS_NODE_TYPE)).get();
302  }
303  else{
304  is_lhs_transposed = false;
305  lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit), detail::LHS_NODE_TYPE)).get();
306  }
307 
308  if(iit->rhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
309  is_rhs_transposed = true;
310  rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->rhs.node_index], detail::LHS_NODE_TYPE)).get();
311  }
312  else{
313  is_rhs_transposed = false;
314  rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit),detail::RHS_NODE_TYPE)).get();
315  }
316 
317  }
318  }
319  }
320 
321  if(vector_size_>1){
322  std::string StrV = "/"+utils::to_string(vector_size_) ;
323 
324  for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
325  if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
326  if(p->is_row_major())
327  p->bind_sizes("M", "N"+StrV);
328  else
329  p->bind_sizes("M"+StrV, "N");
330  }
331  }
332 
333  if(lhs->is_row_major())
334  if(is_lhs_transposed)
335  lhs->bind_sizes("M"+StrV, "K");
336  else
337  lhs->bind_sizes("M", "K"+StrV);
338  else
339  if(is_lhs_transposed)
340  lhs->bind_sizes("M", "K"+StrV);
341  else
342  lhs->bind_sizes("M"+StrV, "K");
343 
344 
345  if(rhs->is_row_major())
346  if(is_rhs_transposed)
347  rhs->bind_sizes("K"+StrV, "N");
348  else
349  rhs->bind_sizes("K", "N"+StrV);
350  else
351  if(is_rhs_transposed)
352  rhs->bind_sizes("K", "N"+StrV);
353  else
354  rhs->bind_sizes("K"+StrV, "N");
355 
356 
357  }
358  else{
359  for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
360  if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
361  p->bind_sizes("M", "N");
362  }
363  }
364 
365  lhs->bind_sizes("M", "K");
366  rhs->bind_sizes("K", "N");
367  }
368 
369 
370 
371  std::string aligned_scalartype = assigned->scalartype();
372  if(vector_size_ > 1)
373  aligned_scalartype+=utils::to_string(vector_size_);
374 
375 
376  access_flow result_access_flow;
377  if(assigned->is_row_major())
378  result_access_flow = REGULAR;
379  else
380  result_access_flow = STRIDED;
381 
382  access_flow lhs_access_flow;
383  if((lhs->is_row_major() && !is_lhs_transposed)
384  ||(!lhs->is_row_major() && is_lhs_transposed))
385  lhs_access_flow = REGULAR;
386  else
387  lhs_access_flow = STRIDED;
388 
389  access_flow rhs_access_flow;
390  if((rhs->is_row_major() && !is_rhs_transposed)
391  ||(!rhs->is_row_major() && is_rhs_transposed))
392  rhs_access_flow = REGULAR;
393  else
394  rhs_access_flow = STRIDED;
395 
396 
397  std::string lhs_value_scalartype;
398  if(use_lhs_shared_)
399  lhs_value_scalartype = lhs->scalartype();
400  else
401  lhs_value_scalartype = aligned_scalartype;
402 
403  std::string rhs_value_scalartype;
404  if(use_rhs_shared_)
405  rhs_value_scalartype = rhs->scalartype();
406  else
407  rhs_value_scalartype = aligned_scalartype;
408 
409 
410  unsigned int ml_res = static_cast<unsigned int>(ml_), nl_res = static_cast<unsigned int>(nl_), ms_res = static_cast<unsigned int>(ms_), ns_res = static_cast<unsigned int>(ns_);
411  unsigned int ml_lhs = static_cast<unsigned int>(ml_), cache_width_lhs = static_cast<unsigned int>(cache_width_), ms_lhs = static_cast<unsigned int>(ms_), ks_lhs = static_cast<unsigned int>(ks_);
412  unsigned int cache_width_rhs = static_cast<unsigned int>(cache_width_), nl_rhs = static_cast<unsigned int>(nl_), ks_rhs = static_cast<unsigned int>(ks_), ns_rhs = static_cast<unsigned int>(ns_);
413 
414  transform_block(*assigned,false,ml_res,nl_res,ms_res,ns_res,result_access_flow);
415  transform_block(*lhs,use_lhs_shared_,ml_lhs,cache_width_lhs,ms_lhs,ks_lhs,lhs_access_flow);
416  transform_block(*rhs,use_rhs_shared_,cache_width_rhs,nl_rhs,ks_rhs,ns_rhs,rhs_access_flow);
417 
421 
422 
423  vcl_size_t local_lhs_size1 = ml_ ;
424  vcl_size_t local_lhs_size2 = cache_width_ + 1;
425 
426  vcl_size_t local_rhs_size1 = cache_width_;
427  vcl_size_t local_rhs_size2 = nl_ + 1;
428 
430  for(unsigned int m=0; m< ms_res; ++m)
431  for(unsigned int n=0; n < ns_res ; ++n)
432  stream << aligned_scalartype << " " << "res" << m << "_" << n << " = (" << aligned_scalartype << ")(0) ;" << std::endl;
433 
435  if(use_lhs_shared_)
436  stream << "__local " << lhs->scalartype() << " lhs_buf[" << local_lhs_size1*local_lhs_size2 << "]" << ";" << std::endl;
437  if(use_rhs_shared_)
438  stream << "__local " << rhs->scalartype() << " rhs_buf[" << local_rhs_size1*local_rhs_size2 << "]" << ";" << std::endl;
439 
441  //stream << "__global " << aligned_scalartype << "* res_ptr = " << assigned->name() << " + " << assigned->offset(std::make_pair("get_global_id(0)*" + utils::to_string(ms_res), "get_global_id(1)*" + utils::to_string(ns_res))) << ";" << std::endl;
442 
443 
445  if(use_lhs_shared_){
446  std::string i = "get_group_id(0)*" + utils::to_string(ml_lhs);
447  stream << "__global " << aligned_scalartype << "* global_lhs_ptr = " << lhs->name() << " + ";
448  if(lhs_access_flow==REGULAR)
449  stream << "(" << i << ")" << "*" << lhs->size2();
450  else
451  stream << i;
452  stream << ";" << std::endl;
453  }
454 
456  else{
457  if(lhs_access_flow==REGULAR)
458  for(unsigned int m=0; m<ms_lhs; ++m)
459  stream << "__global " << aligned_scalartype << "* " << "lhs_ptr_" << m << " = " << lhs->name() << " + "
460  << lhs->size2() << "* ("
461  << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << "+" << m
462  << " );" << std::endl;
463  else
464  for(unsigned int k=0; k<ks_lhs; ++k)
465  stream << "__global " << aligned_scalartype<< "* " << "lhs_ptr_" << k << " = " << lhs->name() << " + "
466  << "(" << lhs->size1() << ")*" << k
467  << "+ " << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << ";" << std::endl;
468  }
469 
471  if(use_rhs_shared_){
472  std::string j = "get_group_id(1)*" + utils::to_string(nl_rhs);
473  stream << "__global " << aligned_scalartype << "* global_rhs_ptr = " << rhs->name() << " + ";
474  if(rhs_access_flow==REGULAR)
475  stream << j;
476  else
477  stream << "(" << j << ")" << "*" << rhs->size1();
478  stream << ";" << std::endl;
479  }
480 
482  else{
483  if(rhs_access_flow==REGULAR)
484  for(unsigned int k = 0 ; k < ks_rhs ; ++k)
485  stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << k << " = " << rhs->name() << " + "
486  << "(" << k << ")" << "*" << rhs->size2()
487  << " + " << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs
488  << ";" << std::endl;
489  else
490  for(unsigned int n = 0 ; n < ns_rhs ; ++n)
491  stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << n << " = " << rhs->name() << " + "
492  << "(" << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs << " + " << n << ")" << "*" << rhs->size1()
493  << ";" << std::endl;
494  }
495 
496 
498  std::string block_num = helper_variable(stream,false,"unsigned int", "block_num", "K/" + utils::to_string(cache_width_));
499  stream << "for(unsigned int bl=0 ; bl<" << block_num << " ; ++bl){" << std::endl;
500  stream.inc_tab();
501 
503  if(use_lhs_shared_){
504  fetch_to_local_mem(stream,"lhs_buf",local_lhs_size2,"global_lhs_ptr",ml_lhs,cache_width_lhs,*lhs,lhs_access_flow);
505  for(unsigned int m=0; m<ms_lhs; ++m)
506  stream << "__local " << lhs_value_scalartype << "* lhs_ptr_" << m << " = lhs_buf + "
507  << "(" << "get_local_id(0)*" << ms_lhs << "+" << m << ")" << "*" << local_lhs_size2
508  << ";" << std::endl;
509  }
510 
512  if(use_rhs_shared_){
513  fetch_to_local_mem(stream,"rhs_buf", local_rhs_size2, "global_rhs_ptr",cache_width_rhs,nl_rhs,*rhs,rhs_access_flow);
514  for(unsigned int k=0; k<ks_rhs; ++k)
515  stream << "__local " << rhs_value_scalartype << "* rhs_ptr_" << k << " = rhs_buf + "
516  << k*local_rhs_size2 << " + " << "get_local_id(1)*" << ns_rhs
517  << ";" << std::endl;
518  }
519 
520 
521  stream << " for(unsigned int bs=0 ; bs < " << cache_width_/ks_ << " ; ++bs){" << std::endl;
522  stream.inc_tab();
523 
524 
525  for(unsigned int k = 0 ; k < ks_rhs ; ++k){
526  for(unsigned int n=0 ; n < ns_rhs ; ++n){
527  stream << rhs_value_scalartype << " val_rhs_" << k << "_" << n << " = " ;
528  if(use_rhs_shared_ )
529  stream << "* rhs_ptr_" << k << "++";
530  else{
531  if(rhs_access_flow==REGULAR)
532  stream << "* rhs_ptr_" << k << "++";
533  else
534  stream << "* rhs_ptr_" << n << "++";
535  }
536  stream << ";";
537  stream << std::endl;
538  }
539  }
540 
541 
542  for(unsigned int k = 0 ; k < ks_lhs ; ++k){
543  for(unsigned int m=0 ; m < ms_lhs ; ++m){
544  stream << lhs_value_scalartype << " " << "val_lhs_" << m << "_" << k << " = ";
545  if(use_lhs_shared_)
546  stream << "* lhs_ptr_" << m << "++" ;
547  else if(lhs_access_flow==REGULAR)
548  stream << "* lhs_ptr_" << m << "++";
549  else
550  stream << "* lhs_ptr_" << k << "++";
551  stream << ";";
552  stream << std::endl;
553  }
554  }
555 
556 
557  for(unsigned int n=0 ; n < ns_res ; ++n){
558  for(unsigned int k = 0 ; k < ks_ ; ++k){
559  for(unsigned int m=0 ; m < ms_res ; ++m){
560  for(unsigned int a = 0; a<vector_size_; ++a){
561 
562  int ind_lhs_1 = m;
563  int ind_lhs_2 = k;
564  int ind_s_lhs = a;
565 
566  int ind_rhs_1=k;
567  int ind_rhs_2=n;
568  int ind_s_rhs=a;
569 
570  if(result_access_flow==REGULAR){
571  if(!use_lhs_shared_){
572  if(lhs_access_flow==REGULAR){
573  ind_s_lhs = ind_lhs_2%vector_size_;
574  ind_lhs_2 /= vector_size_;
575  }
576  else{
577  ind_s_lhs = ind_lhs_1%vector_size_;
578  ind_lhs_1 /= vector_size_;
579  }
580  }
581  }
582  else{
583  if(use_lhs_shared_){
584  ind_lhs_1 = ind_lhs_1*vector_size_+a;
585  }
586  else{
587  if(lhs_access_flow==REGULAR){
588  ind_lhs_1 = ind_lhs_1*vector_size_+a;
589  ind_s_lhs = ind_lhs_2%vector_size_;
590  ind_lhs_2 /= vector_size_;
591  }
592  }
593  }
594 
595  if(result_access_flow==REGULAR){
596  if(use_rhs_shared_){
597  ind_rhs_2 = ind_rhs_2*vector_size_+a;
598  }
599  else{
600  if(rhs_access_flow==STRIDED){
601  ind_rhs_2 = ind_rhs_2*vector_size_+a;
602  ind_s_rhs = ind_rhs_1%vector_size_;
603  ind_rhs_1 = ind_rhs_1/vector_size_;
604  }
605  else{
606  }
607  }
608  }
609  else{
610  if(!use_rhs_shared_){
611  if(rhs_access_flow==REGULAR){
612  ind_s_rhs = ind_rhs_2%vector_size_;
613  ind_rhs_2/=vector_size_;
614  }
615  else{
616  ind_s_rhs = ind_rhs_1%vector_size_;
617  ind_rhs_1/=vector_size_;
618  }
619  }
620  }
621 
622  std::ostringstream res_oss;
623  std::ostringstream lhs_oss;
624  std::ostringstream rhs_oss;
625 
626  res_oss << "res" << m << "_" << n ;
627  if(vector_size_>1) res_oss << ".s" << a;
628 
629  lhs_oss << "val_lhs_" << ind_lhs_1 << "_" << ind_lhs_2;
630  if(!use_lhs_shared_ && vector_size_>1) lhs_oss << ".s" << ind_s_lhs;
631 
632 
633  rhs_oss << "val_rhs_" << ind_rhs_1 << "_" << ind_rhs_2;
634  if(!use_rhs_shared_ && vector_size_>1) rhs_oss << ".s" << ind_s_rhs;
635 
636 
637  stream << res_oss.str() << "+=" << lhs_oss.str() << "*" << rhs_oss.str() << ";" << std::endl;
638  }
639  }
640  }
641  }
642 
643 
644  if(use_rhs_shared_){
645  for(unsigned int k=0 ; k<ks_ ; ++k)
646  stream << "rhs_ptr_" << k << " += " << ks_rhs*local_rhs_size2 - ns_rhs << ";" << std::endl;
647  }
648  else{
649  if(rhs_access_flow==REGULAR)
650  for(unsigned int k=0 ; k<ks_ ; ++k)
651  stream << "rhs_ptr_" << k << " += " << ks_rhs << "*" << rhs->size2() << " - " << ns_rhs << ";" << std::endl;
652  }
653 
654  if(!use_lhs_shared_){
655  if(lhs_access_flow==STRIDED)
656  for(unsigned int k=0 ; k<ks_lhs ; ++k)
657  stream << "lhs_ptr_" << k << " += " << ks_lhs << "*" << lhs->size1() << " - " << ms_lhs << ";" << std::endl;
658  }
659 
660 
661 
662  stream.dec_tab();
663  stream << "}" << std::endl;
664 
665  if(use_lhs_shared_){
666  if(lhs_access_flow==REGULAR)
667  stream << "global_lhs_ptr += " << cache_width_lhs << ";" << std::endl;
668  else
669  stream << "global_lhs_ptr += " << cache_width_lhs << "*" << lhs->size1() << ";" << std::endl;
670  }
671 
672  if(use_rhs_shared_){
673  if(rhs_access_flow==REGULAR)
674  stream << "global_rhs_ptr += " << cache_width_rhs << "*" << rhs->size2() << ";" << std::endl;
675  else
676  stream << "global_rhs_ptr += " << cache_width_rhs << ";" << std::endl;
677  }
678 
679  stream.dec_tab();
680  stream << "}" << std::endl;
681 
682  for(unsigned int m=0 ; m < ms_res ; ++m){
683  for(unsigned int n=0 ; n < ns_res ; ++n){
684  std::string i = "get_global_id(0)*" + utils::to_string(ms_res) + "+" + utils::to_string(m);
685  std::string j = "get_global_id(1)*" + utils::to_string(ns_res) + "+" + utils::to_string(n);
686  prod->access_name("res"+utils::to_string(m)+"_"+utils::to_string(n));
687  std::string str;
688  detail::traverse(statements.front().first, statements.front().second, detail::expression_generation_traversal(std::make_pair(i, j), -1, str, mapping[0]), false);
689  stream << str << ";" << std::endl;
690  }
691  }
692 
693 
694  }
695 
696  private:
697  vcl_size_t local_size1_;
698  vcl_size_t local_size2_;
699  vcl_size_t cache_width_;
700 
701  vcl_size_t ml_;
702  vcl_size_t nl_;
703 
704  vcl_size_t ms_;
705  vcl_size_t ks_;
706  vcl_size_t ns_;
707 
708  bool use_lhs_shared_;
709  bool use_rhs_shared_;
710  };
711 
712  }
713 
714 }
715 
716 #endif
static std::string csv_format()
Definition: matrix_product.hpp:106
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:124
std::size_t vcl_size_t
Definition: forwards.h:58
void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const &statements, viennacl::ocl::kernel &k, unsigned int &n_arg) const
Configures the range and enqueues the arguments associated with the profile.
Definition: matrix_product.hpp:124
Kernel generation class for matrix-matrix products.
Definition: matrix_product.hpp:44
static std::string size2()
Definition: matrix_product.hpp:173
vcl_size_t node_index
Definition: forwards.h:276
Internal utils for a dynamic OpenCL kernel generation.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:59
Base class for an operation profile.
Definition: profile_base.hpp:47
lhs_rhs_element lhs
Definition: forwards.h:422
Various little tools used here and there in ViennaCL.
Definition: forwards.h:176
static std::string size1()
Definition: matrix_product.hpp:172
void prod(const T1 &A, bool transposed_A, const T2 &B, bool transposed_B, T3 &C, ScalarType alpha, ScalarType beta)
Definition: matrix_operations.hpp:2305
void kernel_arguments(statements_type const &, std::string &arguments_string) const
Definition: matrix_product.hpp:176
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
Mapping of a matrix to a generator class.
Definition: mapped_objects.hpp:236
This file provides the forward declarations for the main types used within ViennaCL.
Functor for obtaining the internal number of columns of a ViennaCL matrix.
Definition: utils.hpp:188
std::list< std::pair< scheduler::statement, scheduler::statement_node > > statements_type
Definition: profile_base.hpp:49
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>
Definition: forwards.h:97
matrix_product(unsigned int vectorization, vcl_size_t local_size1, vcl_size_t cache_width, vcl_size_t local_size2, unsigned int ms, unsigned int ks, unsigned int ns, bool use_lhs_shared, bool use_rhs_shared)
The user constructor.
Definition: matrix_product.hpp:90
Base classes for the profiles.
Map ViennaCL objects to generator wrappers.
static std::string size3()
Definition: matrix_product.hpp:174
Functor for obtaining the internal number of rows of a ViennaCL matrix.
Definition: utils.hpp:181
void configure_local_sizes(viennacl::ocl::kernel &k, vcl_size_t) const
Definition: profile_base.hpp:59
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
std::vector< value_type > container_type
Definition: forwards.h:452
unsigned int vector_size_
Definition: profile_base.hpp:178
std::string csv_representation() const
csv representation of an operation
Definition: matrix_product.hpp:110
std::string to_string(T const t)
Definition: utils.hpp:204
statement_node_type_family type_family
Definition: forwards.h:269
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:759
op_element op
Definition: forwards.h:423
Main datastructure for an node in the statement tree.
Definition: forwards.h:420
operation_node_type type
Definition: forwards.h:416