ViennaCL - The Vienna Computing Library  1.5.2
generate.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_GENERATOR_GENERATE_HPP
2 #define VIENNACL_GENERATOR_GENERATE_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
26 #include <cstring>
27 #include <vector>
28 #include <typeinfo>
29 
32 
37 
38 #include "viennacl/tools/tools.hpp"
39 
40 namespace viennacl{
41 
42  namespace generator{
43 
48  public:
50  typedef std::pair<expression_type, vcl_size_t> forced_profile_key_type;
51  private:
52  typedef std::pair<expression_descriptor, generator::profile_base::statements_type> representation_node_type;
53  typedef std::vector<representation_node_type> statements_type;
54  typedef std::map<forced_profile_key_type, tools::shared_ptr<profile_base> > forced_profiles_type;
55 
60  static bool is_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
61  viennacl::scheduler::statement::container_type const & expr = statement.array();
64  else{
67  res = res || is_lhs_flow_transposed(statement, expr[root_node.lhs.node_index]);
69  res = res || is_lhs_flow_transposed(statement, expr[root_node.rhs.node_index]);
70  return res;
71  }
72  }
73 
75  static bool is_lhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
76  scheduler::statement::container_type const & expr = statement.array();
78  return is_flow_transposed(statement, expr[root_node.lhs.node_index]);
79  else
81  }
82 
84  static bool is_rhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
85  viennacl::scheduler::statement::container_type const & expr = statement.array();
87  return is_flow_transposed(statement, expr[root_node.rhs.node_index]);
88  else
90  }
91 
93  static void fill_expression_descriptor_scalar(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
94  viennacl::scheduler::statement::container_type const & expr = statement.array();
95  bool is_invalid = (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
96  || (descriptor.type_family==SCALAR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE);
97  if(is_invalid){
98  descriptor.type_family = INVALID_EXPRESSION_FAMILY;
99  descriptor.type = INVALID_EXPRESSION_TYPE;
100  }
102  descriptor.type_family = SCALAR_REDUCE_FAMILY;
103  descriptor.type = SCALAR_REDUCE_TYPE;
104  }
106  fill_expression_descriptor_scalar(statement, expr[root_node.lhs.node_index],descriptor);
108  fill_expression_descriptor_scalar(statement, expr[root_node.rhs.node_index],descriptor);
109  }
110 
112  static void fill_expression_descriptor_vector(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
113  viennacl::scheduler::statement::container_type const & expr = statement.array();
114  bool is_invalid = (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
116  || (descriptor.type_family==VECTOR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE);
117  if(is_invalid){
118  descriptor.type_family=INVALID_EXPRESSION_FAMILY;
119  descriptor.type=INVALID_EXPRESSION_TYPE;
120  }
122  descriptor.type_family=VECTOR_REDUCE_FAMILY;
123  if(is_lhs_flow_transposed(statement,root_node))
124  descriptor.type=VECTOR_REDUCE_Tx_TYPE;
125  else
126  descriptor.type=VECTOR_REDUCE_Nx_TYPE;
127  }
129  fill_expression_descriptor_vector(statement, expr[root_node.lhs.node_index],descriptor);
131  fill_expression_descriptor_vector(statement, expr[root_node.rhs.node_index],descriptor);
132  }
133 
135  static void fill_expression_descriptor_matrix(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
136  viennacl::scheduler::statement::container_type const & expr = statement.array();
137  bool is_invalid = (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
139  || (descriptor.type_family==MATRIX_PRODUCT_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE);
140  if(is_invalid){
141  descriptor.type_family=INVALID_EXPRESSION_FAMILY;
142  descriptor.type=INVALID_EXPRESSION_TYPE;
143  }
145  descriptor.type_family=MATRIX_PRODUCT_FAMILY;
146  bool lhs_trans = is_lhs_flow_transposed(statement,root_node);
147  bool rhs_trans = is_rhs_flow_transposed(statement,root_node);
148  if(!lhs_trans && !rhs_trans)
149  descriptor.type=MATRIX_PRODUCT_NN_TYPE;
150  else if(lhs_trans && !rhs_trans)
151  descriptor.type=MATRIX_PRODUCT_TN_TYPE;
152  else if(!lhs_trans && rhs_trans)
153  descriptor.type=MATRIX_PRODUCT_NT_TYPE;
154  else if(lhs_trans && rhs_trans)
155  descriptor.type=MATRIX_PRODUCT_TT_TYPE;
156 
157  }
159  fill_expression_descriptor_matrix(statement, expr[root_node.lhs.node_index],descriptor);
161  fill_expression_descriptor_matrix(statement, expr[root_node.rhs.node_index],descriptor);
162  }
163 
165  void fill_descriptor(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
167  descriptor.scalartype_size = utils::call_on_element(root_node.lhs, utils::scalartype_size_fun());
169  descriptor.type_family = VECTOR_SAXPY_FAMILY;
170  descriptor.type = VECTOR_SAXPY_TYPE;
171  fill_expression_descriptor_vector(statement,root_node,descriptor);
172  }
173  else if(lhs_family==viennacl::scheduler::MATRIX_TYPE_FAMILY){
174  descriptor.type_family = MATRIX_SAXPY_FAMILY;
175  descriptor.type = MATRIX_SAXPY_TYPE;
176  fill_expression_descriptor_matrix(statement,root_node,descriptor);
177  }
178  else if(lhs_family==viennacl::scheduler::SCALAR_TYPE_FAMILY){
179  descriptor.type_family = SCALAR_SAXPY_FAMILY;
180  descriptor.type = SCALAR_SAXPY_TYPE;
181  fill_expression_descriptor_scalar(statement,root_node,descriptor);
182  }
183  }
184 
189  template<class StatementsType>
190  void set_expression_arguments(profile_base const & profile, unsigned int device_offset, StatementsType const & statements, unsigned int & kernel_id, viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
191  for(vcl_size_t i = 0 ; i < profile.num_kernels() ; ++i){
192  //add kernel name
193  char str[32];
194  std::sprintf(str,"kernel_%d_%d",device_offset,kernel_id);
195  viennacl::ocl::kernel & kernel = p.get_kernel(str);
196  kernels.push_back(&kernel);
197  unsigned int current_arg = 0;
198  //Configure ND Range and enqueue arguments
199  profile.configure_range_enqueue_arguments(i, statements, kernel, current_arg);
200  std::set<void *> memory;
201  for(typename StatementsType::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
202  detail::traverse(it->first, it->second, detail::set_arguments_functor(memory,current_arg,kernel));
203  }
204  ++kernel_id;
205  }
206  }
207 
209  profile_base const & get_profile(viennacl::ocl::device const & device, expression_descriptor const & descriptor) const {
210  forced_profiles_type::const_iterator it = forced_profiles_.find(std::make_pair(descriptor.type, descriptor.scalartype_size));
211  if(it != forced_profiles_.end())
212  return *it->second;
213  return *profiles::get(device,descriptor);
214  }
215 
216  public:
217 
220  statements_.reserve(16);
221  }
222 
224  template<class T>
225  void force_profile(forced_profile_key_type key, T const & t){
226  forced_profiles_.insert(std::pair<forced_profile_key_type, tools::shared_ptr<profile_base> >(key, tools::shared_ptr<profile_base>(new T(t))));
227  }
228 
232  bool add(scheduler::statement const & statement, scheduler::statement_node const & root_node) {
233  expression_descriptor descriptor;
234  fill_descriptor(statement, root_node, descriptor);
235  if(descriptor.type_family==INVALID_EXPRESSION_FAMILY)
236  return false;
237  if(statements_.empty())
238  statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
239  else
240  if(statements_.back().first == descriptor)
241  statements_.back().second.push_back(std::make_pair(statement, root_node));
242  else
243  statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
244  return true;
245  }
246 
248  void configure_program(viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
249  unsigned int kernel_id = 0;
250  std::vector<viennacl::ocl::device>::const_iterator found = std::find(ctx_.devices().begin(),ctx_.devices().end(),ctx_.current_device());
251  for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it)
252  set_expression_arguments(get_profile(ctx_.current_device(), it->first), static_cast<unsigned int>(std::distance(ctx_.devices().begin(), found)), it->second, kernel_id, p, kernels);
253  }
254 
256  void make_program_name(char * program_name) const {
257  unsigned int current_arg = 0;
258  void* memory[64] = {NULL};
259  for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it){
260  for(profile_base::statements_type::const_iterator iit = it->second.begin() ; iit != it->second.end() ; ++iit){
261  detail::traverse(iit->first, iit->second, detail::statement_representation_functor(memory, current_arg, program_name));
262  }
263  }
264  *program_name='\0';
265  }
266 
268  std::string make_opencl_program_string() const {
270 
271  //Headers generation
272  stream << "#if defined(cl_khr_fp64)\n";
273  stream << "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
274  stream << "#elif defined(cl_amd_fp64)\n";
275  stream << "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n";
276  stream << "#endif\n";
277  stream << std::endl;
278 
279  vcl_size_t device_offset =0;
280  for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
281  for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
282  get_profile(*it,iit->first)(stream,device_offset++,iit->second);
283 
284  return stream.str();
285  }
286 
291  std::string make_cuda_program_string() const {
292  //Creates OpenCL string with #ifdef and attributes
294  vcl_size_t device_offset =0;
295  for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
296  for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
297  get_profile(*it,iit->first)(stream,device_offset++,iit->second);
298  std::string res = stream.str();
299 
300  viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
301 
302  //Pointer
303  viennacl::tools::find_and_replace(res, "__global float*", "float*");
304  viennacl::tools::find_and_replace(res, "__local float*", "float*");
305 
306  viennacl::tools::find_and_replace(res, "__global double*", "double*");
307  viennacl::tools::find_and_replace(res, "__local double*", "double*");
308 
309  //Qualifiers
310  viennacl::tools::find_and_replace(res,"__global","__device__");
311  viennacl::tools::find_and_replace(res,"__kernel","__global__");
312  viennacl::tools::find_and_replace(res,"__constant","__constant__");
313  viennacl::tools::find_and_replace(res,"__local","__shared__");
314 
315  //Indexing
316  viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
317  viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
318 
319  viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
320  viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
321 
322  viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
323  viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
324 
325  viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
326  viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
327 
328  viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
329  viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
330 
331  //Synchronization
332  viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
333  viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
334 
335 
336  return res;
337  }
338 
339  private:
340  statements_type statements_;
341  viennacl::ocl::context const & ctx_;
342  forced_profiles_type forced_profiles_;
343  };
344 
351  inline viennacl::ocl::program & get_configured_program(viennacl::generator::code_generator const & generator, std::list<viennacl::ocl::kernel*> & kernels, bool force_recompilation = false){
352  char* program_name = new char[256];
353  generator.make_program_name(program_name);
354  if(force_recompilation)
356  if(!viennacl::ocl::current_context().has_program(program_name)){
357  std::string source_code = generator.make_opencl_program_string();
358  #ifdef VIENNACL_DEBUG_BUILD
359  std::cout << "Building " << program_name << "..." << std::endl;
360  std::cout << source_code << std::endl;
361  #endif
362  viennacl::ocl::current_context().add_program(source_code, program_name);
363  }
365  generator.configure_program(p, kernels);
366  delete[] program_name;
367 
368  return p;
369  }
370 
372  inline void enqueue(viennacl::generator::code_generator const & generator, bool force_recompilation = false){
373  std::list<viennacl::ocl::kernel*> kernels;
374  get_configured_program(generator, kernels, force_recompilation);
375  for(std::list<viennacl::ocl::kernel*>::iterator it = kernels.begin() ; it != kernels.end() ; ++it){
376  viennacl::ocl::enqueue(**it, (*it)->context().get_queue());
377  }
378  }
379 
383  gen.add(s,s.array()[0]);
384  return gen.make_opencl_program_string();
385  }
386 
390  gen.add(s, s.array()[0]);
391  return gen.make_cuda_program_string();
392  }
393 
397  gen.add(s,root_node);
399  }
400 
404  }
405 
406  }
407 }
408 #endif
statement_node_subtype subtype
Definition: forwards.h:270
A stream class where the kernel sources are streamed to. Takes care of indentation of the sources...
Definition: utils.hpp:233
std::size_t vcl_size_t
Definition: forwards.h:58
void delete_program(std::string const &name)
Delete the program with the provided name.
Definition: context.hpp:401
code_generator(viennacl::ocl::context const &ctx=viennacl::ocl::current_context())
The constructor.
Definition: generate.hpp:219
viennacl::ocl::program & get_configured_program(viennacl::generator::code_generator const &generator, std::list< viennacl::ocl::kernel * > &kernels, bool force_recompilation=false)
Creates the program associated with a generator object and fills the kernels. Checks the context for ...
Definition: generate.hpp:351
vcl_size_t node_index
Definition: forwards.h:276
void enqueue(viennacl::generator::code_generator const &generator, bool force_recompilation=false)
Set the arguments and enqueue a generator object.
Definition: generate.hpp:372
void force_profile(forced_profile_key_type key, T const &t)
Force the generator to use a specific profile for an operation.
Definition: generate.hpp:225
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:59
lhs_rhs_element lhs
Definition: forwards.h:422
Various little tools used here and there in ViennaCL.
Definition: forwards.h:176
std::string str()
Definition: utils.hpp:255
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:51
A class representing a compute device (e.g. a GPU)
Definition: device.hpp:49
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context.
Definition: context.hpp:340
Functor to map the statements to the types defined in mapped_objects.hpp.
std::list< std::pair< scheduler::statement, scheduler::statement_node > > statements_type
Definition: profile_base.hpp:49
Vendor-specific parameters for the generated kernels.
A class for holding meta information such as the type or the underlying scalar type of an expression ...
Definition: forwards.h:84
lhs_rhs_element rhs
Definition: forwards.h:424
Helper class for the OpenCL kernel generator, representing a statement.
Definition: statement_representation_functor.hpp:52
void configure_program(viennacl::ocl::program &p, std::list< viennacl::ocl::kernel * > &kernels) const
Set the arguments for a program previously generated by the generator and fills the kernels...
Definition: generate.hpp:248
viennacl::ocl::program & get_program(std::string const &name)
Returns the program with the provided name.
Definition: context.hpp:414
std::string get_cuda_device_code(viennacl::scheduler::statement const &s)
Convenience function to get the CUDA device code for a single statement.
Definition: generate.hpp:388
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:95
Definition: forwards.h:170
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:48
statement_node_type_family
Groups the type of a node in the statement tree. Used for faster dispatching.
Definition: forwards.h:162
int find_and_replace(std::string &source, std::string const &find, std::string const &replace)
Replace in a source string a pattern by another.
Definition: tools.hpp:154
void generate_enqueue_statement(viennacl::scheduler::statement const &s, scheduler::statement_node const &root_node)
Generate and enqueue a statement plus root_node into the current queue.
Definition: generate.hpp:395
std::string make_cuda_program_string() const
Creates the CUDA device code from the set of expressions in the object.
Definition: generate.hpp:291
viennacl::ocl::context & current_context()
Convenience function for returning the current context.
Definition: backend.hpp:192
Definition: forwards.h:173
A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependen...
Definition: shared_ptr.hpp:83
Forwards declaration.
Wrapper class for an OpenCL program.
Definition: program.hpp:40
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
std::vector< value_type > container_type
Definition: forwards.h:452
std::string make_opencl_program_string() const
Creates the OpenCL program string from the set of expressions in the object.
Definition: generate.hpp:268
container_type const & array() const
Definition: forwards.h:473
viennacl::ocl::kernel & get_kernel(std::string const &name)
Returns the kernel with the provided name.
Definition: context.hpp:638
Class for handling code generation.
Definition: generate.hpp:47
bool add(scheduler::statement const &statement, scheduler::statement_node const &root_node)
Add a statement and the root node to the expression list.
Definition: generate.hpp:232
Functor to set the arguments of a statement into a kernel.
statement_node_type_family type_family
Definition: forwards.h:269
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:447
std::vector< viennacl::ocl::device > const & devices() const
Returns a vector with all devices in this context.
Definition: context.hpp:89
std::string get_opencl_program_string(viennacl::scheduler::statement const &s)
Convenience function to get the OpenCL program string for a single statement.
Definition: generate.hpp:381
std::pair< expression_type, vcl_size_t > forced_profile_key_type
typedef of the key used in the forced profiles. Contains the expression type and the size of the scal...
Definition: generate.hpp:50
Functor to generate the string id of a statement.
op_element op
Definition: forwards.h:423
void make_program_name(char *program_name) const
Creates an identifier string for the set of expressions in the object.
Definition: generate.hpp:256
expression_type_family type_family
Definition: forwards.h:90
Main datastructure for an node in the statement tree.
Definition: forwards.h:420
operation_node_type type
Definition: forwards.h:416