SHOGUN  v3.2.0
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
StreamingSparseFeatures.cpp
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Shashwat Lal Das
8  * Modifications (W) 2013 Thoralf Klein
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  */
12 namespace shogun
13 {
14 
15 template <class T>
17 {
19  init();
20 }
21 
22 template <class T>
24  bool is_labelled,
25  int32_t size)
27 {
29  init(file, is_labelled, size);
30 }
31 
32 template <class T>
34 {
35  if (parser.is_running())
36  parser.end_parser();
37 }
38 
39 template <class T>
41 {
42  ASSERT(index>=0 && index<current_num_features)
43  return current_sgvector.get_feature(index);
44 }
45 
46 template <class T>
48 {
50 }
51 
52 template <class T>
54 {
55  int32_t n=current_num_features;
56  ASSERT(n<=num)
57  current_num_features=num;
58  return n;
59 }
60 
61 template <class T>
63 {
64  T result=0;
65 
66  //result remains zero when one of the vectors is non existent
67  if (avec && bvec)
68  {
69  SGSparseVector<T> asv(avec, alen, false);
70  SGSparseVector<T> bsv(bvec, blen, false);
71 
72  result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv);
73  }
74 
75  return result;
76 }
77 
78 template <class T>
79 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
80 {
81  ASSERT(vec)
82  ASSERT(dim>=current_num_features)
83 
84  return current_sgvector.dense_dot(alpha, vec, dim, b);
85 }
86 
87 template <class T>
89 {
90  ASSERT(vec2)
91 
92  int32_t current_length = current_sgvector.num_feat_entries;
93  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
94 
95  float64_t result=0;
96  if (current_vector)
97  {
98  for (int32_t i=0; i<current_length; i++) {
99  if (current_vector[i].feat_index < vec2_len) {
100  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
101  }
102  }
103  }
104 
105  return result;
106 }
107 
108 template <class T>
110 {
111  ASSERT(vec2)
112 
113  int32_t current_length = current_sgvector.num_feat_entries;
114  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
115 
116  float32_t result=0;
117  if (current_vector)
118  {
119  for (int32_t i=0; i<current_length; i++) {
120  if (current_vector[i].feat_index < vec2_len) {
121  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
122  }
123  }
124  }
125 
126  return result;
127 }
128 
129 template <class T>
130 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
131 {
132  ASSERT(vec2)
133  if (vec2_len < current_num_features)
134  {
135  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
136  vec2_len, current_num_features);
137  }
138 
139  SGSparseVectorEntry<T>* sv=current_sgvector.features;
140  int32_t num_feat=current_sgvector.num_feat_entries;
141 
142  if (sv)
143  {
144  if (abs_val)
145  {
146  for (int32_t i=0; i<num_feat; i++)
147  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
148  }
149  else
150  {
151  for (int32_t i=0; i<num_feat; i++)
152  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
153  }
154  }
155 }
156 
157 template <class T>
158 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
159 {
160  ASSERT(vec2)
161  if (vec2_len < current_num_features)
162  {
163  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
164  vec2_len, current_num_features);
165  }
166 
167  SGSparseVectorEntry<T>* sv=current_sgvector.features;
168  int32_t num_feat=current_sgvector.num_feat_entries;
169 
170  if (sv)
171  {
172  if (abs_val)
173  {
174  for (int32_t i=0; i<num_feat; i++)
175  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
176  }
177  else
178  {
179  for (int32_t i=0; i<num_feat; i++)
180  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
181  }
182  }
183 }
184 
185 template <class T>
187 {
188  return current_sgvector.num_feat_entries;
189 }
190 
191 template <class T>
193 {
194  int32_t current_length = current_sgvector.num_feat_entries;
195  SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
196 
197  ASSERT(current_vector)
198 
199  float32_t sq=0;
200 
201  for (int32_t i=0; i<current_length; i++)
202  sq += current_vector[i].entry * current_vector[i].entry;
203 
204  return sq;
205 }
206 
207 template <class T>
209 {
210  SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;
211 
212  // passing true to disallow reallocation
213  // and guarantee a stable get_vector().features pointer
214  get_vector().sort_features(true);
215 
216  ASSERT(old_ptr == current_sgvector.features);
217 }
218 
219 template <class T>
221 {
222  return new CStreamingSparseFeatures<T>(*this);
223 }
224 
225 template <class T>
227 {
228  if (current_sgvector.features)
229  return 1;
230  return 0;
231 }
232 
234 {
235  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
236 }
237 
239 {
240  parser.set_read_vector_and_label
242 }
243 
/*
 * Helper macro: each use expands to a full specialization of
 * CStreamingSparseFeatures<sg_type>::get_feature_type() const whose body
 * simply returns the EFeatureType constant f_type.
 *
 *   f_type  - EFeatureType value to report for this element type
 *   sg_type - element type the specialization is generated for
 */
244 #define GET_FEATURE_TYPE(f_type, sg_type) \
245 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \
246 { \
247  return f_type; \
248 }
249 
252 GET_FEATURE_TYPE(F_BYTE, uint8_t)
253 GET_FEATURE_TYPE(F_BYTE, int8_t)
254 GET_FEATURE_TYPE(F_SHORT, int16_t)
255 GET_FEATURE_TYPE(F_WORD, uint16_t)
256 GET_FEATURE_TYPE(F_INT, int32_t)
257 GET_FEATURE_TYPE(F_UINT, uint32_t)
258 GET_FEATURE_TYPE(F_LONG, int64_t)
259 GET_FEATURE_TYPE(F_ULONG, uint64_t)
263 #undef GET_FEATURE_TYPE
264 
265 
266 template <class T>
267 void CStreamingSparseFeatures<T>::init()
268 {
269  working_file=NULL;
270  current_vec_index=0;
271  current_num_features=-1;
272 
273  set_generic<T>();
274 }
275 
276 template <class T>
277 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
278  bool is_labelled,
279  int32_t size)
280 {
281  init();
282  has_labels = is_labelled;
283  working_file = file;
284  SG_REF(working_file);
285  parser.init(file, is_labelled, size);
286  parser.set_free_vector_after_release(false);
287 }
288 
289 template <class T>
291 {
292  if (!parser.is_running())
293  parser.start_parser();
294 }
295 
296 template <class T>
298 {
299  parser.end_parser();
300 }
301 
302 template <class T>
304 {
305  int32_t current_length = 0;
306  SGSparseVectorEntry<T>* current_vector = NULL;
307 
308  bool ret_value;
309  ret_value = (bool) parser.get_next_example(current_vector,
310  current_length,
311  current_label);
312 
313  if (!ret_value)
314  return false;
315 
316  // ref_count disabled, because parser still owns the memory
317  current_sgvector = SGSparseVector<T>(current_vector, current_length, false);
318 
319  // Update number of features based on highest index
320  int32_t current_dimension = get_vector().get_num_dimensions();
321  current_num_features = CMath::max(current_num_features, current_dimension);
322 
323  current_vec_index++;
324  return true;
325 }
326 
327 template <class T>
329 {
330  return current_sgvector;
331 }
332 
333 template <class T>
335 {
336  ASSERT(has_labels)
337 
338  return current_label;
339 }
340 
341 template <class T>
343 {
344  parser.finalize_example();
345 }
346 
347 template <class T>
349 {
350  return current_num_features;
351 }
352 
353 template <class T>
355 {
357  return -1;
358 }
359 
360 template <class T>
362 {
363  return current_num_features;
364 }
365 
366 template <class T>
368 {
369  return current_sgvector.num_feat_entries;
370 }
371 
372 template <class T>
374 {
375  return C_STREAMING_SPARSE;
376 }
377 
378 template class CStreamingSparseFeatures<bool>;
379 template class CStreamingSparseFeatures<char>;
380 template class CStreamingSparseFeatures<int8_t>;
381 template class CStreamingSparseFeatures<uint8_t>;
382 template class CStreamingSparseFeatures<int16_t>;
384 template class CStreamingSparseFeatures<int32_t>;
386 template class CStreamingSparseFeatures<int64_t>;
391 }
#define SG_ERROR(...)
Definition: SGIO.h:131
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:141
virtual void add_to_dense_vec(float64_t alpha, float64_t *vec2, int32_t vec2_len, bool abs_val=false)
T sparse_dot(const SGSparseVector< T > &v)
static T sparse_dot(T alpha, SGSparseVectorEntry< T > *avec, int32_t alen, SGSparseVectorEntry< T > *bvec, int32_t blen)
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:35
A Streaming File access class.
Definition: StreamingFile.h:39
virtual float32_t dot(CStreamingDotFeatures *df)
virtual int32_t get_dim_feature_space() const
#define ASSERT(x)
Definition: SGIO.h:203
double float64_t
Definition: common.h:48
long double floatmax_t
Definition: common.h:49
#define SG_REF(x)
Definition: SGRefObject.h:34
virtual void get_sparse_vector_and_label(SGSparseVectorEntry< bool > *&vector, int32_t &len, float64_t &label)
virtual CFeatures * duplicate() const
static T max(T a, T b)
return the maximum of two integers
Definition: Math.h:160
Streaming features that support dot products among other operations.
float float32_t
Definition: common.h:47
virtual EFeatureClass get_feature_class() const
virtual void get_sparse_vector(SGSparseVectorEntry< bool > *&vector, int32_t &len)
The class Features is the base class of all feature objects.
Definition: Features.h:62
#define GET_FEATURE_TYPE(f_type, sg_type)
T dense_dot(T alpha, T *vec, int32_t dim, T b)
This class implements streaming features with sparse feature vectors. The vector is represented as an...
static T abs(T a)
return the absolute value of a number
Definition: Math.h:179

SHOGUN Machine Learning Toolbox - Documentation