BitMagic-C++
bmalgo_similarity.h
Go to the documentation of this file.
1 #ifndef BMALGO_SIMILARITY__H__INCLUDED__
2 #define BMALGO_SIMILARITY__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 #include <algorithm>
22 #include <functional>
23 #include <vector>
24 
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30 
31 
32 #include "bmfunc.h"
33 #include "bmdef.h"
34 
35 #include "bmalgo_impl.h"
36 
37 namespace bm
38 {
39 
40 /*! Similarity descriptor between two objects (bit vectors, blocks, etc)
41  \internal
42 */
43 template <typename SO, unsigned DMD_SZ, typename IDX_VALUE, typename SValue, typename SFunc>
45 {
46 public:
48  typedef SValue similarity_value_type;
49  typedef SFunc similarity_functor;
50 public:
52  : so1_(0), so2_(0), so1_idx_(0), so2_idx_(0)
53  {
54  similarity_ = 0;
55  }
56 
57  similarity_descriptor(const SO* so1, const SO* so2,
58  const distance_metric_descriptor* dmd_ptr)
59  :so1_(so1),
60  so2_(so2),
61  so1_idx_(0), so2_idx_(0)
62  {
63  for (size_t i = 0; i < DMD_SZ; ++i)
64  dmd_[i] = dmd_ptr[i];
65  similarity_ = 0;
66  }
67 
68  similarity_descriptor(const SO* so1, IDX_VALUE i1,
69  const SO* so2, IDX_VALUE i2,
70  const distance_metric_descriptor* dmd_ptr)
71  :so1_(so1), so2_(so2), so1_idx_(i1), so2_idx_(i2)
72  {
73  for (size_t i = 0; i < DMD_SZ; ++i)
74  dmd_[i] = dmd_ptr[i];
75  similarity_ = 0;
76  }
77 
80  so1_(sd.so1_),
81  so2_(sd.so2_),
82  so1_idx_(sd.so1_idx_),
83  so2_idx_(sd.so2_idx_)
84  {
85  for (size_t i = 0; i < DMD_SZ; ++i)
86  dmd_[i] = sd.dmd_[i];
87  }
89  {
91  so1_ = sd.so1_; so2_ = sd.so2_;
93  for (size_t i = 0; i < DMD_SZ; ++i)
94  dmd_[i] = sd.dmd_[i];
95  return *this;
96  }
97 
98  bool operator > (const similarity_descriptor& sd) const
99  {
100  return similarity_ > sd.similarity_;
101  }
102 
103  SValue similarity() const { return similarity_; }
104  void set_similarity(SValue s) { similarity_ = s;}
105 
106  const SO* get_first() const { return so1_; }
107  const SO* get_second() const { return so2_; }
108 
109  IDX_VALUE get_first_idx() const { return so1_idx_; }
110  IDX_VALUE get_second_idx() const { return so2_idx_; }
111 
114 
115  void set_metric(size_t i, distance_metric metric)
116  {
117  BM_ASSERT(i < DMD_SZ);
118  dmd_[i].metric = metric;
119  }
120 
121 
122 protected:
123  SValue similarity_; //< final similarity product
124  const SO* so1_; //< object 1 for similarity comparison
125  const SO* so2_; //< object 2 for similarity comparison
126  IDX_VALUE so1_idx_; //< index of object 1
127  IDX_VALUE so2_idx_; //< index of object 2
128  distance_metric_descriptor dmd_[DMD_SZ]; //< array of distance operations defined on objects
129 };
130 
131 /*!
132  Batch of objects for similarity measurement
133  \internal
134 */
135 template<class SDESCR>
137 {
139  typedef typename SDESCR::similarity_object_type similarity_object_type;
140  typedef typename SDESCR::similarity_value_type similarity_value_type;
141  typedef typename SDESCR::similarity_functor similarity_functor;
142  typedef std::vector<SDESCR> vector_type;
143 
144  /// run the similarity calculation using distance metrics engine
145  void calculate()
146  {
147  for( size_t i = 0; i < descr_vect_.size(); ++i)
148  {
149  similaruty_descriptor_type& sdescr = descr_vect_[i];
150 
151  const similarity_object_type* so1 = sdescr.get_first();
152  const similarity_object_type* so2 = sdescr.get_second();
153 
154  distance_metric_descriptor* dit = sdescr.distance_begin();
155  distance_metric_descriptor* dit_end = sdescr.distance_end();
156  bm::distance_operation(*so1, *so2, dit, dit_end);
157 
158  // reduce: use functor to compute final similarity metric
159  similarity_functor func;
160  similarity_value_type d = func(dit, dit_end);
161  sdescr.set_similarity(d);
162  } // for i
163  }
164 
165  void sort()
166  {
167  std::sort(descr_vect_.begin(), descr_vect_.end(), std::greater<SDESCR>());
168  }
169 
170  void reserve(size_t cap)
171  {
172  descr_vect_.reserve(cap);
173  }
174 
175  void push_back(const similaruty_descriptor_type& sdt)
176  {
177  descr_vect_.push_back(sdt);
178  }
179 
180 public:
181  std::vector<SDESCR> descr_vect_;
182 };
183 
184 
185 /**
186  Utility function to build jaccard similarity batch for sparse_vector<>
187  \internal
188 */
189 template<class SIMBATCH, class SV>
190 void build_jaccard_similarity_batch(SIMBATCH& sbatch, const SV& sv)
191 {
192 
193  size_t plains = sv.plains();
194  sbatch.reserve((plains * plains) / 2);
195 
197  dmd[0].metric = bm::COUNT_AND;
198  dmd[1].metric = bm::COUNT_OR;
199 
200  // build a batch for triangular distance matrix
201  //
202  for (unsigned i = 0; i < plains; ++i)
203  {
204  const typename SV::bvector_type* bv1 = sv.get_plain(i);
205  if (bv1)
206  {
207  for (unsigned j = i+1; j < plains; ++j)
208  {
209  const typename SV::bvector_type* bv2 = sv.get_plain(j);
210  if (bv2 && bv1 != bv2)
211  {
212  sbatch.push_back(typename SIMBATCH::similaruty_descriptor_type(bv1, i, bv2, j, &dmd[0]));
213  }
214  } // for j
215  }
216 
217  } // for i
218 }
219 
220 
221 
222 } // namespace bm
223 
224 #include "bmundef.h"
225 
226 #endif
void calculate()
run the similarity calculation using distance metrics engine
(A & B).count()
Definition: bmalgo_impl.h:59
distance_metric_descriptor * distance_begin()
IDX_VALUE get_first_idx() const
IDX_VALUE get_second_idx() const
const SO * get_first() const
std::vector< SDESCR > vector_type
void build_jaccard_similarity_batch(SIMBATCH &sbatch, const SV &sv)
Utility function to build jaccard similarity batch for sparse_vector<>
Definition: bm.h:76
const SO * get_second() const
pre-processor un-defines to avoid global space pollution (internal)
bool operator>(const similarity_descriptor &sd) const
Distance metric descriptor, holds metric code and result.
Definition: bmalgo_impl.h:87
SDESCR::similarity_functor similarity_functor
Algorithms for bvector<>
void push_back(const similaruty_descriptor_type &sdt)
void distance_operation(const BV &bv1, const BV &bv2, distance_metric_descriptor *dmit, distance_metric_descriptor *dmit_end)
Distance computing template function.
Definition: bmalgo_impl.h:702
(A | B).count()
Definition: bmalgo_impl.h:61
Bit manipulation primitives (internal)
void reserve(size_t cap)
similarity_descriptor(const SO *so1, const SO *so2, const distance_metric_descriptor *dmd_ptr)
Definitions(internal)
similarity_descriptor(const similarity_descriptor &sd)
similarity_descriptor(const SO *so1, IDX_VALUE i1, const SO *so2, IDX_VALUE i2, const distance_metric_descriptor *dmd_ptr)
SDESCR::similarity_value_type similarity_value_type
distance_metric_descriptor dmd_[DMD_SZ]
void set_metric(size_t i, distance_metric metric)
#define BM_ASSERT
Definition: bmdef.h:117
distance_metric
Distance metrics codes defined for vectors A and B.
Definition: bmalgo_impl.h:57
bm::bvector bvector_type
similarity_descriptor & operator=(const similarity_descriptor &sd)
std::vector< SDESCR > descr_vect_
SDESCR::similarity_object_type similarity_object_type
distance_metric_descriptor * distance_end()