Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
EST_Wagon.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Public declarations for Wagon (CART builder) */
38 /* */
39 /*=======================================================================*/
40 #ifndef __WAGON_H__
41 #define __WAGON_H__
42 
43 #include "EST_String.h"
44 #include "EST_Val.h"
45 #include "EST_TVector.h"
46 #include "EST_TList.h"
47 #include "EST_simplestats.h" /* For EST_SuffStats class */
48 #include "EST_Track.h"
49 #include "siod.h"
/* Write WMESS and a newline to cerr, then call exit(-1).  WMESS is pasted
   unparenthesised into the stream expression. */
50 #define wagon_error(WMESS) (cerr << WMESS << endl,exit(-1))
51 
52 // I get floating point exceptions on Alphas when I do any comparisons
53 // with HUGE_VAL or FLT_MAX, so I'll define my own large value instead
54 #define WGN_HUGE_VAL 1.0e20
55 
56 class WVector : public EST_FVector
57 {
58  public:
59  WVector(int n) : EST_FVector(n) {}
60  int get_int_val(int n) const { return (int)a_no_check(n); }
61  float get_flt_val(int n) const { return a_no_check(n); }
62  void set_int_val(int n,int i) { a_check(n) = (int)i; }
63  void set_flt_val(int n,float f) { a_check(n) = f; }
64 };
65 
68 
/* Different types of feature */
enum wn_dtype {
    /* for predictees and predictors */
    wndt_binary,
    wndt_float,
    wndt_class,
    /* for predictees only */
    wndt_cluster,
    wndt_vector,
    wndt_matrix,
    wndt_trajectory,
    /* for ignored features */
    wndt_ignore
};
76 
77 class WDataSet : public WVectorList {
78  private:
79  int dlength;
80  EST_IVector p_type;
81  EST_IVector p_ignore;
82  EST_StrVector p_name;
83  public:
84  void load_description(const EST_String& descfname,LISP ignores);
85  void ignore_non_numbers();
86 
87  int ftype(const int &i) const {return p_type(i);}
88  int ignore(int i) const {return p_ignore(i); }
89  void set_ignore(int i,int value) { p_ignore[i] = value; }
90  const EST_String &feat_name(const int &i) const {return p_name(i);}
91  int samples(void) const {return length();}
92  int width(void) const {return dlength;}
93 };
/* Operator codes a WQuestion can carry. */
enum wn_oper {
    wnop_equal,
    wnop_binary,
    wnop_greaterthan,
    wnop_lessthan,
    wnop_is,
    wnop_in,
    wnop_matches
};
96 
97 class WQuestion {
98  private:
99  int feature_pos;
100  wn_oper op;
101  int yes;
102  int no;
103  EST_Val operand1;
104  EST_IList operandl;
105  float score;
106  public:
107  WQuestion() {;}
108  WQuestion(const WQuestion &s)
109  { feature_pos=s.feature_pos;
110  op=s.op; yes=s.yes; no=s.no; operand1=s.operand1;
111  operandl = s.operandl; score=s.score;}
112  ~WQuestion() {;}
113  WQuestion(int fp, wn_oper o,EST_Val a)
114  { feature_pos=fp; op=o; operand1=a; }
115  void set_fp(const int &fp) {feature_pos=fp;}
116  void set_oper(const wn_oper &o) {op=o;}
117  void set_operand1(const EST_Val &a) {operand1 = a;}
118  void set_yes(const int &y) {yes=y;}
119  void set_no(const int &n) {no=n;}
120  int get_yes(void) const {return yes;}
121  int get_no(void) const {return no;}
122  const int get_fp(void) const {return feature_pos;}
123  const wn_oper get_op(void) const {return op;}
124  const EST_Val get_operand1(void) const {return operand1;}
125  const EST_IList &get_operandl(void) const {return operandl;}
126  const float get_score(void) const {return score;}
127  void set_score(const float &f) {score=f;}
128  const int ask(const WVector &w) const;
129  friend ostream& operator<<(ostream& s, const WQuestion &q);
130 };
131 
/* Kind tags a WImpurity can carry; wnim_unset is the value given by the
   WImpurity default constructor. */
enum wnim_type {
    wnim_unset,
    wnim_float,
    wnim_class,
    wnim_cluster,
    wnim_vector,
    wnim_matrix,
    wnim_trajectory
};
135 
136 // Impurity measure for cumulating impurities from set of data
137 class WImpurity {
138  private:
139  wnim_type t;
140  EST_SuffStats a;
142 
143  float cluster_impurity();
144  float cluster_member_mean(int i);
145  float vector_impurity();
146  float trajectory_impurity();
147  public:
148  EST_IList members; // Maybe there should be a cluster class
149  EST_SuffStats **trajectory;
150  float score;
151  int l,width;
152 
153  WImpurity() { t=wnim_unset; a.reset(); trajectory=0; l=0; width=0; }
154  ~WImpurity();
155  WImpurity(const WVectorVector &ds);
156  void copy(const WImpurity &s)
157  {
158  int i,j;
159  t=s.t; a=s.a; p=s.p; members=s.members; l=s.l; width=s.width;
160  score = s.score;
161  if (s.trajectory)
162  {
163  trajectory = new EST_SuffStats *[l];
164  for (i=0; i<l; i++)
165  {
166  trajectory[i] = new EST_SuffStats[width];
167  for (j=0; j<width; j++)
168  trajectory[i][j] = s.trajectory[i][j];
169  }
170  }
171  }
172  WImpurity &operator = (const WImpurity &a) { copy(a); return *this; }
173 
174  float measure(void);
175  double samples(void);
176  wnim_type type(void) const { return t;}
177  void cumulate(const float pv,double count=1.0);
178  EST_Val value(void);
179  EST_DiscreteProbDistribution &pd() { return p; }
180  float cluster_distance(int i); // distance i from centre in sds
181  int in_cluster(int i); // distance i from centre < most remote member
182  float cluster_ranking(int i); // position in closeness to centre
183  friend ostream& operator<<(ostream &s, WImpurity &imp);
184 };
185 
186 class WDlist {
187  private:
188  float p_score;
189  WQuestion p_question;
190  EST_String p_token;
191  int p_freq;
192  int p_samples;
193  WDlist *next;
194  public:
195  WDlist() { next=0; }
196  ~WDlist() { if (next != 0) delete next; }
197  void set_score(float s) { p_score = s; }
198  void set_question(const WQuestion &q) { p_question = q; }
199  void set_best(const EST_String &t,int freq, int samples)
200  { p_token = t; p_freq = freq; p_samples = samples;}
201  float score() const {return p_score;}
202  const EST_String &token(void) const {return p_token;}
203  const WQuestion &question() const {return p_question;}
204  EST_Val predict(const WVector &w);
205  friend WDlist *add_to_dlist(WDlist *l,WDlist *a);
206  friend ostream &operator<<(ostream &s, WDlist &d);
207 };
208 
209 class WNode {
210  private:
211  WVectorVector data;
212  WQuestion question;
213  WImpurity impurity;
214  WNode *left;
215  WNode *right;
216  void print_out(ostream &s, int margin);
217  int leaf(void) const { return ((left == 0) || (right == 0)); }
218  int pure(void);
219  public:
220  WNode() { left = right = 0; }
221  ~WNode() { if (left != 0) {delete left; left=0;}
222  if (right != 0) {delete right; right=0;} }
223  WVectorVector &get_data(void) { return data; }
224  void set_subnodes(WNode *l,WNode *r) { left=l; right=r; }
225  void set_impurity(const WImpurity &imp) {impurity=imp;}
226  void set_question(const WQuestion &q) {question=q;}
227  void prune(void);
228  void held_out_prune(void);
229  WImpurity &get_impurity(void) {return impurity;}
230  WQuestion &get_question(void) {return question;}
231  EST_Val predict(const WVector &w);
232  WNode *predict_node(const WVector &d);
233  int samples(void) const { return data.n(); }
234  friend ostream& operator<<(ostream &s, WNode &n);
235 };
236 
/* Global objects declared here and defined elsewhere: a Discretes table,
   two WDataSet instances, a float matrix and three EST_Track objects.
   How they are populated is not visible in this header. */
237 extern Discretes wgn_discretes;
238 extern WDataSet wgn_dataset;
239 extern WDataSet wgn_test_dataset;
240 extern EST_FMatrix wgn_DistMatrix;
241 extern EST_Track wgn_VertexTrack;
242 extern EST_Track wgn_UnitTrack;
243 extern EST_Track wgn_VertexFeats;
244 
245 void wgn_load_datadescription(EST_String fname,LISP ignores);
246 void wgn_load_dataset(WDataSet &ds,EST_String fname);
247 WNode *wgn_build_tree(float &score);
248 WNode *wgn_build_dlist(float &score,ostream *output);
249 WNode *wagon_stepwise(float limit);
250 float wgn_score_question(WQuestion &q, WVectorVector &ds);
251 void wgn_find_split(WQuestion &q,WVectorVector &ds,
253 float summary_results(WNode &tree,ostream *output);
254 
/* Global settings declared here and defined elsewhere (ints, floats and
   strings).  What each one controls is not visible in this header. */
255 extern int wgn_min_cluster_size;
256 extern int wgn_held_out;
257 extern int wgn_prune;
258 extern int wgn_quiet;
259 extern int wgn_verbose;
260 extern int wgn_predictee;
261 extern int wgn_count_field;
262 extern EST_String wgn_count_field_name;
263 extern EST_String wgn_predictee_name;
264 extern float wgn_float_range_split;
265 extern float wgn_balance;
266 extern EST_String wgn_opt_param;
267 extern EST_String wgn_vertex_output;
268 
/* Accessors for a question held as a LISP list X:
     wgn_ques_feature  - get_c_string of car(X)
     wgn_ques_oper_str - get_c_string of car(cdr(X))
     wgn_ques_operand  - car(cdr(cdr(X))), left as a LISP value */
269 #define wgn_ques_feature(X) (get_c_string(car(X)))
270 #define wgn_ques_oper_str(X) (get_c_string(car(cdr(X))))
271 #define wgn_ques_operand(X) (car(cdr(cdr(X))))
272 
/* Declared here, defined elsewhere. */
273 int wagon_ask_question(LISP question, LISP value);
274 
275 #endif /* __WAGON_H__ */
A vector class for floating point numbers. EST_FVector x should be used instead of float *x wherever ...
Definition: EST_FMatrix.h:118
const float & a_check(int n) const
read-only const access operator: with bounds checking
EST_FVector()
Default constructor.
Definition: EST_FMatrix.h:125
INLINE int n() const
number of items in vector.
Definition: EST_TVector.h:252
INLINE const float & a_no_check(int n) const
read-only const access operator: without bounds checking
Definition: EST_TVector.h:255
void reset(void)
reset internal values