Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
EST_Token.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : April 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* Token/Tokenizer class */
37 /* */
38 /*=======================================================================*/
39 
40 #ifndef __EST_TOKEN_H__
41 #define __EST_TOKEN_H__
42 
43 #include <cstdio>
44 
45 using namespace std;
46 
47 #include "EST_String.h"
48 #include "EST_common.h"
49 
50 // I can never really remember this so we'll define it here
51 /// The default whitespace characters
52 extern const EST_String EST_Token_Default_WhiteSpaceChars;
53 /// The default single character symbols
54 extern const EST_String EST_Token_Default_SingleCharSymbols;
55 /// The default (post) punctuation symbols
56 extern const EST_String EST_Token_Default_PunctuationSymbols;
57 /// The default pre-punctuation symbols
58 extern const EST_String EST_Token_Default_PrePunctuationSymbols;
59 
60 /** \class EST_Token
61  * @ingroup stringclasses
62  This class is similar to \ref EST_String but also maintains
63  the original punctuation and whitespace found around the
64  token.
65 
66  \ref EST_Token 's primary use is with \ref EST_TokenStream class
67  which allows easy tokenizing of ascii files.
68 
69  A token consists of four parts, any of which may be empty: the
70  preceding whitespace, the preceding punctuation, the name (the
71  actual token) and the succeeding punctuation.
72 
73  @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
74 */
75 class EST_Token {
76  private:
77  EST_String space;
78  EST_String prepunc;
79  EST_String pname;
80  EST_String punc;
81  int linenum;
82  int linepos;
83  int p_filepos;
84  int p_quoted;
85 
86  public:
87  ///
88  EST_Token() {init();}
89  ///
90  EST_Token(const EST_String p) {init(); pname = p; }
91  ///
92  void init() {p_quoted=linenum=linepos=p_filepos=0;}
93 
94  /**@name Basic access to fields */
95  ///@{
96  /// set token from a string
97  void set_token(const EST_String &p) { pname = p; }
98  ///
99  void set_token(const char *p) { pname = p; }
100  /// set whitespace of token.
101  void set_whitespace(const EST_String &p) { space = p; }
102  ///
103  void set_whitespace(const char *p) { space = p; }
104  /// set (post) punctuation of token.
105  void set_punctuation(const EST_String &p) { punc = p; }
106  ///
107  void set_punctuation(const char *p) { punc = p; }
108  /// set prepunction
109  void set_prepunctuation(const EST_String &p) { prepunc = p; }
110  ///
111  void set_prepunctuation(const char *p) { prepunc = p; }
112  ///
113  const EST_String &whitespace() { return space; }
114  ///
115  const EST_String &punctuation() { return punc; }
116  ///
117  const EST_String &prepunctuation() { return prepunc; }
118 
119  /**@name Access token as a string */
120  ///@{
121  const EST_String &string() const { return String(); }
122  /// Access token as a string
123  const EST_String &S() const { return S(); }
124  /// Access token as a string
125  const EST_String &String() const { return pname; }
126  /// For automatic coercion to \ref EST_String
127  operator EST_String() const { return String(); }
128  ///@}
129 
130  /**@name Access token as a int */
131  ///@{
132  int Int(bool &valid) const { return String().Int(valid); }
133  int Int() const { return String().Int(); }
134  int I(bool &valid) const { return Int(valid); }
135  int I() const { return Int(); }
136  operator int() const { return Int(); }
137  ///@}
138 
139  /**@name Access token as a long */
140  ///@{
141  long Long(bool &valid) const { return String().Long(valid); }
142  long Long() const { return String().Long(); }
143  long L(bool &valid) const { return Long(valid); }
144  long L() const { return Long(); }
145  operator long() const { return Long(); }
146  ///@}
147 
148  /**@name Access token as a float */
149  ///@{
150  float Float(bool &valid) const { return String().Float(valid); }
151  float Float() const { return String().Float(); }
152  float F(bool &valid) const { return Float(valid); }
153  float F() const { return Float(); }
154  operator float() const { return Float(); }
155  ///@}
156 
157  /**@name Access token as a double */
158  ///@{
159  double Double(bool &valid) const { return String().Double(valid); }
160  double Double() const { return String().Double(); }
161  double D(bool &valid) const { return Double(valid); }
162  double D() const { return Double(); }
163  operator double() const { return Double(); }
164  ///@}
165 
166  ///@}
167 
168  /**@name Quotation related methods */
169  ///@{
170  /// Note that this token was quoted (or not)
171  void set_quoted(int q) { p_quoted = q; }
172  /// TRUE is token was quoted
173  int quoted() const { return p_quoted; }
174  ///@}
175  ///
176  void set_row(int r) { linenum = r; }
177  ///
178  void set_col(int c) { linepos = c; }
179  /// Set file position in original \ref EST_TokenStream
180  void set_filepos(int c) { p_filepos = c; }
181  /// Return lower case version of token name
182  EST_String lstring() { return downcase(pname); }
183  /// Return upper case version of token name
184  EST_String ustring() { return upcase(pname); }
185  /// Line number in original \ref EST_TokenStream.
186  int row(void) const { return linenum; }
187  /// Line position in original \ref EST_TokenStream.
188  int col(void) const { return linepos; }
189  /// file position in original \ref EST_TokenStream.
190  int filepos(void) const { return p_filepos; }
191 
192  /// A string describing current position, suitable for error messages
193  const EST_String pos_description() const;
194 
195  ///
196  friend ostream& operator << (ostream& s, const EST_Token &p);
197 
198  ///
199  EST_Token & operator = (const EST_Token &a);
200  ///
201  EST_Token & operator = (const EST_String &a);
202  ///
203  int operator == (const EST_String &a) { return (pname == a); }
204  ///
205  int operator != (const EST_String &a) { return (pname != a); }
206  ///
207  int operator == (const char *a) { return (strcmp(pname,a)==0); }
208  ///
209  int operator != (const char *a) { return (strcmp(pname,a)!=0); }
210 };
211 
212 enum EST_tokenstream_type {tst_none, tst_file, tst_pipe, tst_string, tst_istream}; ///< kind of source, stored in EST_TokenStream::type
213 
214 /** \class EST_TokenStream
215  A class that allows the reading of \ref EST_Token from a file
216  stream, pipe or string. It automatically tokenizes a file based on
217  user definable whitespace and punctuation.
218 
219  The definitions of whitespace and punctuation are user definable.
220  Also support for single character symbols is included. Single
221  character symbols *always* are treated as individual tokens
222  irrespective of their white space context. Also a quote
223  mode can be used to read quoted tokens.
224 
225  The setting of whitespace, pre and post punctuation, single character
226  symbols and quote mode must be done (immediately) after opening
227  the stream.
228 
229  There is no unget but peek provides look ahead of one token.
230 
231  Note there is an interesting issue about what to do about
232  the last whitespace in the file. Should it be ignored or should
233  it be attached to a token with a name string of length zero.
234  In unquoted mode the eof() will return TRUE if the next token name
235  is empty (the mythical last token). In quoted mode the last must
236  be returned so eof will not be raised.
237 
238  @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
239 */
241  private:
242  EST_tokenstream_type type;
243  EST_String WhiteSpaceChars;
244  EST_String SingleCharSymbols;
245  EST_String PunctuationSymbols;
246  EST_String PrePunctuationSymbols;
247  EST_String Origin;
248  FILE *fp;
249  istream *is;
250  int fd;
251  char *buffer;
252  int buffer_length;
253  int pos;
254  int linepos;
255  int p_filepos;
256  int getch(void);
257  EST_TokenStream &getch(char &C);
258  int peeked_charp;
259  int peeked_char; // ungot character
260  int peekch(void);
261  int peeked_tokp;
262  int eof_flag;
263  int quotes;
264  char quote;
265  char escape;
266  EST_Token current_tok;
267  void default_values(void);
268  /* local buffers to save reallocating */
269  int tok_wspacelen;
270  char *tok_wspace;
271  int tok_stufflen;
272  char *tok_stuff;
273  int tok_prepuncslen;
274  char *tok_prepuncs;
275  int close_at_end;
276 
277  /* character class map */
278  char p_table[256];
279  bool p_table_wrong;
280 
281  /** This function is deliberately private so that you'll get a compilation
282  error if you assign a token stream or pass it as an (non-reference)
283  argument. The problem with copying is that you need to copy the
284  file descriptors too (which can't be done for pipes). You probably
285  don't really want a copy anyway and meant to pass it as a reference.
286  If you really need this (some sort of clever look ahead) I am not
287  sure what the consequences really are (or how portable they are).
288  Pass the \ref EST_TokenStream by reference instead.
289  */
291 
292  void build_table();
293 
294  inline int getch_internal();
295  inline int peekch_internal();
296  inline int getpeeked_internal();
297  public:
298  ///
299  EST_TokenStream();
300  /// will close file if appropriate for type
301  ~EST_TokenStream();
302  ///@{
303  /// open a \ref EST_TokenStream for a file.
304  int open(const EST_String &filename);
305  /// open a \ref EST_TokenStream for an already opened file
306  int open(FILE *ofp, int close_when_finished);
307  /// open a \ref EST_TokenStream for an already open istream
308  int open(istream &newis);
309  /// open a \ref EST_TokenStream for string rather than a file
310  int open_string(const EST_String &newbuffer);
311  /// Close stream.
312  void close(void);
313  ///@}
314  /**@name stream access functions */
315  ///@{
316  /// get next token in stream
317  EST_TokenStream &get(EST_Token &t);
318  /// get next token in stream
319  EST_Token &get();
320  /**@name get the next token which must be the argument. */
321  ///@{
322  EST_Token &must_get(EST_String expected, bool *ok);
323  EST_Token &must_get(EST_String expected, bool &ok)
324  { return must_get(expected, &ok); }
325  EST_Token &must_get(EST_String expected)
326  { return must_get(expected, (bool *)NULL); }
327  ///@}
328  /// get up to `s` in stream as a single token.
329  EST_Token get_upto(const EST_String &s);
330  /// get up to the end of line as a single token.
331  EST_Token get_upto_eoln(void);
332  /// peek at next token
334  { if (!peeked_tokp) get();
335  peeked_tokp = TRUE; return current_tok; }
336  /// Reading binary data, (don't use peek() immediately beforehand)
337  int fread(void *buff,int size,int nitems);
338  ///@}
339  /**@name stream initialization functions */
340  ///@{
341  /// set which characters are to be treated as whitespace
343  { WhiteSpaceChars = ws; p_table_wrong=1;}
344  /// set which characters are to be treated as single character symbols
346  { SingleCharSymbols = sc; p_table_wrong=1;}
347  /// set which characters are to be treated as (post) punctuation
349  { PunctuationSymbols = ps; p_table_wrong=1;}
350  /// set which characters are to be treated as pre-punctuation
352  { PrePunctuationSymbols = ps; p_table_wrong=1;}
353  /// set characters to be used as quotes and escape, and set quote mode
354  void set_quotes(char q, char e) { quotes = TRUE; quote = q; escape = e; p_table_wrong=1;}
355  /// query quote mode
356  int quoted_mode(void) { return quotes; }
357  ///@}
358  /**@name miscellaneous */
359  ///@{
360  /// returns line number of \ref EST_TokenStream
361  int linenum(void) const {return linepos;}
362  /// end of file
363  int eof()
364  { return (eof_flag || ((!quotes) && (peek() == ""))); }
365  /// end of line
366  int eoln();
367  /// current file position in \ref EST_TokenStream
368  int filepos(void) const { return (type == tst_string) ? pos : p_filepos; }
369  /// tell, synonym for filepos
370  int tell(void) const { return filepos(); }
371  /// seek, reposition file pointer
372  int seek(int position);
373  int seek_end();
374  /// Reset to start of file/string
375  int restart(void);
376  /// A string describing current position, suitable for error messages
377  const EST_String pos_description();
378  /// The originating filename (if there is one)
379  const EST_String filename() const { return Origin; }
380  /// For the people who *need* the actual description (if possible)
381  FILE *filedescriptor() { return (type == tst_file) ? fp : 0; }
382  ///
383  EST_TokenStream & operator >>(EST_Token &p);
384  ///
385  EST_TokenStream & operator >>(EST_String &p);
386  ///
387  friend ostream& operator <<(ostream& s, EST_TokenStream &p);
388  ///@}
389 };
390 
391 /** Quote a string with given quotes and escape character
392 */
393 EST_String quote_string(const EST_String &s,
394  const EST_String &quote = "\"",
395  const EST_String &escape = "\\",
396  int force=0);
397 
398 #endif // __EST_TOKEN_H__
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:342
int row(void) const
Line number in original EST_TokenStream.
Definition: EST_Token.h:186
FILE * filedescriptor()
For the people who need the actual description (if possible)
Definition: EST_Token.h:381
void set_prepunctuation(const EST_String &p)
set prepunctuation
Definition: EST_Token.h:109
EST_String lstring()
Return lower case version of token name.
Definition: EST_Token.h:182
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:345
int tell(void) const
tell, synonym for filepos
Definition: EST_Token.h:370
const EST_String filename() const
The originating filename (if there is one)
Definition: EST_Token.h:379
int quoted() const
TRUE if token was quoted.
Definition: EST_Token.h:173
int quoted_mode(void)
query quote mode
Definition: EST_Token.h:356
void set_punctuation(const EST_String &p)
set (post) punctuation of token.
Definition: EST_Token.h:105
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as pre-punctuation
Definition: EST_Token.h:351
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:348
int eof()
end of file
Definition: EST_Token.h:363
void set_token(const EST_String &p)
set token from a string
Definition: EST_Token.h:97
int linenum(void) const
returns line number of EST_TokenStream
Definition: EST_Token.h:361
const EST_String & S() const
Access token as a string.
Definition: EST_Token.h:123
void set_filepos(int c)
Set file position in original EST_TokenStream.
Definition: EST_Token.h:180
int col(void) const
Line position in original EST_TokenStream.
Definition: EST_Token.h:188
EST_Token & peek(void)
peek at next token
Definition: EST_Token.h:333
int filepos(void) const
file position in original EST_TokenStream.
Definition: EST_Token.h:190
EST_String ustring()
Return upper case version of token name.
Definition: EST_Token.h:184
void set_quotes(char q, char e)
set characters to be used as quotes and escape, and set quote mode
Definition: EST_Token.h:354
int filepos(void) const
current file position in EST_TokenStream
Definition: EST_Token.h:368
const EST_String & String() const
Access token as a string.
Definition: EST_Token.h:125
void set_whitespace(const EST_String &p)
set whitespace of token.
Definition: EST_Token.h:101
void set_quoted(int q)
Note that this token was quoted (or not)
Definition: EST_Token.h:171