VTK
Public Types | Public Member Functions | Static Public Member Functions | Protected Member Functions | List of all members
vtkTokenizer Class Reference

Converts a document collection into a term collection. More...

#include <vtkTokenizer.h>

Inheritance diagram for vtkTokenizer:
[legend]
Collaboration diagram for vtkTokenizer:
[legend]

Public Types

typedef vtkTableAlgorithm Superclass
 
typedef vtkstd::pair
< vtkUnicodeString::value_type,
vtkUnicodeString::value_type >
DelimiterRange
 
typedef vtkstd::vector
< DelimiterRange >
DelimiterRanges
 
- Public Types inherited from vtkTableAlgorithm
typedef vtkAlgorithm Superclass
 
- Public Types inherited from vtkAlgorithm
typedef vtkObject Superclass
 
- Public Types inherited from vtkObject
typedef vtkObjectBase Superclass
 

Public Member Functions

virtual const char * GetClassName ()
 
virtual int IsA (const char *type)
 
void PrintSelf (ostream &os, vtkIndent indent)
 
void AddDroppedDelimiters (vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end)
 
void AddDroppedDelimiters (const DelimiterRanges &ranges)
 
void AddKeptDelimiters (vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end)
 
void AddKeptDelimiters (const DelimiterRanges &ranges)
 
void DropPunctuation ()
 
void DropWhitespace ()
 
void KeepPunctuation ()
 
void KeepWhitespace ()
 
void KeepLogosyllabic ()
 
void ClearDroppedDelimiters ()
 
void ClearKeptDelimiters ()
 
- Public Member Functions inherited from vtkTableAlgorithm
virtual int ProcessRequest (vtkInformation *, vtkInformationVector **, vtkInformationVector *)
 
vtkTable * GetOutput ()
 
vtkTable * GetOutput (int index)
 
void SetInput (vtkDataObject *obj)
 
void SetInput (int index, vtkDataObject *obj)
 
- Public Member Functions inherited from vtkAlgorithm
int HasExecutive ()
 
vtkExecutive * GetExecutive ()
 
virtual void SetExecutive (vtkExecutive *executive)
 
virtual int ModifyRequest (vtkInformation *request, int when)
 
vtkInformation * GetInputPortInformation (int port)
 
vtkInformation * GetOutputPortInformation (int port)
 
int GetNumberOfInputPorts ()
 
int GetNumberOfOutputPorts ()
 
void UpdateProgress (double amount)
 
vtkInformation * GetInputArrayInformation (int idx)
 
void RemoveAllInputs ()
 
vtkDataObject * GetOutputDataObject (int port)
 
virtual void RemoveInputConnection (int port, vtkAlgorithmOutput *input)
 
int GetNumberOfInputConnections (int port)
 
int GetTotalNumberOfInputConnections ()
 
vtkAlgorithmOutput * GetInputConnection (int port, int index)
 
virtual void Update ()
 
virtual void UpdateInformation ()
 
virtual void UpdateWholeExtent ()
 
void ConvertTotalInputToPortConnection (int ind, int &port, int &conn)
 
virtual double ComputePriority ()
 
int ProcessRequest (vtkInformation *request, vtkCollection *inInfo, vtkInformationVector *outInfo)
 
virtual int ComputePipelineMTime (vtkInformation *request, vtkInformationVector **inInfoVec, vtkInformationVector *outInfoVec, int requestFromOutputPort, unsigned long *mtime)
 
virtual vtkInformation * GetInformation ()
 
virtual void SetInformation (vtkInformation *)
 
virtual void Register (vtkObjectBase *o)
 
virtual void UnRegister (vtkObjectBase *o)
 
virtual void SetAbortExecute (int)
 
virtual int GetAbortExecute ()
 
virtual void AbortExecuteOn ()
 
virtual void AbortExecuteOff ()
 
virtual void SetProgress (double)
 
virtual double GetProgress ()
 
void SetProgressText (const char *ptext)
 
virtual char * GetProgressText ()
 
virtual unsigned long GetErrorCode ()
 
virtual void SetInputArrayToProcess (int idx, int port, int connection, int fieldAssociation, const char *name)
 
virtual void SetInputArrayToProcess (int idx, int port, int connection, int fieldAssociation, int fieldAttributeType)
 
virtual void SetInputArrayToProcess (int idx, vtkInformation *info)
 
virtual void SetInputArrayToProcess (int idx, int port, int connection, const char *fieldAssociation, const char *attributeTypeorName)
 
vtkDataObject * GetInputDataObject (int port, int connection)
 
virtual void SetInputConnection (int port, vtkAlgorithmOutput *input)
 
virtual void SetInputConnection (vtkAlgorithmOutput *input)
 
virtual void AddInputConnection (int port, vtkAlgorithmOutput *input)
 
virtual void AddInputConnection (vtkAlgorithmOutput *input)
 
vtkAlgorithmOutput * GetOutputPort (int index)
 
vtkAlgorithmOutput * GetOutputPort ()
 
virtual void SetReleaseDataFlag (int)
 
virtual int GetReleaseDataFlag ()
 
void ReleaseDataFlagOn ()
 
void ReleaseDataFlagOff ()
 
int UpdateExtentIsEmpty (vtkDataObject *output)
 
int UpdateExtentIsEmpty (vtkInformation *pinfo, int extentType)
 
- Public Member Functions inherited from vtkObject
virtual void DebugOn ()
 
virtual void DebugOff ()
 
unsigned char GetDebug ()
 
void SetDebug (unsigned char debugFlag)
 
virtual void Modified ()
 
virtual unsigned long GetMTime ()
 
unsigned long AddObserver (unsigned long event, vtkCommand *, float priority=0.0f)
 
unsigned long AddObserver (const char *event, vtkCommand *, float priority=0.0f)
 
vtkCommand * GetCommand (unsigned long tag)
 
void RemoveObserver (vtkCommand *)
 
void RemoveObservers (unsigned long event, vtkCommand *)
 
void RemoveObservers (const char *event, vtkCommand *)
 
int HasObserver (unsigned long event, vtkCommand *)
 
int HasObserver (const char *event, vtkCommand *)
 
void RemoveObserver (unsigned long tag)
 
void RemoveObservers (unsigned long event)
 
void RemoveObservers (const char *event)
 
void RemoveAllObservers ()
 
int HasObserver (unsigned long event)
 
int HasObserver (const char *event)
 
template<class U , class T >
unsigned long AddObserver (unsigned long event, U observer, void(T::*callback)(), float priority=0.0f)
 
template<class U , class T >
unsigned long AddObserver (unsigned long event, U observer, void(T::*callback)(vtkObject *, unsigned long, void *), float priority=0.0f)
 
int InvokeEvent (unsigned long event, void *callData)
 
int InvokeEvent (const char *event, void *callData)
 
int InvokeEvent (unsigned long event)
 
int InvokeEvent (const char *event)
 
- Public Member Functions inherited from vtkObjectBase
const char * GetClassName () const
 
virtual void Delete ()
 
virtual void FastDelete ()
 
void Print (ostream &os)
 
void SetReferenceCount (int)
 
void PrintRevisions (ostream &os)
 
virtual void PrintHeader (ostream &os, vtkIndent indent)
 
virtual void PrintTrailer (ostream &os, vtkIndent indent)
 
int GetReferenceCount ()
 

Static Public Member Functions

static vtkTokenizer * New ()
 
static int IsTypeOf (const char *type)
 
static vtkTokenizer * SafeDownCast (vtkObject *o)
 
static const DelimiterRanges Punctuation ()
 
static const DelimiterRanges Whitespace ()
 
static const DelimiterRanges Logosyllabic ()
 
- Static Public Member Functions inherited from vtkTableAlgorithm
static vtkTableAlgorithm * New ()
 
static int IsTypeOf (const char *type)
 
static vtkTableAlgorithm * SafeDownCast (vtkObject *o)
 
- Static Public Member Functions inherited from vtkAlgorithm
static vtkAlgorithm * New ()
 
static int IsTypeOf (const char *type)
 
static vtkAlgorithm * SafeDownCast (vtkObject *o)
 
static void SetDefaultExecutivePrototype (vtkExecutive *proto)
 
static vtkInformationIntegerKey * INPUT_IS_OPTIONAL ()
 
static vtkInformationIntegerKey * INPUT_IS_REPEATABLE ()
 
static
vtkInformationInformationVectorKey *
INPUT_REQUIRED_FIELDS ()
 
static
vtkInformationStringVectorKey *
INPUT_REQUIRED_DATA_TYPE ()
 
static
vtkInformationInformationVectorKey *
INPUT_ARRAYS_TO_PROCESS ()
 
static vtkInformationIntegerKey * INPUT_PORT ()
 
static vtkInformationIntegerKey * INPUT_CONNECTION ()
 
static vtkInformationIntegerKey * PRESERVES_DATASET ()
 
static vtkInformationIntegerKey * PRESERVES_GEOMETRY ()
 
static vtkInformationIntegerKey * PRESERVES_BOUNDS ()
 
static vtkInformationIntegerKey * PRESERVES_TOPOLOGY ()
 
static vtkInformationIntegerKey * PRESERVES_ATTRIBUTES ()
 
static vtkInformationIntegerKey * PRESERVES_RANGES ()
 
- Static Public Member Functions inherited from vtkObject
static int IsTypeOf (const char *type)
 
static vtkObject * SafeDownCast (vtkObject *o)
 
static vtkObject * New ()
 
static void BreakOnError ()
 
static void SetGlobalWarningDisplay (int val)
 
static void GlobalWarningDisplayOn ()
 
static void GlobalWarningDisplayOff ()
 
static int GetGlobalWarningDisplay ()
 
- Static Public Member Functions inherited from vtkObjectBase
static int IsTypeOf (const char *name)
 
static vtkObjectBase * New ()
 

Protected Member Functions

 vtkTokenizer ()
 
 ~vtkTokenizer ()
 
int FillInputPortInformation (int port, vtkInformation *info)
 
virtual int RequestData (vtkInformation *request, vtkInformationVector **inputVector, vtkInformationVector *outputVector)
 
- Protected Member Functions inherited from vtkTableAlgorithm
 vtkTableAlgorithm ()
 
 ~vtkTableAlgorithm ()
 
virtual int RequestInformation (vtkInformation *request, vtkInformationVector **inputVector, vtkInformationVector *outputVector)
 
virtual int FillOutputPortInformation (int port, vtkInformation *info)
 
virtual int RequestUpdateExtent (vtkInformation *, vtkInformationVector **, vtkInformationVector *)
 
- Protected Member Functions inherited from vtkAlgorithm
 vtkAlgorithm ()
 
 ~vtkAlgorithm ()
 
virtual void SetNumberOfInputPorts (int n)
 
virtual void SetNumberOfOutputPorts (int n)
 
int InputPortIndexInRange (int index, const char *action)
 
int OutputPortIndexInRange (int index, const char *action)
 
int GetInputArrayAssociation (int idx, vtkInformationVector **inputVector)
 
virtual vtkExecutive * CreateDefaultExecutive ()
 
virtual void ReportReferences (vtkGarbageCollector *)
 
virtual void SetNumberOfInputConnections (int port, int n)
 
int GetInputArrayAssociation (int idx, int connection, vtkInformationVector **inputVector)
 
int GetInputArrayAssociation (int idx, vtkDataObject *input)
 
vtkDataArray * GetInputArrayToProcess (int idx, vtkInformationVector **inputVector)
 
vtkDataArray * GetInputArrayToProcess (int idx, vtkInformationVector **inputVector, int &association)
 
vtkDataArray * GetInputArrayToProcess (int idx, int connection, vtkInformationVector **inputVector)
 
vtkDataArray * GetInputArrayToProcess (int idx, int connection, vtkInformationVector **inputVector, int &association)
 
vtkDataArray * GetInputArrayToProcess (int idx, vtkDataObject *input)
 
vtkDataArray * GetInputArrayToProcess (int idx, vtkDataObject *input, int &association)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, vtkInformationVector **inputVector)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, vtkInformationVector **inputVector, int &association)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, int connection, vtkInformationVector **inputVector)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, int connection, vtkInformationVector **inputVector, int &association)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, vtkDataObject *input)
 
vtkAbstractArray * GetInputAbstractArrayToProcess (int idx, vtkDataObject *input, int &association)
 
vtkInformation * GetInputArrayFieldInformation (int idx, vtkInformationVector **inputVector)
 
virtual void SetNthInputConnection (int port, int index, vtkAlgorithmOutput *input)
 
virtual void SetErrorCode (unsigned long)
 
- Protected Member Functions inherited from vtkObject
 vtkObject ()
 
virtual ~vtkObject ()
 
virtual void RegisterInternal (vtkObjectBase *, int check)
 
virtual void UnRegisterInternal (vtkObjectBase *, int check)
 
void InternalGrabFocus (vtkCommand *mouseEvents, vtkCommand *keypressEvents=NULL)
 
void InternalReleaseFocus ()
 
- Protected Member Functions inherited from vtkObjectBase
 vtkObjectBase ()
 
virtual ~vtkObjectBase ()
 
virtual void CollectRevisions (ostream &os)
 
 vtkObjectBase (const vtkObjectBase &)
 
void operator= (const vtkObjectBase &)
 

Additional Inherited Members

- Public Attributes inherited from vtkAlgorithm
int AbortExecute
 
- Static Protected Member Functions inherited from vtkAlgorithm
static vtkInformationIntegerKey * PORT_REQUIREMENTS_FILLED ()
 
- Protected Attributes inherited from vtkAlgorithm
vtkInformation * Information
 
double Progress
 
char * ProgressText
 
unsigned long ErrorCode
 
- Protected Attributes inherited from vtkObject
unsigned char Debug
 
vtkTimeStamp MTime
 
vtkSubjectHelper * SubjectHelper
 
- Protected Attributes inherited from vtkObjectBase
int ReferenceCount
 
vtkWeakPointerBase ** WeakPointers
 
- Static Protected Attributes inherited from vtkAlgorithm
static vtkExecutive * DefaultExecutivePrototype
 

Detailed Description

Converts a document collection into a term collection.

Given an artifact table containing text documents, splits each document into its component tokens, producing a feature table containing the results.

Tokenization is performed by splitting input text into tokens based on character delimiters. Delimiters are divided into two categories: "dropped" and "kept". "Dropped" delimiters are discarded from the output, while "kept" delimiters are retained in the output as individual tokens. Initially, vtkTokenizer has no delimiters defined, so you must set some delimiters before use.

Users can reset and append to the lists of delimiters for each category. Delimiters are specified as half-open ranges of Unicode code points. This makes it easy to tokenize logosyllabic scripts such as Chinese, Korean, and Japanese by specifying an entire range of logograms as "kept" delimiters, so that individual glyphs become tokens.

Inputs: Input port 0: (required) A vtkTable containing zero-to-many "documents", with one document per table row, a vtkIdTypeArray column containing document ids, and a vtkUnicodeStringArray column containing the contents of each document. Input port 1: (optional) A vtkTable containing zero-to-many document ranges to be processed, with one range per table row, a vtkIdTypeArray column containing document ids, a vtkIdTypeArray containing begin offsets, and a vtkIdTypeArray column containing end offsets. If input port 1 is left unconnected, the filter will automatically process the entire contents of every input document.

Outputs: Output port 0: A vtkTable containing "document", "begin", "end", "type", and "text" columns.

Use SetInputArrayToProcess(0, ...) to specify the input table column that contains document ids (must be a vtkIdTypeArray). Default: "document"

Use SetInputArrayToProcess(1, ...) to specify the input table column that contains document contents (must be a vtkUnicodeStringArray). Default: "text"

Use SetInputArrayToProcess(2, 1, ...) to specify the input table column that contains range document ids (must be a vtkIdTypeArray). Defaults to "document".

Use SetInputArrayToProcess(3, 1, ...) to specify the input table column that contains range begin offsets (must be a vtkIdTypeArray). Defaults to "begin".

Use SetInputArrayToProcess(4, 1, ...) to specify the input table column that contains range end offsets (must be a vtkIdTypeArray). Defaults to "end".

Thanks:
Developed by Timothy M. Shead (tshea.nosp@m.d@sa.nosp@m.ndia..nosp@m.gov) at Sandia National Laboratories.
Events:
vtkCommand::ProgressEvent
Tests:
vtkTokenizer (Tests)

Definition at line 87 of file vtkTokenizer.h.

Member Typedef Documentation

Definition at line 92 of file vtkTokenizer.h.

Defines storage for a half-open range of Unicode characters [begin, end).

Definition at line 99 of file vtkTokenizer.h.

Defines storage for a collection of half-open ranges of Unicode characters, each of the form [begin, end).

Definition at line 102 of file vtkTokenizer.h.

Constructor & Destructor Documentation

vtkTokenizer::vtkTokenizer ( )
protected
vtkTokenizer::~vtkTokenizer ( )
protected

Member Function Documentation

static vtkTokenizer* vtkTokenizer::New ( )
static
virtual const char* vtkTokenizer::GetClassName ( )
virtual

Reimplemented from vtkTableAlgorithm.

static int vtkTokenizer::IsTypeOf ( const char *  type)
static
virtual int vtkTokenizer::IsA ( const char *  name)
virtual

Return 1 if this class is the same type of (or a subclass of) the named class. Returns 0 otherwise. This method works in combination with vtkTypeMacro found in vtkSetGet.h.

Reimplemented from vtkTableAlgorithm.

static vtkTokenizer* vtkTokenizer::SafeDownCast ( vtkObject * o)
static
void vtkTokenizer::PrintSelf ( ostream &  os,
vtkIndent  indent 
)
virtual

Methods invoked by print to print information about the object including superclasses. Typically not called by the user (use Print() instead) but used in the hierarchical print process to combine the output of several classes.

Reimplemented from vtkTableAlgorithm.

static const DelimiterRanges vtkTokenizer::Punctuation ( )
static

Returns a set of delimiter ranges that match Unicode punctuation codepoints.

static const DelimiterRanges vtkTokenizer::Whitespace ( )
static

Returns a set of delimiter ranges that match Unicode whitespace codepoints.

static const DelimiterRanges vtkTokenizer::Logosyllabic ( )
static

Returns a set of delimiter ranges that match Unicode logosyllabic codepoints.

void vtkTokenizer::AddDroppedDelimiters ( vtkUnicodeString::value_type  begin,
vtkUnicodeString::value_type  end 
)

Adds the half-open range of Unicode characters [begin, end) to the set of "dropped" delimiters.

void vtkTokenizer::AddDroppedDelimiters ( const DelimiterRanges & ranges)

Adds a collection of half-open ranges of Unicode characters, each of the form [begin, end), to the set of "dropped" delimiters.

void vtkTokenizer::AddKeptDelimiters ( vtkUnicodeString::value_type  begin,
vtkUnicodeString::value_type  end 
)

Adds the half-open range of Unicode characters [begin, end) to the set of "kept" delimiters.

void vtkTokenizer::AddKeptDelimiters ( const DelimiterRanges & ranges)

Adds a collection of half-open ranges of Unicode characters, each of the form [begin, end), to the set of "kept" delimiters.

void vtkTokenizer::DropPunctuation ( )

Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.

void vtkTokenizer::DropWhitespace ( )

Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.

void vtkTokenizer::KeepPunctuation ( )

Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.

void vtkTokenizer::KeepWhitespace ( )

Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.

void vtkTokenizer::KeepLogosyllabic ( )

Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.

void vtkTokenizer::ClearDroppedDelimiters ( )

Clears the set of "dropped" delimiters.

void vtkTokenizer::ClearKeptDelimiters ( )

Clears the set of "kept" delimiters.

int vtkTokenizer::FillInputPortInformation ( int  port,
vtkInformation * info
)
protected virtual

Fill the input port information objects for this algorithm. This is invoked by the first call to GetInputPortInformation for each port so subclasses can specify what they can handle.

Reimplemented from vtkTableAlgorithm.

virtual int vtkTokenizer::RequestData ( vtkInformation * request,
vtkInformationVector **  inputVector,
vtkInformationVector * outputVector
)
protected virtual

This is called by the superclass. This is the method you should override.

Reimplemented from vtkTableAlgorithm.


The documentation for this class was generated from the following file: