VTK
|
Converts a document collection into a term collection. More...
#include <vtkTokenizer.h>
Public Types | |
typedef vtkTableAlgorithm | Superclass |
typedef vtkstd::pair< vtkUnicodeString::value_type, vtkUnicodeString::value_type > | DelimiterRange |
typedef vtkstd::vector< DelimiterRange > | DelimiterRanges |
![]() | |
typedef vtkAlgorithm | Superclass |
![]() | |
typedef vtkObject | Superclass |
![]() | |
typedef vtkObjectBase | Superclass |
Public Member Functions | |
virtual const char * | GetClassName () |
virtual int | IsA (const char *type) |
void | PrintSelf (ostream &os, vtkIndent indent) |
void | AddDroppedDelimiters (vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end) |
void | AddDroppedDelimiters (const DelimiterRanges &ranges) |
void | AddKeptDelimiters (vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end) |
void | AddKeptDelimiters (const DelimiterRanges &ranges) |
void | DropPunctuation () |
void | DropWhitespace () |
void | KeepPunctuation () |
void | KeepWhitespace () |
void | KeepLogosyllabic () |
void | ClearDroppedDelimiters () |
void | ClearKeptDelimiters () |
![]() | |
virtual int | ProcessRequest (vtkInformation *, vtkInformationVector **, vtkInformationVector *) |
vtkTable * | GetOutput () |
vtkTable * | GetOutput (int index) |
void | SetInput (vtkDataObject *obj) |
void | SetInput (int index, vtkDataObject *obj) |
![]() | |
int | HasExecutive () |
vtkExecutive * | GetExecutive () |
virtual void | SetExecutive (vtkExecutive *executive) |
virtual int | ModifyRequest (vtkInformation *request, int when) |
vtkInformation * | GetInputPortInformation (int port) |
vtkInformation * | GetOutputPortInformation (int port) |
int | GetNumberOfInputPorts () |
int | GetNumberOfOutputPorts () |
void | UpdateProgress (double amount) |
vtkInformation * | GetInputArrayInformation (int idx) |
void | RemoveAllInputs () |
vtkDataObject * | GetOutputDataObject (int port) |
virtual void | RemoveInputConnection (int port, vtkAlgorithmOutput *input) |
int | GetNumberOfInputConnections (int port) |
int | GetTotalNumberOfInputConnections () |
vtkAlgorithmOutput * | GetInputConnection (int port, int index) |
virtual void | Update () |
virtual void | UpdateInformation () |
virtual void | UpdateWholeExtent () |
void | ConvertTotalInputToPortConnection (int ind, int &port, int &conn) |
virtual double | ComputePriority () |
int | ProcessRequest (vtkInformation *request, vtkCollection *inInfo, vtkInformationVector *outInfo) |
virtual int | ComputePipelineMTime (vtkInformation *request, vtkInformationVector **inInfoVec, vtkInformationVector *outInfoVec, int requestFromOutputPort, unsigned long *mtime) |
virtual vtkInformation * | GetInformation () |
virtual void | SetInformation (vtkInformation *) |
virtual void | Register (vtkObjectBase *o) |
virtual void | UnRegister (vtkObjectBase *o) |
virtual void | SetAbortExecute (int) |
virtual int | GetAbortExecute () |
virtual void | AbortExecuteOn () |
virtual void | AbortExecuteOff () |
virtual void | SetProgress (double) |
virtual double | GetProgress () |
void | SetProgressText (const char *ptext) |
virtual char * | GetProgressText () |
virtual unsigned long | GetErrorCode () |
virtual void | SetInputArrayToProcess (int idx, int port, int connection, int fieldAssociation, const char *name) |
virtual void | SetInputArrayToProcess (int idx, int port, int connection, int fieldAssociation, int fieldAttributeType) |
virtual void | SetInputArrayToProcess (int idx, vtkInformation *info) |
virtual void | SetInputArrayToProcess (int idx, int port, int connection, const char *fieldAssociation, const char *attributeTypeorName) |
vtkDataObject * | GetInputDataObject (int port, int connection) |
virtual void | SetInputConnection (int port, vtkAlgorithmOutput *input) |
virtual void | SetInputConnection (vtkAlgorithmOutput *input) |
virtual void | AddInputConnection (int port, vtkAlgorithmOutput *input) |
virtual void | AddInputConnection (vtkAlgorithmOutput *input) |
vtkAlgorithmOutput * | GetOutputPort (int index) |
vtkAlgorithmOutput * | GetOutputPort () |
virtual void | SetReleaseDataFlag (int) |
virtual int | GetReleaseDataFlag () |
void | ReleaseDataFlagOn () |
void | ReleaseDataFlagOff () |
int | UpdateExtentIsEmpty (vtkDataObject *output) |
int | UpdateExtentIsEmpty (vtkInformation *pinfo, int extentType) |
![]() | |
virtual void | DebugOn () |
virtual void | DebugOff () |
unsigned char | GetDebug () |
void | SetDebug (unsigned char debugFlag) |
virtual void | Modified () |
virtual unsigned long | GetMTime () |
unsigned long | AddObserver (unsigned long event, vtkCommand *, float priority=0.0f) |
unsigned long | AddObserver (const char *event, vtkCommand *, float priority=0.0f) |
vtkCommand * | GetCommand (unsigned long tag) |
void | RemoveObserver (vtkCommand *) |
void | RemoveObservers (unsigned long event, vtkCommand *) |
void | RemoveObservers (const char *event, vtkCommand *) |
int | HasObserver (unsigned long event, vtkCommand *) |
int | HasObserver (const char *event, vtkCommand *) |
void | RemoveObserver (unsigned long tag) |
void | RemoveObservers (unsigned long event) |
void | RemoveObservers (const char *event) |
void | RemoveAllObservers () |
int | HasObserver (unsigned long event) |
int | HasObserver (const char *event) |
template<class U , class T > | |
unsigned long | AddObserver (unsigned long event, U observer, void(T::*callback)(), float priority=0.0f) |
template<class U , class T > | |
unsigned long | AddObserver (unsigned long event, U observer, void(T::*callback)(vtkObject *, unsigned long, void *), float priority=0.0f) |
int | InvokeEvent (unsigned long event, void *callData) |
int | InvokeEvent (const char *event, void *callData) |
int | InvokeEvent (unsigned long event) |
int | InvokeEvent (const char *event) |
![]() | |
const char * | GetClassName () const |
virtual void | Delete () |
virtual void | FastDelete () |
void | Print (ostream &os) |
void | SetReferenceCount (int) |
void | PrintRevisions (ostream &os) |
virtual void | PrintHeader (ostream &os, vtkIndent indent) |
virtual void | PrintTrailer (ostream &os, vtkIndent indent) |
int | GetReferenceCount () |
Additional Inherited Members | |
![]() | |
int | AbortExecute |
![]() | |
static vtkInformationIntegerKey * | PORT_REQUIREMENTS_FILLED () |
![]() | |
vtkInformation * | Information |
double | Progress |
char * | ProgressText |
unsigned long | ErrorCode |
![]() | |
unsigned char | Debug |
vtkTimeStamp | MTime |
vtkSubjectHelper * | SubjectHelper |
![]() | |
int | ReferenceCount |
vtkWeakPointerBase ** | WeakPointers |
![]() | |
static vtkExecutive * | DefaultExecutivePrototype |
Converts a document collection into a term collection.
Given an artifact table containing text documents, splits each document into its component tokens, producing a feature table containing the results.
Tokenization is performed by splitting input text into tokens based on character delimiters. Delimiters are divided into two categories: "dropped" and "kept". "Dropped" delimiters are discarded from the output, while "kept" delimiters are retained in the output as individual tokens. Initially, vtkTokenizer has no delimiters defined, so you must set some delimiters before use.
Users can reset and append to the lists of delimiters for each category. Delimiters are specified as half-open ranges of Unicode code points. This makes it easy to tokenize logosyllabic scripts such as Chinese, Korean, and Japanese by specifying an entire range of logograms as "kept" delimiters, so that individual glyphs become tokens.
Inputs: Input port 0: (required) A vtkTable containing zero-to-many "documents", with one document per table row, a vtkIdTypeArray column containing document ids, and a vtkUnicodeStringArray column containing the contents of each document. Input port 1: (optional) A vtkTable containing zero-to-many document ranges to be processed, with one range per table row, a vtkIdTypeArray column containing document ids, a vtkIdTypeArray column containing begin offsets, and a vtkIdTypeArray column containing end offsets. If input port 1 is left unconnected, the filter will automatically process the entire contents of every input document.
Outputs: Output port 0: A vtkTable containing "document", "begin", "end", "type", and "text" columns.
Use SetInputArrayToProcess(0, ...) to specify the input table column that contains document ids (must be a vtkIdTypeArray). Defaults to "document".
Use SetInputArrayToProcess(1, ...) to specify the input table column that contains document contents (must be a vtkUnicodeStringArray). Defaults to "text".
Use SetInputArrayToProcess(2, 1, ...) to specify the input table column that contains range document ids (must be a vtkIdTypeArray). Defaults to "document".
Use SetInputArrayToProcess(3, 1, ...) to specify the input table column that contains range begin offsets (must be a vtkIdTypeArray). Defaults to "begin".
Use SetInputArrayToProcess(4, 1, ...) to specify the input table column that contains range end offsets (must be a vtkIdTypeArray). Defaults to "end".
Definition at line 87 of file vtkTokenizer.h.
Definition at line 92 of file vtkTokenizer.h.
typedef vtkstd::pair<vtkUnicodeString::value_type, vtkUnicodeString::value_type> vtkTokenizer::DelimiterRange |
Defines storage for a half-open range of Unicode characters [begin, end).
Definition at line 99 of file vtkTokenizer.h.
typedef vtkstd::vector<DelimiterRange> vtkTokenizer::DelimiterRanges |
Defines storage for a collection of half-open ranges of Unicode characters.
Definition at line 102 of file vtkTokenizer.h.
|
protected |
|
protected |
|
static |
|
virtual |
Reimplemented from vtkTableAlgorithm.
|
static |
|
virtual |
Return 1 if this class is the same type of (or a subclass of) the named class. Returns 0 otherwise. This method works in combination with vtkTypeMacro found in vtkSetGet.h.
Reimplemented from vtkTableAlgorithm.
|
static |
|
virtual |
Methods invoked by print to print information about the object including superclasses. Typically not called by the user (use Print() instead) but used in the hierarchical print process to combine the output of several classes.
Reimplemented from vtkTableAlgorithm.
|
static |
Returns a set of delimiter ranges that match Unicode punctuation codepoints.
|
static |
Returns a set of delimiter ranges that match Unicode whitespace codepoints.
|
static |
Returns a set of delimiter ranges that match logosyllabic scripts, in which individual characters represent words rather than sounds (such as Chinese, Japanese, and Korean).
void vtkTokenizer::AddDroppedDelimiters | ( | vtkUnicodeString::value_type | begin, |
vtkUnicodeString::value_type | end | ||
) |
Adds the half-open range of Unicode characters [begin, end) to the set of "dropped" delimiters.
void vtkTokenizer::AddDroppedDelimiters | ( | const DelimiterRanges & | ranges | ) |
Adds a collection of half-open ranges of Unicode characters, each of the form [begin, end), to the set of "dropped" delimiters.
void vtkTokenizer::AddKeptDelimiters | ( | vtkUnicodeString::value_type | begin, |
vtkUnicodeString::value_type | end | ||
) |
Adds the half-open range of Unicode characters [begin, end) to the set of "kept" delimiters.
void vtkTokenizer::AddKeptDelimiters | ( | const DelimiterRanges & | ranges | ) |
Adds a collection of half-open ranges of Unicode characters, each of the form [begin, end), to the set of "kept" delimiters.
void vtkTokenizer::DropPunctuation | ( | ) |
Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.
void vtkTokenizer::DropWhitespace | ( | ) |
Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.
void vtkTokenizer::KeepPunctuation | ( | ) |
Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.
void vtkTokenizer::KeepWhitespace | ( | ) |
Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.
void vtkTokenizer::KeepLogosyllabic | ( | ) |
Convenience functions to specify delimiters, mainly intended for use from Python and the ParaView server manager. C++ developers are strongly encouraged to use AddDroppedDelimiters(...) and AddKeptDelimiters(...) instead.
void vtkTokenizer::ClearDroppedDelimiters | ( | ) |
Clears the set of "dropped" delimiters.
void vtkTokenizer::ClearKeptDelimiters | ( | ) |
Clears the set of "kept" delimiters.
|
protected virtual |
Fill the input port information objects for this algorithm. This is invoked by the first call to GetInputPortInformation for each port so subclasses can specify what they can handle.
Reimplemented from vtkTableAlgorithm.
|
protected virtual |
This is called by the superclass. This is the method you should override.
Reimplemented from vtkTableAlgorithm.