1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 r"""fieldactions.py: Definitions and implementations of field actions.
19
20 """
21 __docformat__ = "restructuredtext en"
22
23 import _checkxapian
24 import errors
25 import marshall
26 from replaylog import log
27 import xapian
28 import parsedate
29
def _act_store_content(fieldname, doc, value, context):
    """Perform the STORE_CONTENT action.

    Append `value` to the list kept under `fieldname` in ``doc.data``,
    creating that list first if the field has no stored values yet.

    `context` is not used by this action.

    """
    stored = doc.data
    try:
        values = stored[fieldname]
    except KeyError:
        # First value for this field: register a fresh list and keep a
        # reference to the same list object for the append below.
        values = stored[fieldname] = []
    values.append(value)
40
42 """Perform the INDEX_EXACT action.
43
44 """
45 doc.add_term(fieldname, value, 0)
46
def _act_tag(fieldname, doc, value, context):
    """Perform the TAG action.

    Add the lower-cased form of `value` to `doc` as a term for `fieldname`,
    passing 0 as the final argument of ``doc.add_term``.

    `context` is not used by this action.

    """
    folded = value.lower()
    doc.add_term(fieldname, folded, 0)
52
def _act_facet(fieldname, doc, value, context, type=None):
    """Perform the FACET action.

    When `type` is None or 'string', the lower-cased value is added to `doc`
    as a term for `fieldname`, and is also appended to the string list held
    in the field's 'facet' value, which is then written back.

    For any other `type`, the value is converted by the marshalling function
    which SortableMarshaller provides for that type, and the result is added
    as the field's 'facet' value.

    `context` is not used by this action.

    """
    if type is not None and type != 'string':
        # Non-string facet: store the marshalled form of the value.
        converter = SortableMarshaller().get_marshall_function(fieldname, type)
        doc.add_value(fieldname, converter(fieldname, value), 'facet')
        return

    # String facet: index the lower-cased value as a term, and extend the
    # serialised list of facet strings already held for the field.
    facet_text = value.lower()
    doc.add_term(fieldname, facet_text, 0)
    string_list = log(xapian.StringListSerialiser,
                      doc.get_value(fieldname, 'facet'))
    string_list.append(facet_text)
    doc.add_value(fieldname, string_list.get(), 'facet')
68
def _act_index_freetext(fieldname, doc, value, context, weight=1,
                        language=None, stop=None, spell=False,
                        nopos=False,
                        allow_field_specific=True,
                        search_by_default=True):
    """Perform the INDEX_FREETEXT action.

    Index `value` as text into `doc` using a xapian TermGenerator.

     - `weight` is passed to the term generator with each piece of text.
     - `language`, if not None, selects the stemmer to use.
     - `stop`, if not None, is an iterable of stopwords.
     - `spell`, if true, attaches ``context.index`` to the term generator
       and sets its FLAG_SPELLING flag.
     - `nopos`, if true, indexes the text without positional information.
     - `allow_field_specific`, if true, indexes the text with the field's
       prefix (provided that prefix is non-empty).
     - `search_by_default`, if true, indexes the text with an empty prefix.

    On completion, ``context.current_position`` is updated to the term
    generator's position after it has been increased by 10.

    """
    generator = log(xapian.TermGenerator)

    if language is not None:
        generator.set_stemmer(log(xapian.Stem, language))

    if stop is not None:
        stopwords = log(xapian.SimpleStopper)
        for word in stop:
            stopwords.add(word)
        generator.set_stopper(stopwords)

    if spell:
        generator.set_database(context.index)
        generator.set_flags(generator.FLAG_SPELLING)

    generator.set_document(doc._doc)

    def index_with_prefix(term_prefix):
        # Each copy of the text starts from the same term position.
        generator.set_termpos(context.current_position)
        if nopos:
            generator.index_text_without_positions(value, weight, term_prefix)
        else:
            generator.index_text(value, weight, term_prefix)

    if search_by_default:
        # Unprefixed copy of the text.
        index_with_prefix('')

    if allow_field_specific:
        # Second copy of the text, carrying the field's prefix.
        prefix = doc._fieldmappings.get_prefix(fieldname)
        if len(prefix) != 0:
            index_with_prefix(prefix)

    # Advance the position before it is recorded for the next call;
    # presumably this leaves a gap so that positional matches do not run
    # across separate field values - TODO confirm.
    generator.increase_termpos(10)
    context.current_position = generator.get_termpos()
117
119 """Implementation of marshalling for sortable values.
120
121 """
127
129 """Marshall a value for sorting in lexicographical order.
130
131 This returns the input as the output, since strings already sort in
132 lexicographical order.
133
134 """
135 return value
136
138 """Marshall a value for sorting as a floating point value.
139
140 """
141
142 try:
143 value = float(value)
144 except ValueError:
145 raise self._err("Value supplied to field %r must be a "
146 "valid floating point number: was %r" %
147 (fieldname, value))
148 return marshall.float_to_string(value)
149
161
163 """Get a function used to marshall values of a given sorttype.
164
165 """
166 try:
167 return {
168 None: self.marshall_string,
169 'string': self.marshall_string,
170 'float': self.marshall_float,
171 'date': self.marshall_date,
172 }[sorttype]
173 except KeyError:
174 raise self._err("Unknown sort type %r for field %r" %
175 (sorttype, fieldname))
176
177
186
class ActionContext(object):
    """State handed to each action while it is being performed.

    Instances carry the values which actions need to share with one
    another, such as the current word position.

    """
    def __init__(self, index):
        """Create a context for actions performed against `index`.

        The word position starts at 0 and no language is set.

        """
        self.index = index
        self.current_position = 0
        self.current_language = None
198
200 """An object describing the actions to be performed on a field.
201
202 The supported actions are:
203
204 - `STORE_CONTENT`: store the unprocessed content of the field in the search
205 engine database. All fields which need to be displayed or used when
206 displaying the search results need to be given this action.
207
208 - `INDEX_EXACT`: index the exact content of the field as a single search
209 term. Fields whose contents need to be searchable as an "exact match"
210 need to be given this action.
211
212 - `INDEX_FREETEXT`: index the content of this field as text. The content
213 will be split into terms, allowing free text searching of the field. The
214 following optional parameters may be supplied:
215
216 - 'weight' is a multiplier to apply to the importance of the field. This
217 must be an integer, and the default value is 1.
218 - 'language' is the language to use when processing the field. This can
219 be expressed as an ISO 2-letter language code. The supported languages
220 are those supported by the xapian core in use.
221 - 'stop' is an iterable of stopwords to filter out of the generated
222 terms. Note that due to Xapian design, only non-positional terms are
223 affected, so this is of limited use.
224 - 'spell' is a boolean flag - if true, the contents of the field will be
225 used for spelling correction.
226 - 'nopos' is a boolean flag - if true, positional information is not
227 stored.
228 - 'allow_field_specific' is a boolean flag - if False, prevents terms with the field
229 prefix being generated. This means that searches specific to this
230 field will not work, and thus should only be used when only non-field
231 specific searches are desired. Defaults to True.
232 - 'search_by_default' is a boolean flag - if False, the field will not be
233 searched by non-field specific searches. If True, or omitted, the
234 field will be included in non-field-specific searches.
235
236 - `SORTABLE`: index the content of the field such that it can be used to
237 sort result sets. It also allows result sets to be restricted to those
238 documents with field values in a given range. One optional parameter
239 may be supplied:
240
241 - 'type' is a value indicating how to sort the field. It has several
242 possible values:
243
244 - 'string' - sort in lexicographic (ie, alphabetical) order.
245 This is the default, used if no type is set.
246 - 'float' - treat the values as (decimal representations of) floating
247 point numbers, and sort in numerical order. The values in the field
248 must be valid floating point numbers (according to Python's float()
249 function).
250 - 'date' - sort in date order. The values must be valid dates (either
251 Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or
252 YYYY-MM-DD)).
253
254 - `COLLAPSE`: index the content of the field such that it can be used to
255 "collapse" result sets, such that only the highest result with each value
256 of the field will be returned.
257
258 - `TAG`: the field contains tags; these are strings, which will be matched
259 in a case insensitive way, but otherwise must be exact matches. Tag
260 fields can be searched for by making an explicit query (ie, using
261 query_field(), but not with query_parse()). A list of the most frequent
262 tags in a result set can also be accessed easily.
263
264 - `FACET`: the field represents a classification facet; these are strings
265 which will be matched exactly, but a list of all the facets present in
266 the result set can also be accessed easily - in addition, a suitable
267 subset of the facets, and a selection of the facet values, present in the
268 result set can be calculated. One optional parameter may be supplied:
269
270 - 'type' is a value indicating the type of facet contained in the field:
271
272 - 'string' - the facet values are exact binary strings.
273 - 'float' - the facet values are floating point numbers.
274
275 """
276
277
    # Identifiers for the actions which callers may pass to add().
    STORE_CONTENT = 1
    INDEX_EXACT = 2
    INDEX_FREETEXT = 3
    SORTABLE = 4
    COLLAPSE = 5
    TAG = 6
    FACET = 7

    # Internal identifier under which add() records both SORTABLE and
    # COLLAPSE requests.  add() does not accept it directly: it is not in
    # the list of actions which add() treats as known.
    SORT_AND_COLLAPSE = -1

    # Actions which add() rejects with an IndexerError.
    _unsupported_actions = []

    # An action is unsupported when _checkxapian lists the feature it needs
    # as missing.
    if 'tags' in _checkxapian.missing_features:
        _unsupported_actions.append(TAG)
    if 'facets' in _checkxapian.missing_features:
        _unsupported_actions.append(FACET)
297
299
300 self._actions = {}
301 self._fieldname = fieldname
302
    def add(self, field_mappings, action, **kwargs):
        """Add an action to perform on a field.

        `field_mappings` is the object on which any prefix or value slot
        needed by the action is registered for this field, through its
        add_prefix(), get_slot() and add_slot() methods.

        `action` is one of the action constants STORE_CONTENT, INDEX_EXACT,
        INDEX_FREETEXT, SORTABLE, COLLAPSE, TAG or FACET.

        `kwargs` holds the parameters for the action.  Only the parameter
        names listed for the action in `_action_info` are accepted.

        SORTABLE and COLLAPSE are both recorded as SORT_AND_COLLAPSE, with a
        'type' parameter (None for COLLAPSE, defaulting to 'string' for
        SORTABLE).  Adding an action whose parameters equal those of an
        already recorded instance of the same action does not record it
        again.

        Raises errors.IndexerError if the action is unsupported or unknown,
        if a parameter name is not valid for the action, if INDEX_EXACT and
        INDEX_FREETEXT are both requested for the field, or if the field is
        already marked for sorting with a different sort type.

        """
        if action in self._unsupported_actions:
            raise errors.IndexerError("Action unsupported with this release of xapian")

        if action not in (FieldActions.STORE_CONTENT,
                          FieldActions.INDEX_EXACT,
                          FieldActions.INDEX_FREETEXT,
                          FieldActions.SORTABLE,
                          FieldActions.COLLAPSE,
                          FieldActions.TAG,
                          FieldActions.FACET,
                         ):
            raise errors.IndexerError("Unknown field action: %r" % action)

        # info is (name, allowed parameter names, action function,
        # requirements dict) for the requested action.
        info = self._action_info[action]

        # Reject any parameter which the action does not accept.
        for key in kwargs.keys():
            if key not in info[1]:
                raise errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key))

        # INDEX_EXACT and INDEX_FREETEXT are mutually exclusive for a field:
        # whichever is added second is refused.
        if action == FieldActions.INDEX_EXACT:
            if FieldActions.INDEX_FREETEXT in self._actions:
                raise errors.IndexerError("Field %r is already marked for indexing "
                                          "as free text: cannot mark for indexing "
                                          "as exact text as well" % self._fieldname)
        if action == FieldActions.INDEX_FREETEXT:
            if FieldActions.INDEX_EXACT in self._actions:
                raise errors.IndexerError("Field %r is already marked for indexing "
                                          "as exact text: cannot mark for indexing "
                                          "as free text as well" % self._fieldname)

        # SORTABLE and COLLAPSE are merged into the single SORT_AND_COLLAPSE
        # action.  A COLLAPSE request carries no sort type of its own (None);
        # a SORTABLE request without a 'type' parameter defaults to 'string'.
        if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE:
            if action == FieldActions.COLLAPSE:
                sorttype = None
            else:
                try:
                    sorttype = kwargs['type']
                except KeyError:
                    sorttype = 'string'
            # Always record the type, so that later calls can read it back
            # from the stored parameters.
            kwargs['type'] = sorttype
            action = FieldActions.SORT_AND_COLLAPSE

            try:
                oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE]
            except KeyError:
                oldsortactions = ()

            if len(oldsortactions) > 0:
                # oldsorttype ends up as the type of the last entry which
                # was previously recorded.
                for oldsortaction in oldsortactions:
                    oldsorttype = oldsortaction['type']

                if sorttype == oldsorttype or oldsorttype is None:
                    # The new type applies: discard the old entries, and let
                    # the new parameters be appended below.
                    self._actions[action] = []
                elif sorttype is None:
                    # The new request has no type of its own: keep the
                    # existing entry, and record nothing further.
                    return
                else:
                    raise errors.IndexerError("Field %r is already marked for "
                                              "sorting, with a different "
                                              "sort type" % self._fieldname)

        # Register whatever the action requires.  Note that info is still the
        # entry for the action as originally requested.
        if 'prefix' in info[3]:
            field_mappings.add_prefix(self._fieldname)
        if 'slot' in info[3]:
            purposes = info[3]['slot']
            if isinstance(purposes, basestring):
                # A single purpose name.
                field_mappings.add_slot(self._fieldname, purposes)
            else:
                # Several purposes: look for a slot number which is already
                # assigned to one of them (None if there is none), and pass
                # that number when adding the slot for every purpose.
                slotnum = None
                for purpose in purposes:
                    slotnum = field_mappings.get_slot(self._fieldname, purpose)
                    if slotnum is not None:
                        break
                for purpose in purposes:
                    field_mappings.add_slot(self._fieldname, purpose, slotnum=slotnum)

        # Make sure there is a list of parameter sets for the action.
        if action not in self._actions:
            self._actions[action] = []

        # Do not record the same parameters for the action twice.
        for old_action in self._actions[action]:
            if old_action == kwargs:
                return

        # Record the parameters for this instance of the action.
        self._actions[action].append(kwargs)
403
416
    # Table describing each action.  Each value is a tuple holding:
    #  - the name of the action, which add() uses in error messages;
    #  - the names of the parameters which add() accepts for the action;
    #  - the function which performs the action (None for SORTABLE and
    #    COLLAPSE, which add() records as SORT_AND_COLLAPSE instead);
    #  - a dict of requirements: if it has a 'prefix' key, add() registers
    #    a prefix for the field, and if it has a 'slot' key, add() registers
    #    a value slot for the purpose (or purposes) given as its value.
    _action_info = {
        STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, {}, ),
        INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, {'prefix': True}, ),
        INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'allow_field_specific', 'search_by_default', ),
            _act_index_freetext, {'prefix': True, }, ),
        SORTABLE: ('SORTABLE', ('type', ), None, {'slot': 'collsort',}, ),
        COLLAPSE: ('COLLAPSE', (), None, {'slot': 'collsort',}, ),
        TAG: ('TAG', (), _act_tag, {'prefix': True,}, ),
        FACET: ('FACET', ('type', ), _act_facet, {'prefix': True, 'slot': 'facet',}, ),

        # Internal action under which SORTABLE and COLLAPSE are recorded.
        SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, {'slot': 'collsort',}, ),
    }
429
if __name__ == '__main__':
    # When run as a script, execute the doctests found in this module.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
433