1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 r"""datastructures.py: Datastructures for search engine core.
19
20 """
21 __docformat__ = "restructuredtext en"
22
import cPickle
import numbers

import xapian

import errors
from replaylog import log
27
29
30
31 __slots__ = 'name', 'value'
32
36
38 return 'Field(%r, %r)' % (self.name, self.value)
39
41 """An unprocessed document to be passed to the indexer.
42
43 This represents an item to be processed and stored in the search engine.
44 Each document will be processed by the indexer to generate a
45 ProcessedDocument, which can then be stored in the search engine index.
46
47 Note that some information in an UnprocessedDocument will not be
48 represented in the ProcessedDocument: therefore, it is not possible to
49 retrieve an UnprocessedDocument from the search engine index.
50
51 An unprocessed document is a simple container with two attributes:
52
53 - `fields` is a list of Field objects, or an iterator returning Field
54 objects.
55 - `id` is a string holding a unique identifier for the document (or
56 None to get the database to allocate a unique identifier automatically
57 when the document is added).
58
59 """
60
61 __slots__ = 'id', 'fields',
62 - def __init__(self, id=None, fields=None):
68
70 return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields)
71
73 """A processed document, as stored in the index.
74
75 This represents an item which is ready to be stored in the search engine,
76 or which has been returned by the search engine.
77
78 """
79
80 __slots__ = '_doc', '_fieldmappings', '_data',
81 - def __init__(self, fieldmappings, xapdoc=None):
82 """Create a ProcessedDocument.
83
84 `fieldmappings` is the configuration from a database connection used lookup
85 the configuration to use to store each field.
86
87 If supplied, `xapdoc` is a Xapian document to store in the processed
88 document. Otherwise, a new Xapian document is created.
89
90 """
91 if xapdoc is None:
92 self._doc = log(xapian.Document)
93 else:
94 self._doc = xapdoc
95 self._fieldmappings = fieldmappings
96 self._data = None
97
def add_term(self, field, term, wdfinc=1, positions=None):
    """Add a term to the document.

    Terms are the main unit of information used for performing searches.

    - `field` is the field to add the term to.
    - `term` is the term to add.
    - `wdfinc` is the value to increase the within-document-frequency
      measure for the term by.
    - `positions` is the positional information to add for the term.
      This may be None to indicate that there is no positional information,
      or may be an integer (any `numbers.Integral`, including a Python 2
      `long`) to specify one position, or may be a sequence of integers to
      specify several positions.  (Note that the wdf is not increased
      automatically for each position: if you add a term at 7 positions,
      and the wdfinc value is 2, the total wdf for the term will only be
      increased by 2, not by 14.)

    Raises `errors.IndexerError` if the prefixed term is longer than 220
    characters.

    """
    prefix = self._fieldmappings.get_prefix(field)
    # A term starting with an ASCII capital gets a ':' between the prefix
    # and the term.  The explicit 'A'..'Z' range (rather than isupper())
    # keeps the check independent of locale.
    # NOTE(review): presumably this mirrors Xapian's prefix convention -
    # confirm against the query parser.
    if term and 'A' <= term[0] <= 'Z':
        prefix += ':'

    # Build the stored term once; it is used for the length check and for
    # every call into the underlying document.
    full_term = prefix + term
    if len(full_term) > 220:
        raise errors.IndexerError("Field %r is too long: maximum length "
                                  "220 - was %d (%r)" %
                                  (field, len(full_term), full_term))

    if positions is None:
        self._doc.add_term(full_term, wdfinc)
    elif isinstance(positions, numbers.Integral):
        # A single position: the posting itself carries the wdf increase.
        self._doc.add_posting(full_term, positions, wdfinc)
    else:
        # Several positions: apply the wdf increase once, then add each
        # posting with a zero increment so the wdf is not multiplied.
        self._doc.add_term(full_term, wdfinc)
        for pos in positions:
            self._doc.add_posting(full_term, pos, 0)
150
def add_value(self, field, value, purpose=''):
    """Store a value for `field` in the document.

    Values are extra pieces of information consulted while a search is
    being performed.  They are _not_ meant to hold information for display
    in search results - use the document data for that.  As little
    information as possible should be kept in values, so that they can be
    read as quickly as possible during a search.

    A document holds at most one value per field (unlike terms, of which a
    field may have any number).  If several values are added for the same
    field, only the last one added is kept.

    - `field` is the field the value belongs to.
    - `value` is the value to store.
    - `purpose` is passed, together with `field`, to the field mappings to
      select the slot the value is stored in.

    """
    target_slot = self._fieldmappings.get_slot(field, purpose)
    document = self._doc
    document.add_value(target_slot, value)
169
171 """Get a value from the document.
172
173 """
174 slot = self._fieldmappings.get_slot(field, purpose)
175 return self._doc.get_value(slot)
176
178 """Prepare the document for adding to a xapian database.
179
180 This updates the internal xapian document with any changes which have
181 been made, and then returns it.
182
183 """
184 if self._data is not None:
185 self._doc.set_data(cPickle.dumps(self._data, 2))
186 self._data = None
187 return self._doc
188
190 if self._data is None:
191 rawdata = self._doc.get_data()
192 if rawdata == '':
193 self._data = {}
194 else:
195 self._data = cPickle.loads(rawdata)
196 return self._data
198 if not isinstance(data, dict):
199 raise TypeError("Cannot set data to any type other than a dict")
200 self._data = data
201 data = property(_get_data, _set_data, doc=
202 """The data stored in this processed document.
203
204 This data is a dictionary of entries, where the key is a fieldname, and the
205 value is a list of strings.
206
207 """)
208
210 tl = self._doc.termlist()
211 try:
212 term = tl.skip_to('Q').term
213 if len(term) == 0 or term[0] != 'Q':
214 return None
215 except StopIteration:
216 return None
217 return term[1:]
219 tl = self._doc.termlist()
220 try:
221 term = tl.skip_to('Q').term
222 except StopIteration:
223 term = ''
224 if len(term) != 0 and term[0] == 'Q':
225 self._doc.remove_term(term)
226 if id is not None:
227 self._doc.add_term('Q' + id, 0)
228 id = property(_get_id, _set_id, doc=
229 """The unique ID for this document.
230
231 """)
232
234 return '<ProcessedDocument(%r)>' % (self.id)
235
if __name__ == '__main__':
    # When executed as a script, run the doctests found in this module.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
239