
Source Code for Module xappy.searchconnection

#!/usr/bin/env python
#
# Copyright (C) 2007 Lemur Consulting Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
r"""searchconnection.py: A connection to the search engine for searching.

"""
__docformat__ = "restructuredtext en"

import _checkxapian
import os as _os
import cPickle as _cPickle
import math

import xapian as _xapian
from datastructures import *
from fieldactions import *
import fieldmappings as _fieldmappings
import highlight as _highlight
import errors as _errors
import indexerconnection as _indexerconnection
import re as _re
from replaylog import log as _log


class SearchResult(ProcessedDocument):
    """A result from a search.

    As well as being a ProcessedDocument representing the document in the
    database, the result has several members which may be used to get
    information about how well the document matches the search:

     - `rank`: The rank of the document in the search results, starting at 0
       (ie, 0 is the "top" result, 1 is the second result, etc).

     - `weight`: A floating point number indicating the weight of the result
       document.  The value is only meaningful relative to other results for
       a given search - a different search, or the same search with a
       different database, may give an entirely different scale to the
       weights.  This should not usually be displayed to users, but may be
       useful if trying to perform advanced reweighting operations on search
       results.

     - `percent`: A percentage value for the weight of a document.  This is
       just a rescaled form of the `weight` member.  It doesn't represent any
       kind of probability value; the only real meaning of the numbers is
       that, within a single set of results, a document with a higher
       percentage corresponds to a better match.  Because the percentage
       doesn't really represent a probability, or a confidence value, it is
       probably unhelpful to display it to most users, since they tend to
       place an over emphasis on its meaning.  However, it is included
       because it may be useful occasionally.

    """
    def __init__(self, msetitem, results):
        ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document)
        self.rank = msetitem.rank
        self.weight = msetitem.weight
        self.percent = msetitem.percent
        self._results = results

    def _get_language(self, field):
        """Get the language that should be used for a given field.

        Raises a KeyError if the field is not known.

        """
        actions = self._results._conn._field_actions[field]._actions
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.INDEX_FREETEXT:
                for kwargs in kwargslist:
                    try:
                        return kwargs['language']
                    except KeyError:
                        pass
        return 'none'

    def summarise(self, field, maxlen=600, hl=('<b>', '</b>'), query=None):
        """Return a summarised version of the field specified.

        This will return a summary of the contents of the field stored in the
        search result, with words which match the query highlighted.

        The maximum length of the summary (in characters) may be set using
        the maxlen parameter.

        The return value will be a string holding the summary, with
        highlighting applied.  If there are multiple instances of the field
        in the document, the instances will be joined with a newline
        character.

        To turn off highlighting, set hl to None.  Each highlight will
        consist of the first entry in the `hl` list being placed before the
        word, and the second entry in the `hl` list being placed after the
        word.

        Any XML or HTML style markup tags in the field will be stripped
        before the summarisation algorithm is applied.

        If `query` is supplied, it should contain a Query object, as returned
        from SearchConnection.query_parse() or related methods, which will be
        used as the basis of the summarisation and highlighting rather than
        the query which was used for the search.

        Raises KeyError if the field is not known.

        """
        highlighter = _highlight.Highlighter(language_code=self._get_language(field))
        field = self.data[field]
        text = '\n'.join(field)
        if query is None:
            query = self._results._query
        return highlighter.makeSample(text, query, maxlen, hl)

    def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False, query=None):
        """Return a highlighted version of the field specified.

        This will return all the contents of the field stored in the search
        result, with words which match the query highlighted.

        The return value will be a list of strings (corresponding to the list
        of strings which is the raw field data).

        Each highlight will consist of the first entry in the `hl` list being
        placed before the word, and the second entry in the `hl` list being
        placed after the word.

        If `strip_tags` is True, any XML or HTML style markup tags in the
        field will be stripped before highlighting is applied.

        If `query` is supplied, it should contain a Query object, as returned
        from SearchConnection.query_parse() or related methods, which will be
        used as the basis of the summarisation and highlighting rather than
        the query which was used for the search.

        Raises KeyError if the field is not known.

        """
        highlighter = _highlight.Highlighter(language_code=self._get_language(field))
        field = self.data[field]
        results = []
        if query is None:
            query = self._results._query
        for text in field:
            results.append(highlighter.highlight(text, query, hl, strip_tags))
        return results

    def __repr__(self):
        return ('<SearchResult(rank=%d, id=%r, data=%r)>' %
                (self.rank, self.id, self.data))

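# A brief usage sketch (editor's addition, not part of the original module):
# given a SearchResults object `results` from SearchConnection.search(), each
# item is a SearchResult, and the members documented above can be read
# directly.  The field name 'text' is hypothetical.
#
#     for result in results:
#         print result.rank, result.percent
#         print result.summarise('text', maxlen=300)

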
class SearchResultIter(object):
    """An iterator over a set of results from a search.

    """
    def __init__(self, results, order):
        self._results = results
        self._order = order
        if self._order is None:
            self._iter = iter(results._mset)
        else:
            self._iter = iter(self._order)

    def next(self):
        if self._order is None:
            msetitem = self._iter.next()
        else:
            index = self._iter.next()
            msetitem = self._results._mset.get_hit(index)
        return SearchResult(msetitem, self._results)


def _get_significant_digits(value, lower, upper):
    """Get the significant digits of value which are constrained by the
    (inclusive) lower and upper bounds.

    If there are no significant digits which are definitely within the
    bounds, exactly one significant digit will be returned in the result.

    >>> _get_significant_digits(15,15,15)
    15
    >>> _get_significant_digits(15,15,17)
    20
    >>> _get_significant_digits(4777,208,6000)
    5000
    >>> _get_significant_digits(4777,4755,4790)
    4800
    >>> _get_significant_digits(4707,4695,4710)
    4700
    >>> _get_significant_digits(4719,4717,4727)
    4720
    >>> _get_significant_digits(0,0,0)
    0
    >>> _get_significant_digits(9,9,10)
    9
    >>> _get_significant_digits(9,9,100)
    9

    """
    assert(lower <= value)
    assert(value <= upper)
    diff = upper - lower

    # Get the first power of 10 greater than the difference.
    # This corresponds to the magnitude of the smallest significant digit.
    if diff == 0:
        pos_pow_10 = 1
    else:
        pos_pow_10 = int(10 ** math.ceil(math.log10(diff)))

    # Special case for situation where we don't have any significant digits:
    # get the magnitude of the most significant digit in value.
    if pos_pow_10 > value:
        if value == 0:
            pos_pow_10 = 1
        else:
            pos_pow_10 = int(10 ** math.floor(math.log10(value)))

    # Return the value, rounded to the nearest multiple of pos_pow_10
    return ((value + pos_pow_10 // 2) // pos_pow_10) * pos_pow_10


class SearchResults(object):
    """A set of results of a search.

    """
    def __init__(self, conn, enq, query, mset, fieldmappings, tagspy,
                 tagfields, facetspy, facetfields, facethierarchy,
                 facetassocs):
        self._conn = conn
        self._enq = enq
        self._query = query
        self._mset = mset
        self._mset_order = None
        self._fieldmappings = fieldmappings
        self._tagspy = tagspy
        if tagfields is None:
            self._tagfields = None
        else:
            self._tagfields = set(tagfields)
        self._facetspy = facetspy
        self._facetfields = facetfields
        self._facethierarchy = facethierarchy
        self._facetassocs = facetassocs
        self._numeric_ranges_built = {}

    def _cluster(self, num_clusters, maxdocs, fields=None):
        """Cluster results based on similarity.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        The number of clusters is specified by num_clusters: unless there are
        too few results, there will be exactly this number of clusters in the
        result.

        """
        clusterer = _xapian.ClusterSingleLink()
        xapclusters = _xapian.ClusterAssignments()
        docsim = _xapian.DocSimCosine()
        source = _xapian.MSetDocumentSource(self._mset, maxdocs)

        if fields is None:
            clusterer.cluster(self._conn._index, xapclusters, docsim, source, num_clusters)
        else:
            decider = self._make_expand_decider(fields)
            clusterer.cluster(self._conn._index, xapclusters, docsim, source, decider, num_clusters)

        newid = 0
        idmap = {}
        clusters = {}
        for item in self._mset:
            docid = item.docid
            clusterid = xapclusters.cluster(docid)
            if clusterid not in idmap:
                idmap[clusterid] = newid
                newid += 1
            clusterid = idmap[clusterid]
            if clusterid not in clusters:
                clusters[clusterid] = []
            clusters[clusterid].append(item.rank)
        return clusters

    def _reorder_by_clusters(self, clusters):
        """Reorder the mset based on some clusters.

        """
        if self.startrank != 0:
            raise _errors.SearchError("startrank must be zero to reorder by clusters")
        tophits = []
        nottophits = []

        clusterstarts = dict(((c[0], None) for c in clusters.itervalues()))
        for i in xrange(self.endrank):
            if i in clusterstarts:
                tophits.append(i)
            else:
                nottophits.append(i)
        self._mset_order = tophits
        self._mset_order.extend(nottophits)

    def _make_expand_decider(self, fields):
        """Make an expand decider which accepts only terms in the specified
        fields.

        """
        prefixes = {}
        if isinstance(fields, basestring):
            fields = [fields]
        for field in fields:
            try:
                actions = self._conn._field_actions[field]._actions
            except KeyError:
                continue
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
                    prefixes['Z' + prefix] = None
                if action in (FieldActions.INDEX_EXACT,
                              FieldActions.TAG,
                              FieldActions.FACET,):
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
        prefix_re = _re.compile('|'.join([_re.escape(x) + '[^A-Z]' for x in prefixes.keys()]))
        class decider(_xapian.ExpandDecider):
            def __call__(self, term):
                return prefix_re.match(term) is not None
        return decider()

    def _reorder_by_similarity(self, count, maxcount, max_similarity,
                               fields=None):
        """Reorder results based on similarity.

        The top `count` documents will be chosen such that they are
        relatively dissimilar.  `maxcount` documents will be considered for
        moving around, and `max_similarity` is a value between 0 and 1
        indicating the maximum similarity to the previous document before a
        document is moved down the result set.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        """
        if self.startrank != 0:
            raise _errors.SearchError("startrank must be zero to reorder by similarity")
        ds = _xapian.DocSimCosine()
        ds.set_termfreqsource(_xapian.DatabaseTermFreqSource(self._conn._index))

        if fields is not None:
            ds.set_expand_decider(self._make_expand_decider(fields))

        tophits = []
        nottophits = []
        full = False
        reordered = False

        sim_count = 0
        new_order = []
        end = min(self.endrank, maxcount)
        for i in xrange(end):
            if full:
                new_order.append(i)
                continue
            hit = self._mset.get_hit(i)
            if len(tophits) == 0:
                tophits.append(hit)
                continue

            # Compare the incoming hit to the most recent tophit.
            maxsim = 0.0
            for tophit in tophits[-1:]:
                sim_count += 1
                sim = ds.similarity(hit.document, tophit.document)
                if sim > maxsim:
                    maxsim = sim

            # If it's not similar to an existing hit, add to tophits.
            if maxsim < max_similarity:
                tophits.append(hit)
            else:
                nottophits.append(hit)
                reordered = True

            # If we're full of hits, append to the end.
            if len(tophits) >= count:
                for hit in tophits:
                    new_order.append(hit.rank)
                for hit in nottophits:
                    new_order.append(hit.rank)
                full = True
        if not full:
            for hit in tophits:
                new_order.append(hit.rank)
            for hit in nottophits:
                new_order.append(hit.rank)
        if end != self.endrank:
            new_order.extend(range(end, self.endrank))
        assert len(new_order) == self.endrank
        if reordered:
            self._mset_order = new_order
        else:
            assert new_order == range(self.endrank)

    def __repr__(self):
        return ("<SearchResults(startrank=%d, "
                "endrank=%d, "
                "more_matches=%s, "
                "matches_lower_bound=%d, "
                "matches_upper_bound=%d, "
                "matches_estimated=%d, "
                "estimate_is_exact=%s)>" %
                (
                    self.startrank,
                    self.endrank,
                    self.more_matches,
                    self.matches_lower_bound,
                    self.matches_upper_bound,
                    self.matches_estimated,
                    self.estimate_is_exact,
                ))

    def _get_more_matches(self):
        # This check relies on us having asked for at least one more result
        # than retrieved to be checked.
        return (self.matches_lower_bound > self.endrank)
    more_matches = property(_get_more_matches, doc=
    """Check whether there are further matches after those in this result set.

    """)

    def _get_startrank(self):
        return self._mset.get_firstitem()
    startrank = property(_get_startrank, doc=
    """Get the rank of the first item in the search results.

    This corresponds to the "startrank" parameter passed to the search()
    method.

    """)

    def _get_endrank(self):
        return self._mset.get_firstitem() + len(self._mset)
    endrank = property(_get_endrank, doc=
    """Get the rank of the item after the end of the search results.

    If there are sufficient results in the index, this corresponds to the
    "endrank" parameter passed to the search() method.

    """)

    def _get_lower_bound(self):
        return self._mset.get_matches_lower_bound()
    matches_lower_bound = property(_get_lower_bound, doc=
    """Get a lower bound on the total number of matching documents.

    """)

    def _get_upper_bound(self):
        return self._mset.get_matches_upper_bound()
    matches_upper_bound = property(_get_upper_bound, doc=
    """Get an upper bound on the total number of matching documents.

    """)

    def _get_human_readable_estimate(self):
        lower = self._mset.get_matches_lower_bound()
        upper = self._mset.get_matches_upper_bound()
        est = self._mset.get_matches_estimated()
        return _get_significant_digits(est, lower, upper)
    matches_human_readable_estimate = property(_get_human_readable_estimate,
                                               doc=
    """Get a human readable estimate of the number of matching documents.

    This consists of the value returned by the "matches_estimated" property,
    rounded to an appropriate number of significant digits (as determined by
    the values of the "matches_lower_bound" and "matches_upper_bound"
    properties).

    """)

    def _get_estimated(self):
        return self._mset.get_matches_estimated()
    matches_estimated = property(_get_estimated, doc=
    """Get an estimate for the total number of matching documents.

    """)

    def _estimate_is_exact(self):
        return self._mset.get_matches_lower_bound() == \
               self._mset.get_matches_upper_bound()
    estimate_is_exact = property(_estimate_is_exact, doc=
    """Check whether the estimated number of matching documents is exact.

    If this returns true, the estimate given by the `matches_estimated`
    property is guaranteed to be correct.

    If this returns false, it is possible that the actual number of matching
    documents is different from the number given by the `matches_estimated`
    property.

    """)

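    # Editor's sketch of how the match-count properties above are typically
    # used together (assuming `results` is a SearchResults object):
    #
    #     if results.estimate_is_exact:
    #         print "%d matches" % results.matches_estimated
    #     else:
    #         print "about %d matches" % results.matches_human_readable_estimate
    #     if results.more_matches:
    #         pass  # offer a "next page" link
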
    def get_hit(self, index):
        """Get the hit with a given index.

        """
        if self._mset_order is None:
            msetitem = self._mset.get_hit(index)
        else:
            msetitem = self._mset.get_hit(self._mset_order[index])
        return SearchResult(msetitem, self)
    __getitem__ = get_hit

    def __iter__(self):
        """Get an iterator over the hits in the search result.

        The iterator returns the results in increasing order of rank.

        """
        return SearchResultIter(self, self._mset_order)

    def __len__(self):
        """Get the number of hits in the search result.

        Note that this is not (usually) the number of matching documents for
        the search.  If startrank is non-zero, it's not even the rank of the
        last document in the search result.  It's simply the number of hits
        stored in the search result.

        It is, however, the number of items returned by the iterator produced
        by calling iter() on this SearchResults object.

        """
        return len(self._mset)

    def get_top_tags(self, field, maxtags):
        """Get the most frequent tags in a given field.

         - `field` - the field to get tags for.  This must have been
           specified in the "gettags" argument of the search() call.
         - `maxtags` - the maximum number of tags to return.

        Returns a sequence of 2-item tuples, in which the first item in the
        tuple is the tag, and the second is the frequency of the tag in the
        matches seen (as an integer).

        """
        if 'tags' in _checkxapian.missing_features:
            raise _errors.SearchError("Tags unsupported with this release of xapian")
        if self._tagspy is None or field not in self._tagfields:
            raise _errors.SearchError("Field %r was not specified for getting tags" % field)
        prefix = self._conn._field_mappings.get_prefix(field)
        return self._tagspy.get_top_terms(prefix, maxtags)

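    # Editor's sketch: counting tags requires the field to be listed in the
    # `gettags` argument of search().  The field name 'tag' is hypothetical.
    #
    #     results = conn.search(query, 0, 10, gettags='tag')
    #     for tag, freq in results.get_top_tags('tag', maxtags=5):
    #         print tag, freq
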
    def get_suggested_facets(self, maxfacets=5, desired_num_of_categories=7,
                             required_facets=None):
        """Get a suggested set of facets, to present to the user.

        This returns a list, in descending order of the usefulness of the
        facet, in which each item is a tuple holding:

         - fieldname of facet.
         - sequence of 2-tuples holding the suggested values or ranges for
           that field:

           For facets of type 'string', the first item in the 2-tuple will
           simply be the string supplied when the facet value was added to
           its document.  For facets of type 'float', it will be a 2-tuple,
           holding floats giving the start and end of the suggested value
           range.

           The second item in the 2-tuple will be the frequency of the facet
           value or range in the result set.

        If required_facets is not None, it must be a field name, or a
        sequence of field names.  Any field names mentioned in
        required_facets will be returned if there are any facet values at all
        in the search results for that field.  The facet will only be omitted
        if there are no facet values at all for the field.

        The value of maxfacets will be respected as far as possible; the
        exception is that if there are too many fields listed in
        required_facets with at least one value in the search results, extra
        facets will be returned (ie, obeying the required_facets parameter is
        considered more important than the maxfacets parameter).

        If facet_hierarchy was indicated when search() was called, and the
        query included facets, then only subfacets of those query facets and
        top-level facets will be included in the returned list.  Furthermore,
        top-level facets will only be returned if there are remaining places
        in the list after it has been filled with subfacets.  Note that
        required_facets is still respected regardless of the facet hierarchy.

        If a query type was specified when search() was called, and the query
        included facets, then facets with an association of Never to the
        query type are never returned, even if mentioned in required_facets.
        Facets with an association of Preferred are listed before others in
        the returned list.

        """
        if 'facets' in _checkxapian.missing_features:
            raise _errors.SearchError("Facets unsupported with this release of xapian")
        if self._facetspy is None:
            raise _errors.SearchError("Facet selection wasn't enabled when the search was run")
        if isinstance(required_facets, basestring):
            required_facets = [required_facets]
        scores = []
        facettypes = {}
        for field, slot, kwargslist in self._facetfields:
            type = None
            for kwargs in kwargslist:
                type = kwargs.get('type', None)
                if type is not None: break
            if type is None: type = 'string'

            if type == 'float':
                if field not in self._numeric_ranges_built:
                    self._facetspy.build_numeric_ranges(slot, desired_num_of_categories)
                    self._numeric_ranges_built[field] = None
            facettypes[field] = type
            score = self._facetspy.score_categorisation(slot, desired_num_of_categories)
            scores.append((score, field, slot))

        # Sort on whether facet is top-level ahead of score (use subfacets
        # first), and on whether facet is preferred for the query type ahead
        # of anything else.
        if self._facethierarchy:
            # Note, tuple[-2] is the value of 'field' in a scores tuple
            scores = [(tuple[-2] not in self._facethierarchy,) + tuple for tuple in scores]
        if self._facetassocs:
            preferred = _indexerconnection.IndexerConnection.FacetQueryType_Preferred
            scores = [(self._facetassocs.get(tuple[-2]) != preferred,) + tuple for tuple in scores]
        scores.sort()
        if self._facethierarchy:
            index = 1
        else:
            index = 0
        if self._facetassocs:
            index += 1
        if index > 0:
            scores = [tuple[index:] for tuple in scores]

        results = []
        required_results = []
        for score, field, slot in scores:
            # Check if the facet is required
            required = False
            if required_facets is not None:
                required = field in required_facets

            # If we've got enough facets, and the field isn't required, skip it
            if not required and len(results) + len(required_results) >= maxfacets:
                continue

            # Get the values
            values = self._facetspy.get_values_as_dict(slot)
            if field in self._numeric_ranges_built:
                if '' in values:
                    del values['']

            # Required facets must occur at least once, other facets must
            # occur at least twice.
            if required:
                if len(values) < 1:
                    continue
            else:
                if len(values) <= 1:
                    continue

            newvalues = []
            if facettypes[field] == 'float':
                # Convert numbers to python numbers, and number ranges to a
                # python tuple of two numbers.
                for value, frequency in values.iteritems():
                    if len(value) <= 9:
                        value1 = _log(_xapian.sortable_unserialise, value)
                        value2 = value1
                    else:
                        value1 = _log(_xapian.sortable_unserialise, value[:9])
                        value2 = _log(_xapian.sortable_unserialise, value[9:])
                    newvalues.append(((value1, value2), frequency))
            else:
                for value, frequency in values.iteritems():
                    newvalues.append((value, frequency))

            newvalues.sort()
            if required:
                required_results.append((score, field, newvalues))
            else:
                results.append((score, field, newvalues))

        # Throw away any excess results if we have more required_results to
        # insert.
        maxfacets = maxfacets - len(required_results)
        if maxfacets <= 0:
            results = required_results
        else:
            results = results[:maxfacets]
            results.extend(required_results)
            results.sort()

        # Throw away the scores because they're not meaningful outside this
        # algorithm.
        results = [(field, newvalues) for (score, field, newvalues) in results]
        return results


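# Editor's sketch of facet suggestion: facets are only collected if
# getfacets=True was passed to search().  Field names are hypothetical.
#
#     results = conn.search(query, 0, 10, getfacets=True)
#     for fieldname, values in results.get_suggested_facets(maxfacets=3):
#         # For 'string' facets each entry is (value, frequency); for
#         # 'float' facets it is ((start, end), frequency).
#         print fieldname, values

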
class SearchConnection(object):
    """A connection to the search engine for searching.

    The connection will access a view of the database.

    """
    _qp_flags_base = _xapian.QueryParser.FLAG_LOVEHATE
    _qp_flags_phrase = _xapian.QueryParser.FLAG_PHRASE
    _qp_flags_synonym = (_xapian.QueryParser.FLAG_AUTO_SYNONYMS |
                         _xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS)
    _qp_flags_bool = _xapian.QueryParser.FLAG_BOOLEAN

    _index = None

    def __init__(self, indexpath):
        """Create a new connection to the index for searching.

        Any number of search connections may be open for a particular
        database at a given time (regardless of whether there is a connection
        for indexing open as well).

        If the database doesn't exist, an exception will be raised.

        """
        self._index = _log(_xapian.Database, indexpath)
        self._indexpath = indexpath

        # Read the actions.
        self._load_config()

        self._close_handlers = []

    def __del__(self):
        self.close()

    def append_close_handler(self, handler, userdata=None):
        """Append a callback to the list of close handlers.

        These will be called when the SearchConnection is closed.  This
        happens when the close() method is called, or when the
        SearchConnection object is deleted.  The callback will be passed two
        arguments: the path of the database which the connection was using,
        and the userdata supplied to this method.

        The handlers will be called in the order in which they were added.

        The handlers will be called after the connection has been closed, so
        cannot prevent it closing: their return value will be ignored.  In
        addition, they should not raise any exceptions.

        """
        self._close_handlers.append((handler, userdata))

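    # Editor's sketch of a close handler; the handler receives the index
    # path and the userdata given to append_close_handler():
    #
    #     def report_close(indexpath, userdata):
    #         print "closed %s (%r)" % (indexpath, userdata)
    #
    #     conn.append_close_handler(report_close, userdata='example')
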
    def _get_sort_type(self, field):
        """Get the sort type that should be used for a given field.

        """
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            actions = {}
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.SORT_AND_COLLAPSE:
                for kwargs in kwargslist:
                    return kwargs['type']

    def _load_config(self):
        """Load the configuration for the database.

        """
        # Note: this code is basically duplicated in the IndexerConnection
        # class.  Move it to a shared location.
        assert self._index is not None

        config_str = _log(self._index.get_metadata, '_xappy_config')
        if len(config_str) == 0:
            self._field_actions = {}
            self._field_mappings = _fieldmappings.FieldMappings()
            self._facet_hierarchy = {}
            self._facet_query_table = {}
            return

        try:
            (self._field_actions, mappings, self._facet_hierarchy,
             self._facet_query_table, self._next_docid) = _cPickle.loads(config_str)
        except ValueError:
            # Backwards compatibility - configuration used to lack
            # _facet_hierarchy and _facet_query_table.
            (self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
            self._facet_hierarchy = {}
            self._facet_query_table = {}
        self._field_mappings = _fieldmappings.FieldMappings(mappings)

    def reopen(self):
        """Reopen the connection.

        This updates the revision of the index which the connection
        references to the latest flushed revision.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        self._index.reopen()
        # Re-read the actions.
        self._load_config()

    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected to ensure that the connection is cleaned up
        promptly.

        No other methods may be called on the connection after this has been
        called.  (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes
        since the last call to flush may be lost.

        """
        if self._index is None:
            return

        # Remember the index path
        indexpath = self._indexpath

        # There is currently no "close()" method for xapian databases, so
        # we have to rely on the garbage collector.  Since we never copy
        # the _index property out of this class, there should be no cycles,
        # so the standard python implementation should garbage collect
        # _index straight away.  A close() method is planned to be added to
        # xapian at some point - when it is, we should call it here to make
        # the code more robust.
        self._index = None
        self._indexpath = None
        self._field_actions = None
        self._field_mappings = None

        # Call the close handlers.
        for handler, userdata in self._close_handlers:
            try:
                handler(indexpath, userdata)
            except Exception, e:
                import sys, traceback
                print >>sys.stderr, "WARNING: unhandled exception in handler called by SearchConnection.close(): %s" % traceback.format_exception_only(type(e), e)

    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed
        but not yet flushed().

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return self._index.get_doccount()

    OP_AND = _xapian.Query.OP_AND
    OP_OR = _xapian.Query.OP_OR

    def query_composite(self, operator, queries):
        """Build a composite query from a list of queries.

        The queries are combined with the supplied operator, which is either
        SearchConnection.OP_AND or SearchConnection.OP_OR.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _log(_xapian.Query, operator, list(queries))

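    # Editor's sketch combining query_composite() with query_multweight()
    # (defined below) to weight one part of a query more heavily.  Field
    # names are hypothetical.
    #
    #     q1 = conn.query_field('title', 'dinosaur')
    #     q2 = conn.query_field('text', 'dinosaur')
    #     q = conn.query_composite(conn.OP_OR,
    #                              [conn.query_multweight(q1, 5), q2])
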
    def query_multweight(self, query, multiplier):
        """Build a query which modifies the weights of a subquery.

        This produces a query which returns the same documents as the
        subquery, and in the same order, but with the weights assigned to
        each document multiplied by the value of "multiplier".  "multiplier"
        may be any floating point value, but negative values will be clipped
        to 0, since Xapian doesn't support negative weights.

        This can be useful when producing queries to be combined with
        query_composite, because it allows the relative importance of parts
        of the query to be adjusted.

        """
        return _log(_xapian.Query, _xapian.Query.OP_SCALE_WEIGHT, query, multiplier)

    def query_filter(self, query, filter, exclude=False):
        """Filter a query with another query.

        If exclude is False (or not specified), documents will only match the
        resulting query if they match both the first and second query: the
        results of the first query are "filtered" to only include those which
        also match the second query.

        If exclude is True, documents will only match the resulting query if
        they match the first query, but not the second query: the results of
        the first query are "filtered" to only include those which do not
        match the second query.

        Documents will always be weighted according to only the first query.

         - `query`: The query to filter.
         - `filter`: The filter to apply to the query.
         - `exclude`: If True, the sense of the filter is reversed - only
           documents which do not match the second query will be returned.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if not isinstance(filter, _xapian.Query):
            raise _errors.SearchError("Filter must be a Xapian Query object")
        if exclude:
            return _log(_xapian.Query, _xapian.Query.OP_AND_NOT, query, filter)
        else:
            return _log(_xapian.Query, _xapian.Query.OP_FILTER, query, filter)

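    # Editor's sketch: restricting a parsed query to one category without
    # affecting the weights (field name and value hypothetical):
    #
    #     q = conn.query_parse('dinosaur')
    #     filt = conn.query_field('category', 'reptiles')
    #     only_reptiles = conn.query_filter(q, filt)
    #     no_reptiles = conn.query_filter(q, filt, exclude=True)
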
    def query_adjust(self, primary, secondary):
        """Adjust the weights of one query with a secondary query.

        Documents will be returned from the resulting query if and only if
        they match the primary query (specified by the "primary" parameter).
        However, the weights (and hence, the relevance rankings) of the
        documents will be adjusted by adding weights from the secondary query
        (specified by the "secondary" parameter).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, primary, secondary)

    def query_range(self, field, begin, end):
        """Create a query for a range search.

        This creates a query which matches only those documents which have a
        field value in the specified range.

        Begin and end must be appropriate values for the field, according to
        the 'type' parameter supplied to the SORTABLE action for the field.

        The begin and end values are both inclusive - any documents with a
        value equal to begin or end will be returned (unless end is less than
        begin, in which case no documents will be returned).

        Begin or end may be set to None in order to create an open-ended
        range.  (They may also both be set to None, which will generate a
        query which matches all documents containing any value for the
        field.)

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")

        if begin is None and end is None:
            # Return a "match everything" query
            return _log(_xapian.Query, '')

        try:
            slot = self._field_mappings.get_slot(field, 'collsort')
        except KeyError:
            # Return a "match nothing" query
            return _log(_xapian.Query)

        sorttype = self._get_sort_type(field)
        marshaller = SortableMarshaller(False)
        fn = marshaller.get_marshall_function(field, sorttype)

        if begin is not None:
            begin = fn(field, begin)
        if end is not None:
            end = fn(field, end)

        if begin is None:
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_LE, slot, end)

        if end is None:
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_GE, slot, begin)

        return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)

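    # Editor's sketch: a range query over a SORTABLE field (field name and
    # values hypothetical; the field would need a numeric sort type for
    # numeric values to marshal correctly):
    #
    #     q_decade = conn.query_range('year', 1990, 1999)
    #     q_recent = conn.query_range('year', 2000, None)  # open-ended
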
    def query_facet(self, field, val):
        """Create a query for a facet value.

        This creates a query which matches only those documents which have a
        facet value in the specified range.

        For a numeric range facet, val should be a tuple holding the start
        and end of the range, or a comma separated string holding two
        floating point values.  For other facets, val should be the value to
        look for.

        The start and end values are both inclusive - any documents with a
        value equal to start or end will be returned (unless end is less than
        start, in which case no documents will be returned).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if 'facets' in _checkxapian.missing_features:
            raise _errors.SearchError("Facets unsupported with this release of xapian")

        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            actions = {}
        facettype = None
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.FACET:
                for kwargs in kwargslist:
                    facettype = kwargs.get('type', None)
                    if facettype is not None:
                        break
            if facettype is not None:
                break

        if facettype == 'float':
            if isinstance(val, basestring):
                val = [float(v) for v in val.split(',', 2)]
            assert(len(val) == 2)
            try:
                slot = self._field_mappings.get_slot(field, 'facet')
            except KeyError:
                return _log(_xapian.Query)
            # FIXME - check that sorttype == self._get_sort_type(field)
            sorttype = 'float'
            marshaller = SortableMarshaller(False)
            fn = marshaller.get_marshall_function(field, sorttype)
            begin = fn(field, val[0])
            end = fn(field, val[1])
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)
        else:
            assert(facettype == 'string' or facettype is None)
            prefix = self._field_mappings.get_prefix(field)
            return _log(_xapian.Query, prefix + val.lower())

    def _prepare_queryparser(self, allow, deny, default_op, default_allow,
                             default_deny):
        """Prepare (and return) a query parser using the specified fields and
        operator.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")

        if isinstance(allow, basestring):
            allow = (allow, )
        if isinstance(deny, basestring):
            deny = (deny, )
        if allow is not None and len(allow) == 0:
            allow = None
        if deny is not None and len(deny) == 0:
            deny = None
        if allow is not None and deny is not None:
            raise _errors.SearchError("Cannot specify both `allow` and `deny` "
                                      "(got %r and %r)" % (allow, deny))

        if isinstance(default_allow, basestring):
            default_allow = (default_allow, )
        if isinstance(default_deny, basestring):
            default_deny = (default_deny, )
        if default_allow is not None and len(default_allow) == 0:
            default_allow = None
        if default_deny is not None and len(default_deny) == 0:
            default_deny = None
        if default_allow is not None and default_deny is not None:
            raise _errors.SearchError("Cannot specify both `default_allow` and `default_deny` "
                                      "(got %r and %r)" % (default_allow, default_deny))

        qp = _log(_xapian.QueryParser)
        qp.set_database(self._index)
        qp.set_default_op(default_op)

        if allow is None:
            allow = [key for key in self._field_actions]
        if deny is not None:
            allow = [key for key in allow if key not in deny]

        for field in allow:
            try:
                actions = self._field_actions[field]._actions
            except KeyError:
                actions = {}
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_EXACT:
                    # FIXME - need patched version of xapian to add exact
                    # prefixes.
                    #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                if action == FieldActions.INDEX_FREETEXT:
                    # The field is available for field-specific searching
                    # unless one of the actions set allow_field_specific to
                    # False.
                    allow_field_specific = True
                    for kwargs in kwargslist:
                        allow_field_specific = allow_field_specific and kwargs.get('allow_field_specific', True)
                    if not allow_field_specific:
                        continue
                    qp.add_prefix(field, self._field_mappings.get_prefix(field))
                    for kwargs in kwargslist:
                        try:
                            lang = kwargs['language']
                            my_stemmer = _log(_xapian.Stem, lang)
                            # Keep a reference to the stemmer, to stop it
                            # being garbage collected while the queryparser
                            # is still in use.
                            qp.my_stemmer = my_stemmer
                            qp.set_stemmer(my_stemmer)
                            qp.set_stemming_strategy(qp.STEM_SOME)
                        except KeyError:
                            pass

        if default_allow is not None or default_deny is not None:
            if default_allow is None:
                default_allow = [key for key in self._field_actions]
            if default_deny is not None:
                default_allow = [key for key in default_allow if key not in default_deny]
            for field in default_allow:
                try:
                    actions = self._field_actions[field]._actions
                except KeyError:
                    actions = {}
                for action, kwargslist in actions.iteritems():
                    if action == FieldActions.INDEX_FREETEXT:
                        qp.add_prefix('', self._field_mappings.get_prefix(field))
                        # FIXME - set stemming options for the default prefix

        return qp

    def _query_parse_with_prefix(self, qp, string, flags, prefix):
        """Parse a query, with an optional prefix.

        """
        if prefix is None:
            return qp.parse_query(string, flags)
        else:
            return qp.parse_query(string, flags, prefix)

    def _query_parse_with_fallback(self, qp, string, prefix=None):
        """Parse a query with various flags.

        If the initial boolean pass fails, fall back to not using boolean
        operators.

        """
        try:
            q1 = self._query_parse_with_prefix(qp, string,
                                               self._qp_flags_base |
                                               self._qp_flags_phrase |
                                               self._qp_flags_synonym |
                                               self._qp_flags_bool,
                                               prefix)
        except _xapian.QueryParserError, e:
            # If we got a parse error, retry without boolean operators (since
            # these are the usual cause of the parse error).
            q1 = self._query_parse_with_prefix(qp, string,
                                               self._qp_flags_base |
                                               self._qp_flags_phrase |
                                               self._qp_flags_synonym,
                                               prefix)

        qp.set_stemming_strategy(qp.STEM_NONE)
        try:
            q2 = self._query_parse_with_prefix(qp, string,
                                               self._qp_flags_base |
                                               self._qp_flags_bool,
                                               prefix)
        except _xapian.QueryParserError, e:
            # If we got a parse error, retry without boolean operators (since
            # these are the usual cause of the parse error).
            q2 = self._query_parse_with_prefix(qp, string,
                                               self._qp_flags_base,
                                               prefix)

        return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, q1, q2)

    def query_parse(self, string, allow=None, deny=None, default_op=OP_AND,
                    default_allow=None, default_deny=None):
        """Parse a query string.

        This is intended for parsing queries entered by a user.  If you wish
        to combine structured queries, it is generally better to use the
        other query building methods, such as `query_composite` (though you
        may wish to create parts of the query to combine with such methods
        with this method).

        The string passed to this method can have various operators in it.
        In particular, it may contain field specifiers (ie, field names,
        followed by a colon, followed by some text to search for in that
        field).  For example, if "author" is a field in the database, the
        search string could contain "author:richard", and this would be
        interpreted as "search for richard in the author field".  By default,
        any fields in the database which are indexed with INDEX_EXACT or
        INDEX_FREETEXT will be available for field specific searching in this
        way - however, this can be modified using the "allow" or "deny"
        parameters, and also by the allow_field_specific tag on
        INDEX_FREETEXT fields.

        Any text which isn't prefixed by a field specifier is used to search
        the "default set" of fields.  By default, this is the full set of
        fields in the database which are indexed with INDEX_FREETEXT and for
        which the search_by_default flag is set (ie, if the text is found in
        any of those fields, the query will match).  However, this may be
        modified with the "default_allow" and "default_deny" parameters.
        (Note that fields which are indexed with INDEX_EXACT aren't allowed
        to be used in the default list of fields.)

         - `string`: The string to parse.
         - `allow`: A list of fields to allow in the query.
         - `deny`: A list of fields not to allow in the query.
         - `default_op`: The default operator to combine query terms with.
         - `default_allow`: A list of fields to search for by default.
         - `default_deny`: A list of fields not to search for by default.

        Only one of `allow` and `deny` may be specified.

        Only one of `default_allow` and `default_deny` may be specified.

        If any of the entries in `allow` are not present in the configuration
        for the database, or are not specified for indexing (either as
        INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
        entries in `deny` are not present in the configuration for the
        database, they will be ignored.

        Returns a Query object, which may be passed to the search() method,
        or combined with other queries.

        """
        qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
                                       default_deny)
        return self._query_parse_with_fallback(qp, string)

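    # Editor's sketch of the usual parse-then-search flow (index path and
    # field name hypothetical):
    #
    #     conn = SearchConnection('path/to/index')
    #     q = conn.query_parse('author:richard dinosaurs', deny=['internal'])
    #     results = conn.search(q, 0, 10)
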
    def query_field(self, field, value, default_op=OP_AND):
        """A query for a single field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            actions = {}

        # Need to check on field type, and stem / split as appropriate.
        for action, kwargslist in actions.iteritems():
            if action in (FieldActions.INDEX_EXACT,
                          FieldActions.TAG,
                          FieldActions.FACET,):
                prefix = self._field_mappings.get_prefix(field)
                if len(value) > 0:
                    chval = ord(value[0])
                    if chval >= ord('A') and chval <= ord('Z'):
                        prefix = prefix + ':'
                return _log(_xapian.Query, prefix + value)
            if action == FieldActions.INDEX_FREETEXT:
                qp = _log(_xapian.QueryParser)
                qp.set_default_op(default_op)
                prefix = self._field_mappings.get_prefix(field)
                for kwargs in kwargslist:
                    try:
                        lang = kwargs['language']
                        qp.set_stemmer(_log(_xapian.Stem, lang))
                        qp.set_stemming_strategy(qp.STEM_SOME)
                    except KeyError:
                        pass
                return self._query_parse_with_fallback(qp, value, prefix)

        return _log(_xapian.Query)

    def query_similar(self, ids, allow=None, deny=None, simterms=10):
        """Get a query which returns documents which are similar to others.

        The list of document IDs to base the similarity search on is given in
        `ids`.  This should be an iterable, holding a list of strings.  If
        any of the supplied IDs cannot be found in the database, they will be
        ignored.  (If no IDs can be found in the database, the resulting
        query will not match any documents.)

        By default, all fields which have been indexed for freetext searching
        will be used for the similarity calculation.  The list of fields used
        for this can be customised using the `allow` and `deny` parameters
        (only one of which may be specified):

         - `allow`: A list of fields to base the similarity calculation on.
         - `deny`: A list of fields not to base the similarity calculation
           on.
         - `simterms`: Number of terms to use for the similarity calculation.

        For convenience, any of `ids`, `allow`, or `deny` may be strings,
        which will be treated the same as a list of length 1.

        Regardless of the setting of `allow` and `deny`, only fields which
        have been indexed for freetext searching will be used for the
        similarity measure - all other fields will always be ignored for this
        purpose.

        """
        eterms, prefixes = self._get_eterms(ids, allow, deny, simterms)

        # Use the "elite set" operator, which chooses the terms with the
        # highest query weight to use.
        q = _log(_xapian.Query, _xapian.Query.OP_ELITE_SET, eterms, simterms)
        return q

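    # Editor's sketch: finding documents similar to two known documents
    # (document IDs hypothetical):
    #
    #     q = conn.query_similar(['doc1', 'doc42'], simterms=20)
    #     similar = conn.search(q, 0, 10)
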
    def significant_terms(self, ids, maxterms=10, allow=None, deny=None):
        """Get a set of "significant" terms for a document, or documents.

        This has a similar interface to query_similar(): it takes a list of
        ids, and an optional specification of a set of fields to consider.
        Instead of returning a query, it returns a list of terms from the
        document (or documents), which appear "significant".  Roughly, in
        this situation significant means that the terms occur more frequently
        in the specified document than in the rest of the corpus.

        The list is in decreasing order of "significance".

        By default, all terms related to fields which have been indexed for
        freetext searching will be considered for the list of significant
        terms.  The list of fields used for this can be customised using the
        `allow` and `deny` parameters (only one of which may be specified):

         - `allow`: A list of fields to consider.
         - `deny`: A list of fields not to consider.

        For convenience, any of `ids`, `allow`, or `deny` may be strings,
        which will be treated the same as a list of length 1.

        Regardless of the setting of `allow` and `deny`, only fields which
        have been indexed for freetext searching will be considered - all
        other fields will always be ignored for this purpose.

        The maximum number of terms to return may be specified by the
        maxterms parameter.

        """
        eterms, prefixes = self._get_eterms(ids, allow, deny, maxterms)
        terms = []
        for term in eterms:
            pos = 0
            for char in term:
                if not char.isupper():
                    break
                pos += 1
            field = prefixes[term[:pos]]
            value = term[pos:]
            terms.append((field, value))
        return terms

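    # Editor's sketch: listing the terms which make a document distinctive
    # (document ID hypothetical); each entry is a (fieldname, value) pair:
    #
    #     for field, value in conn.significant_terms('doc1', maxterms=5):
    #         print field, value
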
    def _get_eterms(self, ids, allow, deny, simterms):
        """Get a set of terms for an expand.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if allow is not None and deny is not None:
            raise _errors.SearchError("Cannot specify both `allow` and `deny`")

        if isinstance(ids, basestring):
            ids = (ids, )
        if isinstance(allow, basestring):
            allow = (allow, )
        if isinstance(deny, basestring):
            deny = (deny, )

        # Set "allow" to contain a list of all the fields to use.
        if allow is None:
            allow = [key for key in self._field_actions]
        if deny is not None:
            allow = [key for key in allow if key not in deny]

        # Set "prefixes" to contain a list of all the prefixes to use.
        prefixes = {}
        for field in allow:
            try:
                actions = self._field_actions[field]._actions
            except KeyError:
                actions = {}
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    prefixes[self._field_mappings.get_prefix(field)] = field

        # Repeat the expand until we don't get a DatabaseModifiedError.
        while True:
            try:
                eterms = self._perform_expand(ids, prefixes, simterms)
                break
            except _xapian.DatabaseModifiedError, e:
                self.reopen()
        return eterms, prefixes

    class ExpandDecider(_xapian.ExpandDecider):
        def __init__(self, prefixes):
            _xapian.ExpandDecider.__init__(self)
            self._prefixes = prefixes

        def __call__(self, term):
            pos = 0
            for char in term:
                if not char.isupper():
                    break
                pos += 1
            if term[:pos] in self._prefixes:
                return True
            return False

    def _perform_expand(self, ids, prefixes, simterms):
        """Perform an expand operation to get the terms for a similarity
        search, given a set of ids (and a set of prefixes to restrict the
        similarity operation to).

        """
        # Set idquery to be a query which returns the documents listed in
        # "ids".
        idquery = _log(_xapian.Query, _xapian.Query.OP_OR, ['Q' + id for id in ids])

        enq = _log(_xapian.Enquire, self._index)
        enq.set_query(idquery)
        rset = _log(_xapian.RSet)
        for id in ids:
            pl = self._index.postlist('Q' + id)
            try:
                xapid = pl.next()
                rset.add_document(xapid.docid)
            except StopIteration:
                pass

        expanddecider = _log(self.ExpandDecider, prefixes)
        eset = enq.get_eset(simterms, rset, 0, 1.0, expanddecider)
        return [term.term for term in eset]

    def query_all(self):
        """A query which matches all the documents in the database.

        """
        return _log(_xapian.Query, '')

    def query_none(self):
        """A query which matches no documents in the database.

        This may be useful as a placeholder in various situations.

        """
        return _log(_xapian.Query)

    def spell_correct(self, querystr, allow=None, deny=None, default_op=OP_AND,
                      default_allow=None, default_deny=None):
        """Correct a query spelling.

        This returns a version of the query string with any misspelt words
        corrected.

         - `allow`: A list of fields to allow in the query.
         - `deny`: A list of fields not to allow in the query.
         - `default_op`: The default operator to combine query terms with.
         - `default_allow`: A list of fields to search for by default.
         - `default_deny`: A list of fields not to search for by default.

        Only one of `allow` and `deny` may be specified.

        Only one of `default_allow` and `default_deny` may be specified.

        If any of the entries in `allow` are not present in the configuration
        for the database, or are not specified for indexing (either as
        INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
        entries in `deny` are not present in the configuration for the
        database, they will be ignored.

        Note that it is possible that the resulting spell-corrected query
        will still match no documents - the user should usually check that
        some documents are matched by the corrected query before suggesting
        it to users.

        """
        qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
                                       default_deny)
        try:
            qp.parse_query(querystr,
                           self._qp_flags_base |
                           self._qp_flags_phrase |
                           self._qp_flags_synonym |
                           self._qp_flags_bool |
                           qp.FLAG_SPELLING_CORRECTION)
        except _xapian.QueryParserError:
            qp.parse_query(querystr,
                           self._qp_flags_base |
                           self._qp_flags_phrase |
                           self._qp_flags_synonym |
                           qp.FLAG_SPELLING_CORRECTION)
        corrected = qp.get_corrected_query_string()
        if len(corrected) == 0:
            if isinstance(querystr, unicode):
                # Encode as UTF-8 for consistency - this happens
                # automatically to values passed to Xapian.
                return querystr.encode('utf-8')
            return querystr
        return corrected

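    # Editor's sketch: offering a "did you mean" suggestion.  As the
    # docstring notes, the corrected string should be checked before being
    # suggested:
    #
    #     corrected = conn.spell_correct(querystr)
    #     if corrected != querystr:
    #         check = conn.search(conn.query_parse(corrected), 0, 1)
    #         if len(check) > 0:
    #             print "Did you mean: %s" % corrected
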
    def can_collapse_on(self, field):
        """Check if this database supports collapsing on a specified field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        try:
            self._field_mappings.get_slot(field, 'collsort')
        except KeyError:
            return False
        return True

    def can_sort_on(self, field):
        """Check if this database supports sorting on a specified field.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        try:
            self._field_mappings.get_slot(field, 'collsort')
        except KeyError:
            return False
        return True

    def _get_prefix_from_term(self, term):
        """Get the prefix of a term.

        Prefixes are any initial capital letters, with the exception that R
        always ends a prefix, even if followed by capital letters.

        """
        for p in xrange(len(term)):
            if term[p].islower():
                return term[:p]
            elif term[p] == 'R':
                return term[:p+1]
        return term

    def _facet_query_never(self, facet, query_type):
        """Check if a facet must never be returned by a particular query
        type.

        Returns True if the facet must never be returned.

        Returns False if the facet may be returned - either because there is
        no entry for the query type, or because the entry is not
        FacetQueryType_Never.

        """
        if query_type is None:
            return False
        if query_type not in self._facet_query_table:
            return False
        if facet not in self._facet_query_table[query_type]:
            return False
        return self._facet_query_table[query_type][facet] == _indexerconnection.IndexerConnection.FacetQueryType_Never

1555 - def search(self, query, startrank, endrank, 1556 checkatleast=0, sortby=None, collapse=None, 1557 gettags=None, 1558 getfacets=None, allowfacets=None, denyfacets=None, usesubfacets=None, 1559 percentcutoff=None, weightcutoff=None, 1560 query_type=None):
1561 """Perform a search, for documents matching a query. 1562 1563 - `query` is the query to perform. 1564 - `startrank` is the rank of the start of the range of matching 1565 documents to return (ie, the result with this rank will be returned). 1566 ranks start at 0, which represents the "best" matching document. 1567 - `endrank` is the rank at the end of the range of matching documents 1568 to return. This is exclusive, so the result with this rank will not 1569 be returned. 1570 - `checkatleast` is the minimum number of results to check for: the 1571 estimate of the total number of matches will always be exact if 1572 the number of matches is less than `checkatleast`. A value of ``-1`` 1573 can be specified for the checkatleast parameter - this has the 1574 special meaning of "check all matches", and is equivalent to passing 1575 the result of get_doccount(). 1576 - `sortby` is the name of a field to sort by. It may be preceded by a 1577 '+' or a '-' to indicate ascending or descending order 1578 (respectively). If the first character is neither '+' or '-', the 1579 sort will be in ascending order. 1580 - `collapse` is the name of a field to collapse the result documents 1581 on. If this is specified, there will be at most one result in the 1582 result set for each value of the field. 1583 - `gettags` is the name of a field to count tag occurrences in, or a 1584 list of fields to do so. 1585 - `getfacets` is a boolean - if True, the matching documents will be 1586 examined to build up a list of the facet values contained in them. 1587 - `allowfacets` is a list of the fieldnames of facets to consider. 1588 - `denyfacets` is a list of fieldnames of facets which will not be 1589 considered. 1590 - `usesubfacets` is a boolean - if True, only top-level facets and 1591 subfacets of facets appearing in the query are considered (taking 1592 precedence over `allowfacets` and `denyfacets`). 1593 - `percentcutoff` is the minimum percentage a result must have to be 1594 returned. 1595 - `weightcutoff` is the minimum weight a result must have to be 1596 returned. 1597 - `query_type` is a value indicating the type of query being 1598 performed. If not None, the value is used to influence which facets 1599 are be returned by the get_suggested_facets() function. If the 1600 value of `getfacets` is False, it has no effect. 1601 1602 If neither 'allowfacets' or 'denyfacets' is specified, all fields 1603 holding facets will be considered (but see 'usesubfacets'). 
1604 1605 """ 1606 if self._index is None: 1607 raise _errors.SearchError("SearchConnection has been closed") 1608 if 'facets' in _checkxapian.missing_features: 1609 if getfacets is not None or \ 1610 allowfacets is not None or \ 1611 denyfacets is not None or \ 1612 usesubfacets is not None or \ 1613 query_type is not None: 1614 raise _errors.SearchError("Facets unsupported with this release of xapian") 1615 if 'tags' in _checkxapian.missing_features: 1616 if gettags is not None: 1617 raise _errors.SearchError("Tags unsupported with this release of xapian") 1618 if checkatleast == -1: 1619 checkatleast = self._index.get_doccount() 1620 1621 enq = _log(_xapian.Enquire, self._index) 1622 enq.set_query(query) 1623 1624 if sortby is not None: 1625 asc = True 1626 if sortby[0] == '-': 1627 asc = False 1628 sortby = sortby[1:] 1629 elif sortby[0] == '+': 1630 sortby = sortby[1:] 1631 1632 try: 1633 slotnum = self._field_mappings.get_slot(sortby, 'collsort') 1634 except KeyError: 1635 raise _errors.SearchError("Field %r was not indexed for sorting" % sortby) 1636 1637 # Note: we invert the "asc" parameter, because xapian treats 1638 # "ascending" as meaning "higher values are better"; in other 1639 # words, it considers "ascending" to mean return results in 1640 # descending order. 1641 enq.set_sort_by_value_then_relevance(slotnum, not asc) 1642 1643 if collapse is not None: 1644 try: 1645 slotnum = self._field_mappings.get_slot(collapse, 'collsort') 1646 except KeyError: 1647 raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse) 1648 enq.set_collapse_key(slotnum) 1649 1650 maxitems = max(endrank - startrank, 0) 1651 # Always check for at least one more result, so we can report whether 1652 # there are more matches. 1653 checkatleast = max(checkatleast, endrank + 1) 1654 1655 # Build the matchspy. 1656 matchspies = [] 1657 1658 # First, add a matchspy for any gettags fields 1659 if isinstance(gettags, basestring): 1660 if len(gettags) != 0: 1661 gettags = [gettags] 1662 tagspy = None 1663 if gettags is not None and len(gettags) != 0: 1664 tagspy = _log(_xapian.TermCountMatchSpy) 1665 for field in gettags: 1666 try: 1667 prefix = self._field_mappings.get_prefix(field) 1668 tagspy.add_prefix(prefix) 1669 except KeyError: 1670 raise _errors.SearchError("Field %r was not indexed for tagging" % field) 1671 matchspies.append(tagspy) 1672 1673 1674 # add a matchspy for facet selection here. 1675 facetspy = None 1676 facetfields = [] 1677 if getfacets: 1678 if allowfacets is not None and denyfacets is not None: 1679 raise _errors.SearchError("Cannot specify both `allowfacets` and `denyfacets`") 1680 if allowfacets is None: 1681 allowfacets = [key for key in self._field_actions] 1682 if denyfacets is not None: 1683 allowfacets = [key for key in allowfacets if key not in denyfacets] 1684 1685 # include None in queryfacets so a top-level facet will 1686 # satisfy self._facet_hierarchy.get(field) in queryfacets 1687 # (i.e. 
always include top-level facets) 1688 queryfacets = set([None]) 1689 if usesubfacets: 1690 # add facets used in the query to queryfacets 1691 termsiter = query.get_terms_begin() 1692 termsend = query.get_terms_end() 1693 while termsiter != termsend: 1694 prefix = self._get_prefix_from_term(termsiter.get_term()) 1695 field = self._field_mappings.get_fieldname_from_prefix(prefix) 1696 if field and FieldActions.FACET in self._field_actions[field]._actions: 1697 queryfacets.add(field) 1698 termsiter.next() 1699 1700 for field in allowfacets: 1701 try: 1702 actions = self._field_actions[field]._actions 1703 except KeyError: 1704 actions = {} 1705 for action, kwargslist in actions.iteritems(): 1706 if action == FieldActions.FACET: 1707 # filter out non-top-level facets that aren't subfacets 1708 # of a facet in the query 1709 if usesubfacets and self._facet_hierarchy.get(field) not in queryfacets: 1710 continue 1711 # filter out facets that should never be returned for the query type 1712 if self._facet_query_never(field, query_type): 1713 continue 1714 slot = self._field_mappings.get_slot(field, 'facet') 1715 if facetspy is None: 1716 facetspy = _log(_xapian.CategorySelectMatchSpy) 1717 facettype = None 1718 for kwargs in kwargslist: 1719 facettype = kwargs.get('type', None) 1720 if facettype is not None: 1721 break 1722 if facettype is None or facettype == 'string': 1723 facetspy.add_slot(slot, True) 1724 else: 1725 facetspy.add_slot(slot) 1726 facetfields.append((field, slot, kwargslist)) 1727 1728 if facetspy is None: 1729 # Set facetspy to False, to distinguish from no facet 1730 # calculation being performed. (This will prevent an 1731 # error being thrown when the list of suggested facets is 1732 # requested - instead, an empty list will be returned.) 1733 facetspy = False 1734 else: 1735 matchspies.append(facetspy) 1736 1737 1738 # Finally, build a single matchspy to pass to get_mset(). 1739 if len(matchspies) == 0: 1740 matchspy = None 1741 elif len(matchspies) == 1: 1742 matchspy = matchspies[0] 1743 else: 1744 matchspy = _log(_xapian.MultipleMatchDecider) 1745 for spy in matchspies: 1746 matchspy.append(spy) 1747 1748 enq.set_docid_order(enq.DONT_CARE) 1749 1750 # Set percentage and weight cutoffs 1751 if percentcutoff is not None or weightcutoff is not None: 1752 if percentcutoff is None: 1753 percentcutoff = 0 1754 if weightcutoff is None: 1755 weightcutoff = 0 1756 enq.set_cutoff(percentcutoff, weightcutoff) 1757 1758 # Repeat the search until we don't get a DatabaseModifiedError 1759 while True: 1760 try: 1761 if matchspy is None: 1762 mset = enq.get_mset(startrank, maxitems, checkatleast) 1763 else: 1764 mset = enq.get_mset(startrank, maxitems, checkatleast, 1765 None, None, matchspy) 1766 break 1767 except _xapian.DatabaseModifiedError, e: 1768 self.reopen() 1769 facet_hierarchy = None 1770 if usesubfacets: 1771 facet_hierarchy = self._facet_hierarchy 1772 1773 return SearchResults(self, enq, query, mset, self._field_mappings, 1774 tagspy, gettags, facetspy, facetfields, 1775 facet_hierarchy, 1776 self._facet_query_table.get(query_type))
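A usage sketch covering the common parameters (the query text and field
names are hypothetical, and query_parse() is assumed as the usual way to
build the query):

    query = conn.query_parse('kettle')
    results = conn.search(query, 0, 10,
                          sortby='-price',
                          collapse='category',
                          getfacets=True)
    for hit in results:
        # Each hit is a SearchResult, carrying rank/weight/percent as
        # well as the document's data.
        print hit.rank, hit.percent, hit.id
    print results.get_suggested_facets()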
1777
1778 - def iterids(self):
1779 """Get an iterator which returns all the ids in the database. 1780 1781 The unqiue_ids are currently returned in binary lexicographical sort 1782 order, but this should not be relied on. 1783 1784 Note that the iterator returned by this method may raise a 1785 xapian.DatabaseModifiedError exception if modifications are committed 1786 to the database while the iteration is in progress. If this happens, 1787 the search connection must be reopened (by calling reopen) and the 1788 iteration restarted. 1789 1790 """ 1791 if self._index is None: 1792 raise _errors.SearchError("SearchConnection has been closed") 1793 return _indexerconnection.PrefixedTermIter('Q', self._index.allterms())
1794
1795 - def get_document(self, id):
1796 """Get the document with the specified unique ID. 1797 1798 Raises a KeyError if there is no such document. Otherwise, it returns 1799 a ProcessedDocument. 1800 1801 """ 1802 if self._index is None: 1803 raise _errors.SearchError("SearchConnection has been closed") 1804 while True: 1805 try: 1806 postlist = self._index.postlist('Q' + id) 1807 try: 1808 plitem = postlist.next() 1809 except StopIteration: 1810 # Unique ID not found 1811 raise KeyError('Unique ID %r not found' % id) 1812 try: 1813 postlist.next() 1814 raise _errors.IndexerError("Multiple documents " #pragma: no cover 1815 "found with same unique ID") 1816 except StopIteration: 1817 # Only one instance of the unique ID found, as it should be. 1818 pass 1819 1820 result = ProcessedDocument(self._field_mappings) 1821 result.id = id 1822 result._doc = self._index.get_document(plitem.docid) 1823 return result 1824 except _xapian.DatabaseModifiedError, e: 1825 self.reopen()
1826
1827 - def iter_synonyms(self, prefix=""):
1828 """Get an iterator over the synonyms. 1829 1830 - `prefix`: if specified, only synonym keys with this prefix will be 1831 returned. 1832 1833 The iterator returns 2-tuples, in which the first item is the key (ie, 1834 a 2-tuple holding the term or terms which will be synonym expanded, 1835 followed by the fieldname specified (or None if no fieldname)), and the 1836 second item is a tuple of strings holding the synonyms for the first 1837 item. 1838 1839 These return values are suitable for the dict() builtin, so you can 1840 write things like: 1841 1842 >>> conn = _indexerconnection.IndexerConnection('foo') 1843 >>> conn.add_synonym('foo', 'bar') 1844 >>> conn.add_synonym('foo bar', 'baz') 1845 >>> conn.add_synonym('foo bar', 'foo baz') 1846 >>> conn.flush() 1847 >>> conn = SearchConnection('foo') 1848 >>> dict(conn.iter_synonyms()) 1849 {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')} 1850 1851 """ 1852 if self._index is None: 1853 raise _errors.SearchError("SearchConnection has been closed") 1854 return _indexerconnection.SynonymIter(self._index, self._field_mappings, prefix)
1855
1856 - def get_metadata(self, key):
1857 """Get an item of metadata stored in the connection. 1858 1859 This returns a value stored by a previous call to 1860 IndexerConnection.set_metadata. 1861 1862 If the value is not found, this will return the empty string. 1863 1864 """ 1865 if self._index is None: 1866 raise _errors.IndexerError("SearchConnection has been closed") 1867 if not hasattr(self._index, 'get_metadata'): 1868 raise _errors.IndexerError("Version of xapian in use does not support metadata") 1869 return _log(self._index.get_metadata, key)
1870 1871 if __name__ == '__main__': 1872 import doctest, sys 1873 doctest.testmod(sys.modules[__name__]) 1874