Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames] | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL v2 or later 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
 20  _log = logging.getLogger('gm.ui') 
 21  _log.info(__version__) 
 22   
 23   
 24  # these are stripped from the fragment passed to the 
 25  # match provider before looking for matches: 
 26  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 27   
 28  # these are used to detect word boundaries which is, 
 29  # in turn, used to normalize word boundaries in the 
 30  # input fragment 
 31  default_word_separators = '[- \t=+&:@]+' 
 32  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
        - database tables
        - flat files
        - previous input
        - config files
        - in-memory list created on the fly

    Subclasses must implement getAllMatches() and the three
    getMatchesBy*() methods. getMatches() itself returns
    (False, []) when the fragment is too short to search for.
    """
    print_queries = False
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # context values set via set_context(), keyed by placeholder
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        # used to normalize word boundaries:
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, stripped of ignored characters
        and has its word separators normalized to single spaces
        before its length is compared against the thresholds.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we dont worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: try the most permissive search first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        raise NotImplementedError
    #--------------------------------------------------------
    def get_match_by_data(self, data=None):
        """Return the match belonging to <data>; the base class knows none."""
        return None
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Returns True if the thresholds were applied, False if they
        were rejected as inconsistent (previous values are kept).
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring,
                aWord,
                self.__threshold_substring,
                self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            # report the values this check is actually about (word vs phrase)
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord,
                aPhrase,
                self.__threshold_word,
                self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables word separator normalization in getMatches()
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables stripping of ignored characters in getMatches()
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False if no context key was given, True otherwise.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Remove a context value; unknown keys are silently ignored."""
        try:
            del self._context_vals[context]
        except KeyError:
            pass
178 #------------------------------------------------------------ 179 # usable instances 180 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list/tuple of dicts, or None (no items).

        Each dict must have the keys (data, list_label, weight).

        Raises TypeError for any other argument type.
        """
        if not isinstance(aSeq, (type(None), list, tuple)):
            _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
            raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

        # None is explicitly allowed, treat it as "no items" so
        # that searching does not blow up later on
        self.__items = [] if aSeq is None else aSeq
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        fragment = aFragment.lower()
        return self.__select(lambda label: label.startswith(fragment))
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        fragment = aFragment.lower()
        # look for " <fragment>" rather than inspecting the first
        # occurrence only, which might sit inside an earlier word
        spaced_fragment = u' ' + fragment
        return self.__select(
            lambda label: label.startswith(fragment) or (spaced_fragment in label)
        )
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        fragment = aFragment.lower()
        return self.__select(lambda label: fragment in label)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.__select(lambda label: True)
    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
        self.__items = items
    #--------------------------------------------------------
    def __select(self, label_matches):
        """Return (found, matches) for all items whose lower-cased
        list_label satisfies label_matches(), heaviest items first.

        A new list is returned, the stored items are never reordered.
        """
        matches = [
            item for item in (self.__items or [])
            if label_matches(item['list_label'].lower())
        ]
        # no matches found
        if not matches:
            return (False, [])

        # stable sort: items of equal weight keep their original order
        matches.sort(key = lambda item: item['weight'], reverse = True)
        return (True, matches)
    #--------------------------------------------------------
    def __cmp_items(self, item1, item2):
        """Compare items based on weight (heavier items sort first).

        cmp()-style comparator, no longer used for sorting by
        this class but retained for backward compatibility.
        """
        if item1['weight'] == item2['weight']:
            return 0

        # do it the wrong way round to do sorting/reversing at once
        if item1['weight'] < item2['weight']:
            return 1
        if item1['weight'] > item2['weight']:
            return -1
277 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts.

        Each dict must have the key 'list_label', which is
        what the fragment is matched against.

        Raises ValueError if no function is given.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        # the *label* must start with the fragment, not vice versa
        return self.__select(lambda label: label.startswith(aFragment))
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        # FIXME: use word seps
        spaced_fragment = u' ' + aFragment
        return self.__select(
            lambda label: label.startswith(aFragment) or (spaced_fragment in label)
        )
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__select(lambda label: aFragment in label)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates as (found, matches), like the other searches."""
        return self.__select(lambda label: True)
    #--------------------------------------------------------
    def __select(self, label_matches):
        """Return (found, matches) for all candidates whose lower-cased
        list_label satisfies label_matches(), in candidate order.
        """
        matches = [
            candidate for candidate in self._get_candidates()
            if label_matches(candidate['list_label'].lower())
        ]
        # no matches found
        if not matches:
            return (False, [])

        # no sorting: __cmp_candidates() considers all candidates equal
        return (True, matches)
    #--------------------------------------------------------
    def __cmp_candidates(self, candidate1, candidate2):
        """naive ordering"""
        return 0
        # FIXME: do ordering
#		if candidate1 < candidate2:
#			return -1
#		if candidate1 == candidate2:
#			return 0
#		return 1

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... WHERE <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<ctxt_key1>)s ..."
        - each query must return (data, list_label, field_label)

    context definitions to be used in the queries, example:
        {'ctxt_key1': {'where_part': 'AND country = %(country)s', 'placeholder': 'country'}}

    client code using .set_context() must use the 'placeholder':
        <phrasewheel>/<match provider>.set_context('country', 'Germany')

    full example query:

        query = u" " "
            SELECT DISTINCT ON (list_label)
                pk_encounter
                    AS data,
                to_char(started, 'YYYY Mon DD (HH24:MI)') || ': ' || l10n_type || ' [#' || pk_encounter || ']'
                    AS list_label,
                to_char(started, 'YYYY Mon DD') || ': ' || l10n_type
                    AS field_label
            FROM
                clin.v_pat_encounters
            WHERE
                (
                    l10n_type %(fragment_condition)s
                        OR
                    type %(fragment_condition)s
                )	%(ctxt_patient)s
            ORDER BY
                list_label
            LIMIT
                30
        " " "
        context = {'ctxt_patient': {
            'where_part': u'AND pk_patient = %(PLACEHOLDER)s',
            'placeholder': u'PLACEHOLDER'
        }}
        self.mp = gmMatchProvider.cMatchProvider_SQL2(queries = query, context = context)
        self.set_context(context = 'PLACEHOLDER', val = '<THE VALUE>')

    _SQL_data2match:
        SQL to retrieve a match by, say, primary key
        wherein the only keyword argument is 'pk'
    """
    def __init__(self, queries = None, context = None):
        """Set up the provider.

        queries -- one query string or a list of query strings
        context -- dict of context definitions, see class docstring
        """
        cMatchProvider.__init__(self)

        if queries is None:
            # nothing to search with (rather than a list holding None)
            self._queries = []
        elif isinstance(queries, list):
            self._queries = queries
        else:
            self._queries = [queries]

        if context is None:
            self._context = {}
        else:
            self._context = context

        # arguments handed to the database along with each query
        self._args = {}

        self._SQL_data2match = None
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #	- aFragment will not be "None"
    #   - aFragment will be lower case
    #	- we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = u"~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment

        return self._find_matches(fragment_condition)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')
    #--------------------------------------------------------
    def get_match_by_data(self, data=None):
        """Return the single row found by running _SQL_data2match
        with <data> as 'pk', or None.

        None is returned if no such SQL is defined, if running it
        fails (the SQL is then dropped), or if it does not yield
        exactly one row.
        """
        if self._SQL_data2match is None:
            return None

        query = {'cmd': self._SQL_data2match, 'args': {'pk': data}}
        try:
            rows, idx = gmPG2.run_ro_queries(queries = [query], get_col_idx = False)
        except Exception:
            _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__)
            self._SQL_data2match = None
            return None

        # hopefully the most frequent case:
        if len(rows) == 1:
            return rows[0]

        _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__)
        return None
    #--------------------------------------------------------
    def _find_matches(self, fragment_condition):
        """Run the queries in order and return (found, matches).

        The rows of the first query returning any rows are converted
        into matches, later queries are not run. A query which fails
        to run is dropped for good and the search ends without matches.
        """
        if self.print_queries:
            print("----------------------")

        # iterate over a copy: a failing query is removed from self._queries
        for query in list(self._queries):
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''
                    if self.print_queries:
                        print("invalid ctxt key: %s" % (context_key,))
                    continue

                # we do have a context value for this key, so add the where condition
                where_fragments[context_key] = where_part
                if self.print_queries:
                    print("ctxt ph: %s" % (placeholder,))
                    print("ctxt where: %s" % (where_part,))
                    print("ctxt val: %s" % (self._context_vals[placeholder],))

            cmd = query % where_fragments

            if self.print_queries:
                print("class: %s" % (self.__class__.__name__,))
                print("ctxt: %s" % (self._context_vals,))
                print("args: %s" % (self._args,))
                print("query: %s" % (cmd,))

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}], get_col_idx = False)
            except Exception:
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                self._queries.remove(query)
                # deliberately give up for this round rather than risk
                # dropping every query because of one transient problem
                break

            # no matches found: try next query
            if len(rows) == 0:
                continue

            return (True, [ self._row2match(row) for row in rows ])

        # none found whatsoever
        return (False, [])
    #--------------------------------------------------------
    def _row2match(self, row):
        """Convert one result row into a match dict with the keys
        (weight, data, list_label, field_label).

        Columns are looked up by name first, then by position.
        """
        match = {'weight': 0}

        try:
            match['data'] = row['data']
        except KeyError:
            match['data'] = row[0]

        try:
            match['list_label'] = row['list_label']
        except KeyError:
            match['list_label'] = row[1]

        # explicit "field_label" in result ?
        try:
            match['field_label'] = row['field_label']
        # no
        except KeyError:
            # but does row[2] exist ?
            try:
                match['field_label'] = row[2]
            # no: reuse "list_label"
            except IndexError:
                match['field_label'] = match['list_label']

        return match
569 #================================================================ 570 if __name__ == '__main__': 571 pass 572 573 #================================================================ 574