Package csb :: Package bio :: Package sequence
[frames] | no frames]

Source Code for Package csb.bio.sequence

   1  """ 
   2  Sequence and sequence alignment APIs. 
   3   
   4  This module defines the base interfaces for biological sequences and alignments: 
   5  L{AbstractSequence} and L{AbstractAlignment}. These are the central abstractions 
   6  here. This module provides also a number of useful enumerations, like L{SequenceTypes} 
   7  and L{SequenceAlphabets}. 
   8   
   9  Sequences 
  10  ========= 
  11  L{AbstractSequence} has a number of implementations. These are of course interchangeable, 
  12  but have different intents and may differ significantly in performance. The standard 
  13  L{Sequence} implementation is what you are after if all you need is high performance 
  14  and efficient storage (e.g. when you are parsing big files). L{Sequence} objects store 
  15  their underlying sequences as strings. L{RichSequence}s on the other hand will store 
  16  their residues as L{ResidueInfo} objects, which have the same basic interface as the  
  17  L{csb.bio.structure.Residue} objects. This of course comes at the expense of degraded 
  18  performance. A L{ChainSequence} is a special case of a rich sequence, whose residue 
  19  objects are I{actually} real L{csb.bio.structure.Residue}s. 
  20   
  21  Basic usage: 
  22   
  23      >>> seq = RichSequence('id', 'desc', 'sequence', SequenceTypes.Protein) 
  24      >>> seq.residues[1] 
  25      <ResidueInfo [1]: SER> 
  26      >>> seq.dump(sys.stdout) 
  27      >desc 
  28      SEQUENCE 
  29   
  30  See L{AbstractSequence} for details.     
  31   
  32  Alignments 
  33  ========== 
  34  L{AbstractAlignment} defines a table-like interface to access the data in an 
  35  alignment: 
  36   
  37      >>> ali = SequenceAlignment.parse(">a\\nABC\\n>b\\nA-C") 
  38      >>> ali[0, 0] 
  39      <SequenceAlignment>   # a new alignment, constructed from row #1, column #1 
  40      >>> ali[0, 1:3] 
  41      <SequenceAlignment>   # a new alignment, constructed from row #1, columns #2..#3 
  42   
  43  which is just a shorthand for using the standard 1-based interface: 
  44   
  45      >>> ali.rows[1] 
  46      <AlignedSequenceAdapter: a, 3>                        # row #1 (first sequence) 
  47      >>> ali.columns[1] 
  48      (<ColumnInfo a [1]: ALA>, <ColumnInfo b [1]: ALA>)    # residues at column #1 
  49   
  50  See L{AbstractAlignment} for all details and more examples. 
  51   
  52  There are a number of L{AbstractAlignment} implementations defined here. 
  53  L{SequenceAlignment} is the default one, nothing surprising. L{A3MAlignment} 
  54  is a more special one: the first sequence in the alignment is a master sequence. 
  55  This alignment is usually used in the context of HHpred. More important is the 
  56  L{StructureAlignment}, which is an alignment of L{csb.bio.structure.Chain} objects. 
  57  The residues in every aligned sequence are really the L{csb.bio.structure.Residue} 
  58  objects taken from those chains. 
  59  """ 
  60   
  61  import re 
  62  import csb.core 
  63  import csb.io 
  64   
  65  from abc import ABCMeta, abstractmethod, abstractproperty 
66 67 68 -class AlignmentFormats(csb.core.enum):
69 """ 70 Enumeration of multiple sequence alignment formats 71 """ 72 A3M='a3m'; FASTA='fa'; PIR='pir'
73
74 -class SequenceTypes(csb.core.enum):
75 """ 76 Enumeration of sequence types 77 """ 78 NucleicAcid='NA'; DNA='DNA'; RNA='RNA'; Protein='Protein'; Unknown='Unknown'
79
80 -class AlignmentTypes(csb.core.enum):
81 """ 82 Enumeration of alignment strategies 83 """ 84 Global='global'; Local='local'
85
86 -class NucleicAlphabet(csb.core.enum):
87 """ 88 Nucleic sequence alphabet 89 """ 90 Adenine='A'; Cytosine='C'; Guanine='G'; Thymine='T'; Uracil='U'; Purine='R'; Pyrimidine='Y'; Ketone='K'; 91 Amino='M'; Strong='S'; Weak='W'; NotA='B'; NotC='D'; NotG='H'; NotT='V'; Any='N'; Masked='X'; GAP='-'; INSERTION='.';
92
93 -class ProteinAlphabet(csb.core.enum):
94 """ 95 Protein sequence alphabet 96 """ 97 ALA='A'; ASX='B'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 98 PYL='O'; PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; SEC='U'; VAL='V'; TRP='W'; TYR='Y'; GLX='Z'; UNK='X'; GAP='-'; 99 INSERTION='.'; STOP='*'
100
101 -class StdProteinAlphabet(csb.core.enum):
102 """ 103 Standard protein sequence alphabet 104 """ 105 ALA='A'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 106 PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; VAL='V'; TRP='W'; TYR='Y'
107
108 -class UnknownAlphabet(csb.core.enum):
109 """ 110 Unknown sequence alphabet 111 """ 112 UNK='X'; GAP='-'; INSERTION='.'
113
114 -class SequenceAlphabets(object):
115 """ 116 Sequence alphabet enumerations. 117 118 @note: This class is kept for backwards compatibility. The individual 119 alphabet classes must be defined in the top level namespace, 120 otherwise the new enum types cannot be pickled properly. 121 """ 122 Nucleic = NucleicAlphabet 123 Protein = ProteinAlphabet 124 StdProtein = StdProteinAlphabet 125 Unknown = UnknownAlphabet 126 127 MAP = { SequenceTypes.Protein: ProteinAlphabet, 128 SequenceTypes.NucleicAcid: NucleicAlphabet, 129 SequenceTypes.DNA: NucleicAlphabet, 130 SequenceTypes.RNA: NucleicAlphabet, 131 SequenceTypes.Unknown: UnknownAlphabet } 132 133 assert set(MAP) == csb.core.Enum.members(SequenceTypes) 134 135 @staticmethod
136 - def get(type):
137 """ 138 Get the alphabet corresponding to the specified sequence C{type} 139 """ 140 return SequenceAlphabets.MAP[type]
141
142 143 -class SequenceError(ValueError):
144 pass
145
146 -class PositionError(IndexError):
147
148 - def __init__(self, index=None, start=1, end=None):
149 150 if end == 0: 151 start = 0 152 153 self.index = index 154 self.start = start 155 self.end = end 156 157 super(PositionError, self).__init__(index, start, end)
158
159 - def __str__(self):
160 161 if self.index is not None: 162 s = 'Position {0.index} is out of range [{0.start}, {0.end}]' 163 else: 164 s = 'Out of range [{0.start}, {0.end}]' 165 166 return s.format(self)
167
168 -class SequencePositionError(PositionError):
169 pass
170
171 -class ColumnPositionError(PositionError):
172 pass
173
174 -class SequenceNotFoundError(KeyError):
175 pass
176
177 -class DuplicateSequenceError(KeyError):
178 pass
179
180 -class ResidueInfo(object):
181
182 - def __init__(self, rank, type):
183 184 self._type = None 185 self._rank = rank 186 187 self.type = type
188 189 @property
190 - def type(self):
191 """ 192 Residue type - a member of any sequence alphabet 193 @rtype: enum item 194 """ 195 return self._type
196 @type.setter
197 - def type(self, type):
198 if type.enum not in (ProteinAlphabet, NucleicAlphabet, UnknownAlphabet): 199 raise TypeError(type) 200 self._type = type
201 202 @property
203 - def rank(self):
204 """ 205 Residue position (1-based) 206 @rtype: int 207 """ 208 return self._rank
209
210 - def __repr__(self):
211 return '<{1} [{0.rank}]: {0.type!r}>'.format(self, self.__class__.__name__)
212
213 214 -class ColumnInfo(object):
215
216 - def __init__(self, column, id, rank, residue):
217 218 self.column = column 219 self.id = id 220 self.rank = rank 221 self.residue = residue
222
223 - def __repr__(self):
224 return '<{0.__class__.__name__} {0.id} [{0.column}]: {0.residue.type!r}>'.format(self)
225
226 -class SequenceIndexer(object):
227
228 - def __init__(self, container):
229 self._container = container
230
231 - def __getitem__(self, rank):
232 233 if not 1 <= rank <= self._container.length: 234 raise SequencePositionError(rank, 1, self._container.length) 235 236 return self._container._get(rank)
237
238 - def __iter__(self):
239 return iter(self._container)
240
241 -class UngappedSequenceIndexer(SequenceIndexer):
242
243 - def __getitem__(self, rank):
244 try: 245 return self._container._get_ungapped(rank) 246 except SequencePositionError: 247 raise SequencePositionError(rank, 1)
248
249 - def __iter__(self):
250 for c in self._container: 251 if c.residue.type not in (self._container.alphabet.GAP, self._container.alphabet.INSERTION): 252 yield c.residue
253
254 -class ColumnIndexer(SequenceIndexer):
255
256 - def __getitem__(self, column):
257 258 if not 1 <= column <= self._container.length: 259 raise ColumnPositionError(column, 1, self._container.length) 260 261 return self._container._get_column(column)
262
263 264 -class SequenceCollection(csb.core.ReadOnlyCollectionContainer):
265 """ 266 Represents a list of L{AbstractSequence}s. 267 """ 268
269 - def __init__(self, sequences):
270 super(SequenceCollection, self).__init__(items=sequences, type=AbstractSequence)
271
272 - def to_fasta(self, output_file):
273 """ 274 Dump the whole collection in mFASTA format. 275 276 @param output_file: write the output to this file or stream 277 @type output_file: str or stream 278 """ 279 from csb.bio.io.fasta import FASTAOutputBuilder 280 281 with csb.io.EntryWriter(output_file, close=False) as out: 282 builder = FASTAOutputBuilder(out.stream, headers=True) 283 284 for s in self: 285 builder.add_sequence(s)
286
287 288 -class AbstractSequence(object):
289 """ 290 Base abstract class for all Sequence objects. 291 292 Provides 1-based access to the residues in the sequence via the 293 sequence.residues property. The sequence object itself also behaves like 294 a collection and provides 0-based access to its elements (residues). 295 296 @param id: FASTA ID of this sequence (e.g. accession number) 297 @type id: str 298 @param header: FASTA sequence header 299 @type header: str 300 @param residues: sequence residues 301 @type residues: str or collection of L{ResidueInfo} 302 @param type: a L{SequenceTypes} member (defaults to protein) 303 @type type: L{EnumItem} 304 """ 305 306 __metaclass__ = ABCMeta 307 308 DELIMITER = '>' 309
310 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):
311 312 self._id = None 313 self._header = None 314 self._residues = [] 315 self._type = None 316 317 self.id = id 318 self.header = header 319 self.type = type 320 321 for residue in residues: 322 self._add(residue)
323
324 - def __getitem__(self, spec):
325 326 if isinstance(spec, slice): 327 spec = SliceHelper(spec, 0, self.length) 328 return self.subregion(spec.start + 1, spec.stop) 329 else: 330 if not 0 <= spec < self.length: 331 raise IndexError(spec) 332 return self._get(spec + 1)
333
334 - def __iter__(self):
335 for index in range(self.length): 336 yield self[index]
337 338 @abstractmethod
339 - def _add(self, residue):
340 """ 341 Append a C{residue} to the sequence. 342 343 This is a hook method invoked internally for each residue during object 344 construction. By implementing this method, sub-classes define how 345 residues are attached to the sequence object. 346 """ 347 pass
348 349 @abstractmethod
350 - def _get(self, rank):
351 """ 352 Retrieve the sequence residue at the specified position (1-based, positive). 353 354 This is a hook method which defines the actual behavior of the sequence 355 residue indexer. 356 357 @rtype: L{ResidueInfo} 358 @raise SequencePositionError: when the supplied rank is out of range 359 """ 360 pass
361
362 - def _factory(self, *a, **k):
363 """ 364 Return a new sequence of the current L{AbstractSequence} sub-class. 365 """ 366 return self.__class__(*a, **k)
367
368 - def strip(self):
369 """ 370 Remove all gaps and insertions from the sequence. 371 372 @return: a new sequence instance, containing no gaps 373 @rtype: L{AbstractSequence} 374 """ 375 residues = [r for r in self._residues 376 if r.type not in (self.alphabet.GAP, self.alphabet.INSERTION)] 377 378 return self._factory(self.id, self.header, residues, self.type)
379
380 - def subregion(self, start, end):
381 """ 382 Extract a subsequence, defined by [start, end]. The start and end 383 positions are 1-based, inclusive. 384 385 @param start: start position 386 @type start: int 387 @param end: end position 388 @type end: int 389 390 @return: a new sequence 391 @rtype: L{AbstractSequence} 392 393 @raise SequencePositionError: if start/end positions are out of range 394 """ 395 positions = range(start, end + 1) 396 return self.extract(positions)
397
398 - def extract(self, positions):
399 """ 400 Extract a subsequence, defined by a list of 1-based positions. 401 402 @param positions: positions to extract 403 @type positions: tuple of int 404 405 @return: a new sequence 406 @rtype: L{AbstractSequence} 407 408 @raise SequencePositionError: if any position is out of range 409 """ 410 411 end = self.length 412 residues = [] 413 414 for rank in sorted(set(positions)): 415 if 1 <= rank <= end: 416 residues.append(self._get(rank)) 417 else: 418 raise SequencePositionError(rank, 1, end) 419 420 return self._factory(self.id, self.header, residues, self.type)
421
422 - def dump(self, output_file):
423 """ 424 Dump the sequence in FASTA format. 425 426 @param output_file: write the output to this file or stream 427 @type output_file: str or stream 428 """ 429 from csb.bio.io.fasta import FASTAOutputBuilder 430 431 with csb.io.EntryWriter(output_file, close=False) as out: 432 FASTAOutputBuilder(out.stream, headers=True).add_sequence(self)
433 434 @property
435 - def length(self):
436 """ 437 Number of residues 438 @rtype: int 439 """ 440 return len(self._residues)
441 442 @property
443 - def id(self):
444 """ 445 Sequence identifier 446 @rtype: str 447 """ 448 return self._id
449 @id.setter
450 - def id(self, value):
451 if value is not None: 452 value = str(value).strip() 453 self._id = value
454 455 @property
456 - def header(self):
457 """ 458 Sequence description 459 @rtype: str 460 """ 461 return self._header
462 @header.setter
463 - def header(self, value):
464 if not value: 465 value = 'sequence' 466 else: 467 value = value.strip().lstrip(AbstractSequence.DELIMITER) 468 self._header = value
469 470 @property
471 - def type(self):
472 """ 473 Sequence type - a member of L{SequenceTypes} 474 @rtype: enum item 475 """ 476 return self._type
477 @type.setter
478 - def type(self, value):
479 if isinstance(value, csb.core.string): 480 value = csb.core.Enum.parse(SequenceTypes, value) 481 if value.enum is not SequenceTypes: 482 raise TypeError(value) 483 self._type = value
484 485 @property
486 - def sequence(self):
487 """ 488 The actual sequence 489 @rtype: str 490 """ 491 return ''.join([str(r.type) for r in self._residues])
492 493 @property
494 - def alphabet(self):
495 """ 496 The sequence alphabet corresponding to the current sequence type 497 @rtype: L{csb.core.enum} 498 """ 499 return SequenceAlphabets.get(self._type)
500 501 @property
502 - def residues(self):
503 """ 504 Rank-based access to the underlying L{residues<csb.bio.sequence.ResidueInfo>} 505 @rtype: L{SequenceIndexer} 506 """ 507 return SequenceIndexer(self)
508
509 - def __len__(self):
510 return self.length
511
512 - def __repr__(self):
513 return '<{0.__class__.__name__}: {0.id}, {0.length} residues>'.format(self)
514
515 - def __str__(self):
516 return '{0}{1.header}\n{1.sequence}'.format(AbstractSequence.DELIMITER, self)
517
518 -class Sequence(AbstractSequence):
519 """ 520 High-performance sequence object. The actual sequence is stored internally 521 as a string. The indexer acts as a residue factory, which creates a new 522 L{ResidueInfo} instance each time. 523 524 @note: This class was created with parsing large volumes of data in mind. This 525 comes at the expense of degraded performance of the sequence indexer. 526 527 @param id: FASTA ID of this sequence (e.g. accession number) 528 @type id: str 529 @param header: FASTA sequence header 530 @type header: str 531 @param residues: sequence string 532 @type residues: str 533 @param type: a L{SequenceTypes} member (defaults to protein) 534 @type type: L{EnumItem} 535 """ 536
537 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):
538 539 self._id = None 540 self._header = None 541 self._residues = '' 542 self._type = None 543 544 self.id = id 545 self.header = header 546 self.type = type 547 548 self._append(residues)
549
550 - def _append(self, string):
551 # this seems to be the fastest method for sanitization and storage 552 self._residues += re.sub('([^\w\-\.])+', '', string)
553
554 - def _add(self, char):
555 self._append(char)
556
557 - def _get(self, rank):
558 559 type = csb.core.Enum.parse(self.alphabet, self._residues[rank - 1]) 560 return ResidueInfo(rank, type)
561
562 - def strip(self):
563 residues = self._residues.replace( 564 str(self.alphabet.GAP), '').replace( 565 str(self.alphabet.INSERTION), '') 566 return self._factory(self.id, self.header, residues, self.type)
567
568 - def subregion(self, start, end):
569 570 if not 1 <= start <= end <= self.length: 571 raise SequencePositionError(None, 1, self.length) 572 573 residues = self._residues[start - 1 : end] 574 return self._factory(self.id, self.header, residues, self.type)
575
576 - def extract(self, positions):
577 578 end = self.length 579 residues = [] 580 581 for rank in sorted(set(positions)): 582 if 1 <= rank <= end: 583 residues.append(self._residues[rank - 1]) 584 else: 585 raise SequencePositionError(rank, 1, end) 586 587 return self._factory(self.id, self.header, ''.join(residues), self.type)
588 589 @property
590 - def sequence(self):
591 return self._residues
592
593 -class RichSequence(AbstractSequence):
594 """ 595 Sequence implementation, which converts the sequence into a list of 596 L{ResidueInfo} objects. See L{AbstractSequence} for details. 597 """ 598
599 - def _add(self, residue):
600 601 if hasattr(residue, 'rank') and hasattr(residue, 'type'): 602 self._residues.append(residue) 603 604 else: 605 if residue.isalpha() or residue in (self.alphabet.GAP, self.alphabet.INSERTION): 606 607 type = csb.core.Enum.parse(self.alphabet, residue) 608 rank = len(self._residues) + 1 609 self._residues.append(ResidueInfo(rank, type))
610
611 - def _get(self, rank):
612 return self._residues[rank - 1]
613 614 @staticmethod
615 - def create(sequence):
616 """ 617 Create a new L{RichSequence} from existing L{AbstractSequence}. 618 619 @type sequence: L{AbstractSequence} 620 @rtype: L{RichSequence} 621 """ 622 return RichSequence( 623 sequence.id, sequence.header, sequence.sequence, sequence.type)
624
625 -class ChainSequence(AbstractSequence):
626 """ 627 Sequence view for L{csb.bio.structure.Chain} objects. 628 See L{AbstractSequence} for details. 629 """ 630
631 - def _add(self, residue):
632 633 if not (hasattr(residue, 'rank') and hasattr(residue, 'type')): 634 raise TypeError(residue) 635 else: 636 self._residues.append(residue)
637
638 - def _get(self, rank):
639 return self._residues[rank - 1]
640 641 @staticmethod
642 - def create(chain):
643 """ 644 Create a new L{ChainSequence} from existing L{Chain} instance. 645 646 @type chain: L{csb.bio.structure.Chain} 647 @rtype: L{ChainSequence} 648 """ 649 return ChainSequence( 650 chain.entry_id, chain.header, chain.residues, chain.type)
651
652 653 -class SequenceAdapter(object):
654 """ 655 Base wrapper class for L{AbstractSequence} objects. 656 Needs to be sub-classed (does not do anything special on its own). 657 658 @param sequence: adaptee 659 @type sequence: L{AbstractSequence} 660 """ 661
662 - def __init__(self, sequence):
663 664 if not isinstance(sequence, AbstractSequence): 665 raise TypeError(sequence) 666 667 self._subject = sequence
668
669 - def __getitem__(self, i):
670 return self._subject[i]
671
672 - def __iter__(self):
673 return iter(self._subject)
674
675 - def __repr__(self):
676 return '<{0.__class__.__name__}: {0.id}, {0.length}>'.format(self)
677
678 - def __str__(self):
679 return str(self._subject)
680
681 - def _add(self):
682 raise NotImplementedError()
683
684 - def _get(self, rank):
685 return self._subject._get(rank)
686
687 - def _factory(self, *a, **k):
688 return self.__class__(self._subject._factory(*a, **k))
689
690 - def strip(self):
691 return self._subject.strip()
692
693 - def subregion(self, start, end):
694 return self._subject.subregion(start, end)
695
696 - def extract(self, positions):
697 return self._subject.extract(positions)
698 699 @property
700 - def id(self):
701 return self._subject.id
702 703 @property
704 - def length(self):
705 return self._subject.length
706 707 @property
708 - def type(self):
709 return self._subject.type
710 711 @property
712 - def header(self):
713 return self._subject.header
714 715 @property
716 - def sequence(self):
717 return self._subject.sequence
718 719 @property
720 - def alphabet(self):
721 return self._subject.alphabet
722
723 -class AlignedSequenceAdapter(SequenceAdapter):
724 """ 725 Adapter, which wraps a gapped L{AbstractSequence} object and makes it 726 compatible with the MSA row/entry interface, expected by L{AbstractAlignment}. 727 728 The C{adapter.residues} property operates with an L{UngappedSequenceIndexer}, 729 which provides a gap-free view of the underlying sequence. 730 731 The C{adapter.columns} property operates with a standard L{ColumnIndexer}, 732 the same indexer which is used to provide the column view in multiple 733 alignments. Adapted sequences therefore act as alignment rows and allow for 734 MSA-column-oriented indexing. 735 736 @param sequence: adaptee 737 @type sequence: L{AbstractSequence} 738 """ 739
740 - def __init__(self, sequence):
741 742 super(AlignedSequenceAdapter, self).__init__(sequence) 743 744 self._fmap = {} 745 self._rmap = {} 746 rank = 0 747 748 for column, residue in enumerate(sequence, start=1): 749 750 if residue.type not in (self.alphabet.GAP, self.alphabet.INSERTION): 751 rank += 1 752 self._fmap[column] = rank 753 self._rmap[rank] = column 754 else: 755 self._fmap[column] = None
756
757 - def __getitem__(self, index):
758 if not 0 <= index < self.length: 759 raise IndexError(index) 760 return self._get_column(index + 1)
761
762 - def __iter__(self):
763 for c in sorted(self._fmap): 764 yield self._get_column(c)
765 766 @property
767 - def columns(self):
768 """ 769 Provides 1-based access to the respective columns in the MSA. 770 @rtype: L{ColumnIndexer} 771 """ 772 return ColumnIndexer(self)
773 774 @property
775 - def residues(self):
776 """ 777 Provides 1-based access to the residues of the unaligned (ungapped) 778 sequence. 779 @rtype: L{UngappedSequenceIndexer} 780 """ 781 return UngappedSequenceIndexer(self)
782
783 - def _get_column(self, column):
784 return ColumnInfo( 785 column, self.id, self._fmap[column], self._subject.residues[column])
786
787 - def _get_ungapped(self, rank):
788 return self._subject.residues[self._rmap[rank]]
789
790 - def map_residue(self, rank):
791 """ 792 Return the MSA column number corresponding to the specified ungapped 793 sequence C{rank}. 794 795 @param rank: 1-based residue rank 796 @type rank: int 797 @rtype: int 798 """ 799 return self._rmap[rank]
800
801 - def map_column(self, column):
802 """ 803 Return the ungapped sequence rank corresponding to the specified MSA 804 C{column} number. 805 806 @param column: 1-based alignment column number 807 @type column: int 808 @rtype: int 809 """ 810 return self._fmap[column]
811
812 -class SliceHelper(object):
813
814 - def __init__(self, slice, start=0, stop=0):
815 816 s, e, t = slice.start, slice.stop, slice.step 817 818 if s is None: 819 s = start 820 if e is None: 821 e = stop 822 if t is None: 823 t = 1 824 825 for value in [s, e, t]: 826 if value < 0: 827 raise IndexError(value) 828 829 self.start = s 830 self.stop = e 831 self.step = t
832
833 -class AlignmentRowsTable(csb.core.BaseDictionaryContainer):
834
835 - def __init__(self, container):
836 837 super(AlignmentRowsTable, self).__init__() 838 839 self._container = container 840 self._map = {}
841
842 - def __getitem__(self, item):
843 844 try: 845 if isinstance(item, int): 846 key = self._map[item] 847 else: 848 key = item 849 850 return super(AlignmentRowsTable, self).__getitem__(key) 851 852 except KeyError: 853 raise SequenceNotFoundError(item)
854
855 - def _append(self, sequence):
856 857 n = 0 858 sequence_id = sequence.id 859 860 while sequence_id in self: 861 n += 1 862 sequence_id = '{0}:A{1}'.format(sequence.id, n) 863 864 super(AlignmentRowsTable, self)._append_item(sequence_id, sequence) 865 self._map[self.length] = sequence_id
866
867 - def __iter__(self):
868 for id in super(AlignmentRowsTable, self).__iter__(): 869 yield self[id]
870
871 872 -class AbstractAlignment(object):
873 """ 874 Base class for all alignment objects. 875 876 Provides 1-based access to the alignment.rows and alignment.columns. 877 Alignment rows can also be accessed by sequence ID. In addition, all 878 alignments support 0-based slicing: 879 880 >>> alignment[rows, columns] 881 AbstractAlignment (sub-alignment) 882 883 where 884 - C{rows} can be a slice, tuple of row indexes or tuple of sequence IDs 885 - columns can be a slice or tuple of column indexes 886 887 For example: 888 889 >>> alignment[:, 2:] 890 AbstractAlignment # all rows, columns [3, alignment.length] 891 >>> alignment[(0, 'seqx'), (3, 5)] 892 AbstractAlignment # rows #1 and 'seq3', columns #4 and #5 893 894 @param sequences: alignment entries (must have equal length) 895 @type sequences: list of L{AbstractSequence}s 896 @param strict: if True, raise {DuplicateSequenceError} when a duplicate ID 897 is found (default=True) 898 @type strict: bool 899 900 @note: if C{strict} is False and there are C{sequences} with redundant identifiers, 901 those sequences will be added to the C{rows} collection with :An suffix, 902 where n is a serial number. Therefore, rows['ID'] will return only one sequence, 903 the first sequence with id=ID. All remaining sequences can be retrieved 904 with C{rows['ID:A1']}, {rows['ID:A2']}, etc. However, the sequence objects will 905 remain intact, e.g. {rows['ID:A1'].id} still returns 'ID' and not 'ID:A1'. 906 """ 907 908 __metaclass__ = ABCMeta 909
910 - def __init__(self, sequences, strict=True):
911 912 self._length = None 913 self._msa = AlignmentRowsTable(self) 914 self._colview = ColumnIndexer(self) 915 self._map = {} 916 self._strict = bool(strict) 917 918 self._construct(sequences)
919
920 - def __getitem__(self, spec):
921 922 # The following code can hardly get more readable than that, sorry. 923 # Don't even think of modifying this before there is a 100% unit test coverage 924 925 # 0. expand the input tuple: (rows/, columns/) => (rows, columns) 926 if not isinstance(spec, tuple) or len(spec) not in (1, 2): 927 raise TypeError('Invalid alignment slice expression') 928 929 if len(spec) == 2: 930 rowspec, colspec = spec 931 else: 932 rowspec, colspec = [spec, slice(None)] 933 934 # 1. interpret the row slice: int, iter(int), iter(str) or slice(int) => list(int, 1-based) 935 if isinstance(rowspec, slice): 936 if isinstance(rowspec.start, csb.core.string) or isinstance(rowspec.stop, csb.core.string): 937 raise TypeError("Invalid row slice: only indexes are supported") 938 rowspec = SliceHelper(rowspec, 0, self.size) 939 rows = range(rowspec.start + 1, rowspec.stop + 1) 940 elif isinstance(rowspec, int): 941 rows = [rowspec + 1] 942 elif csb.core.iterable(rowspec): 943 try: 944 rows = [] 945 for r in rowspec: 946 if isinstance(r, int): 947 rows.append(r + 1) 948 else: 949 rows.append(self._map[r]) 950 except KeyError as ke: 951 raise KeyError('No such Sequence ID: {0!s}'.format(ke)) 952 else: 953 raise TypeError('Unsupported row expression') 954 955 # 2. interpret the column slice: int, iter(int) or slice(int) => list(int, 1-based) 956 if isinstance(colspec, slice): 957 colspec = SliceHelper(colspec, 0, self._length or 0) 958 cols = range(colspec.start + 1, colspec.stop + 1) 959 elif isinstance(colspec, int): 960 cols = [colspec + 1] 961 elif csb.core.iterable(colspec): 962 try: 963 cols = [ c + 1 for c in colspec ] 964 except: 965 raise TypeError('Unsupported column expression') 966 else: 967 raise TypeError('Unsupported column expression') 968 969 # 3. some more checks 970 if len(rows) == 0: 971 raise ValueError("The expression returns zero rows") 972 if len(cols) == 0: 973 raise ValueError("The expression returns zero columns") 974 975 # 4. we are done 976 return self._extract(rows, cols)
977
978 - def _range(self, slice, start, end):
979 980 s, e, t = slice.start, slice.end, slice.step 981 982 if s is None: 983 s = start 984 if e is None: 985 e = end 986 if t is None: 987 t = 1 988 989 return range(s, e, t)
990
991 - def __iter__(self):
992 for cn in range(1, self.length + 1): 993 yield self._get_column(cn)
994 995 @abstractmethod
996 - def _construct(self, sequences):
997 """ 998 Hook method, called internally upon object construction. Subclasses 999 define how the source alignment sequences are handled during alignment 1000 construction. 1001 1002 @param sequences: alignment entries 1003 @type sequences: list of L{AbstractSequence}s 1004 """ 1005 pass
1006
1007 - def _initialize(self, rep_sequence):
1008 """ 1009 Hook method, which is used to initialize various alignment properties 1010 (such as length) from the first alignned sequence. 1011 """ 1012 if rep_sequence.length == 0: 1013 raise SequenceError("Sequence '{0}' is empty".format(rep_sequence.id)) 1014 1015 assert self._length is None 1016 self._length = rep_sequence.length
1017
1018 - def _factory(self, *a, **k):
1019 """ 1020 Return a new sequence of the current L{AbstractAlignment} sub-class. 1021 """ 1022 return self.__class__(*a, **k)
1023
1024 - def add(self, sequence):
1025 """ 1026 Append a new sequence to the alignment. 1027 1028 @type sequence: L{AbstractSequence} 1029 @raise SequenceError: if the new sequence is too short/long 1030 @raise DuplicateSequenceError: if a sequence with same ID already exists 1031 """ 1032 1033 if self._msa.length == 0: 1034 self._initialize(sequence) 1035 1036 if sequence.length != self._length: 1037 raise SequenceError('{0!r} is not of the expected length'.format(sequence)) 1038 1039 if self._strict and sequence.id in self._msa: 1040 raise DuplicateSequenceError(sequence.id) 1041 1042 self._msa._append(AlignedSequenceAdapter(sequence)) 1043 self._map[sequence.id] = self._msa.length
1044 1045 @property
1046 - def length(self):
1047 """ 1048 Number of columns in the alignment 1049 @rtype: int 1050 """ 1051 return self._length or 0
1052 1053 @property
1054 - def size(self):
1055 """ 1056 Number of rows (sequences) in the alignment 1057 @rtype: int 1058 """ 1059 return self._msa.length
1060 1061 @property
1062 - def rows(self):
1063 """ 1064 1-based access to the alignment entries (sequences) 1065 @rtype: L{AlignmentRowsTable} 1066 """ 1067 return self._msa
1068 1069 @property
1070 - def columns(self):
1071 """ 1072 1-based access to the alignment columns 1073 @rtype: L{ColumnIndexer} 1074 """ 1075 return self._colview
1076
1077 - def gap_at(self, column):
1078 """ 1079 Return True of C{column} contains at least one gap. 1080 @param column: column number, 1-based 1081 @type column: int 1082 1083 @rtype: bool 1084 """ 1085 1086 for row in self._msa: 1087 if row.columns[column].residue.type == row.alphabet.GAP: 1088 return True 1089 1090 return False
1091
1092 - def _get_column(self, column):
1093 return tuple(row._get_column(column) for row in self.rows)
1094
1095 - def _extract(self, rows, cols):
1096 1097 rows = set(rows) 1098 cols = set(cols) 1099 1100 if not 1 <= min(rows) <= max(rows) <= self.size: 1101 raise IndexError('Row specification out of range') 1102 1103 if not 1 <= min(cols) <= max(cols) <= self.length: 1104 raise IndexError('Column specification out of range') 1105 1106 sequences = [] 1107 1108 for rn, row in enumerate(self.rows, start=1): 1109 if rn in rows: 1110 sequences.append(row.extract(cols)) 1111 1112 return self._factory(sequences, strict=self._strict)
1113
1114 - def subregion(self, start, end):
1115 """ 1116 Extract a sub-alignment, ranging from C{start} to C{end} columns. 1117 1118 @param start: starting column, 1-based 1119 @type start: int 1120 @param end: ending column, 1-based 1121 @type end: int 1122 1123 @return: a new alignment of the current type 1124 @rtype: L{AbstractAlignment} 1125 1126 @raise ColumnPositionError: if start/end is out of range 1127 """ 1128 if not 1 <= start <= end <= self.length: 1129 raise ColumnPositionError(None, 1, self.length) 1130 1131 sequences = [] 1132 1133 for row in self.rows: 1134 sequences.append(row.subregion(start, end)) 1135 1136 return self._factory(sequences, strict=self._strict)
1137
1138 - def format(self, format=AlignmentFormats.FASTA, headers=True):
1139 """ 1140 Format the alignment as a string. 1141 1142 @param format: alignment format type, member of L{AlignmentFormats} 1143 @type format: L{EnumItem} 1144 @param headers: if False, omit headers 1145 @type headers: bool 1146 1147 @rtype: str 1148 """ 1149 from csb.bio.io.fasta import OutputBuilder 1150 1151 temp = csb.io.MemoryStream() 1152 1153 try: 1154 builder = OutputBuilder.create(format, temp, headers=headers) 1155 builder.add_alignment(self) 1156 1157 return temp.getvalue() 1158 1159 finally: 1160 temp.close()
1161
1162 -class SequenceAlignment(AbstractAlignment):
1163 """ 1164 Multiple sequence alignment. See L{AbstractAlignment} for details. 1165 """ 1166
1167 - def _construct(self, sequences):
1168 1169 for sequence in sequences: 1170 self.add(sequence)
1171 1172 @staticmethod
1173 - def parse(string, strict=True):
1174 """ 1175 Create a new L{SequenceAlignment} from an mFASTA string. 1176 1177 @param string: MSA-formatted string 1178 @type string: str 1179 @param strict: see L{AbstractAlignment} 1180 @type strict: bool 1181 1182 @rtype: L{SequenceAlignment} 1183 """ 1184 from csb.bio.io.fasta import SequenceAlignmentReader 1185 return SequenceAlignmentReader(strict=strict).read_fasta(string)
1186
1187 -class StructureAlignment(AbstractAlignment):
1188 """ 1189 Multiple structure alignment. Similar to a L{SequenceAlignment}, but 1190 the alignment holds the actual L{csb.bio.structure.ProteinResidue} objects, 1191 taken from the corresponding source L{csb.bio.structure.Chain}s. 1192 1193 See L{AbstractAlignment} for details. 1194 """ 1195
1196 - def _construct(self, sequences):
1197 1198 for sequence in sequences: 1199 self.add(sequence)
1200 1201 @staticmethod
1202 - def parse(string, provider, id_factory=None, strict=True):
1203 """ 1204 Create a new L{StructureAlignment} from an mFASTA string. See 1205 L{csb.bio.io.fasta.StructureAlignmentFactory} for details. 1206 1207 @param string: MSA-formatted string 1208 @type string: str 1209 @param provider: data source for all structures found in the alignment 1210 @type provider: L{csb.bio.io.wwpdb.StructureProvider} 1211 @param strict: see L{AbstractAlignment} 1212 @type strict: bool 1213 @param id_factory: callable factory, which transforms a sequence ID into 1214 a L{csb.bio.io.wwpdb.EntryID} object. By default 1215 this is L{csb.bio.io.wwpdb.EntryID.create}. 1216 @type id_factory: callable 1217 @rtype: L{StructureAlignment} 1218 """ 1219 from csb.bio.io.fasta import StructureAlignmentFactory 1220 1221 factory = StructureAlignmentFactory( 1222 provider, id_factory=id_factory, strict=strict) 1223 return factory.make_alignment(string)
1224
1225 -class A3MAlignment(AbstractAlignment):
1226 """ 1227 A specific type of multiple alignment, which provides some operations 1228 relative to a master sequence (the first entry in the alignment). 1229 """ 1230
1231 - def __init__(self, sequences, strict=True):
1232 1233 self._master = None 1234 self._matches = 0 1235 self._insertions = set() 1236 1237 super(A3MAlignment, self).__init__(sequences, strict=strict)
1238
1239 - def _initialize(self, rep_sequence):
1240 1241 super(A3MAlignment, self)._initialize(rep_sequence) 1242 self._alphabet = rep_sequence.alphabet
1243
1244 - def _construct(self, sequences):
1245 1246 for sequence in sequences: 1247 1248 self.add(sequence) 1249 1250 for rank, residue in enumerate(sequence, start=1): 1251 if residue.type == self._alphabet.INSERTION: 1252 self._insertions.add(rank) 1253 1254 if self.size == 0: 1255 raise SequenceError("At least one sequence is required") 1256 1257 self._master = list(self._msa)[0] 1258 self._matches = self._master.strip().length
1259 1260 @property
1261 - def master(self):
1262 """ 1263 The master sequence 1264 @rtype: L{AbstractSequence} 1265 """ 1266 return self._master
1267
1268 - def insertion_at(self, column):
1269 """ 1270 Return True of C{column} contains at least one insertion. 1271 1272 @param column: column number, 1-based 1273 @type column: int 1274 @rtype: bool 1275 """ 1276 return column in self._insertions
1277
1278 - def hmm_subregion(self, match_start, match_end):
1279 """ 1280 Same as L{AbstractAlignment.subregion}, but start/end positions are 1281 ranks in the ungapped master sequence. 1282 """ 1283 1284 if not 1 <= match_start <= match_end <= self.matches: 1285 raise ColumnPositionError(None, 1, self.matches) 1286 1287 start = self._master.map_residue(match_start) 1288 end = self._master.map_residue(match_end) 1289 1290 return self.subregion(start, end)
1291
1292 - def format(self, format=AlignmentFormats.A3M, headers=True):
1293 return super(A3MAlignment, self).format(format, headers)
1294 1295 @property
1296 - def matches(self):
1297 """ 1298 Number of match states (residues in the ungapped master). 1299 @rtype: int 1300 """ 1301 return self._matches
1302 1303 @staticmethod
1304 - def parse(string, strict=True):
1305 """ 1306 Create a new L{A3MAlignment} from an A3M string. 1307 1308 @param string: MSA-formatted string 1309 @type string: str 1310 @param strict: see L{AbstractAlignment} 1311 @type strict: bool 1312 1313 @rtype: L{A3MAlignment} 1314 """ 1315 from csb.bio.io.fasta import SequenceAlignmentReader 1316 return SequenceAlignmentReader(strict=strict).read_a3m(string)
1317