Package csb :: Package apps :: Module precision
[frames] | no frames]

Source Code for Module csb.apps.precision

  1  """ 
  2  Measure the precision and coverage of a fragment library stored in Rosetta 
  3  NNmake format. 
  4  """ 
  5   
  6  import os 
  7  import multiprocessing 
  8  import matplotlib 
  9   
 10  import csb.apps 
 11   
 12  import csb.bio.io.wwpdb  as wwpdb 
 13  import csb.bio.structure as structure 
 14  import csb.bio.fragments 
 15  import csb.bio.fragments.rosetta as rosetta 
 16  import csb.core 
 17  import csb.io.plots 
18 19 20 -class ExitCodes(csb.apps.ExitCodes):
21 22 IO_ERROR = 2 23 INVALID_DATA = 3
24
25 -class AppRunner(csb.apps.AppRunner):
26 27 @property
28 - def target(self):
29 return PrecisionApp
30
31 - def command_line(self):
32 33 cpu = multiprocessing.cpu_count() 34 cmd = csb.apps.ArgHandler(self.program, __doc__) 35 36 cmd.add_scalar_option('pdb', 'p', str, 'the PDB database (a directory containing all PDB files)', required=True) 37 cmd.add_scalar_option('native', 'n', str, 'native structure of the target (PDB file)', required=True) 38 cmd.add_scalar_option('chain', 'c', str, 'chain identifier (if not specified, the first chain)', default=None) 39 40 cmd.add_scalar_option('top', 't', int, 'read top N fragments per position', default=25) 41 cmd.add_scalar_option('cpu', 'C', int, 'maximum degree of parallelism', default=cpu) 42 cmd.add_scalar_option('rmsd', 'r', float, 'RMSD cutoff for precision and coverage', default=1.5) 43 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.') 44 45 cmd.add_boolean_option('save-structures', 's', 'create a PDB file for each fragment, superimposed over the native', default=False) 46 47 cmd.add_positional_argument('library', str, 'Fragment library file in Rosetta NNmake format') 48 49 return cmd
50
51 -class PrecisionApp(csb.apps.Application):
52
53 - def main(self):
54 55 if not os.path.isdir(self.args.output): 56 PrecisionApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True) 57 58 for file in [self.args.native, self.args.library]: 59 if not os.path.isfile(file): 60 PrecisionApp.exit('File not found: {0}'.format(file), code=ExitCodes.INVALID_DATA, usage=True) 61 62 self.log('\nLoading {0.library} (top {0.top} per position)... '.format(self.args), ending='') 63 64 try: 65 library = rosetta.RosettaFragmentMap.read(self.args.library, top=self.args.top) 66 self.log('{0} fragments'.format(len(library))) 67 68 try: 69 native = wwpdb.RegularStructureParser(self.args.native).parse_structure() 70 native.accession = native.accession[:4] 71 if self.args.chain: 72 native = native.chains[self.args.chain] 73 else: 74 native = native.first_chain 75 except (structure.Broken3DStructureError, wwpdb.PDBParseError) as se: 76 raise ArgumentError("Can't parse native structure: " + str(se)) 77 78 except (csb.core.ItemNotFoundError) as ce: 79 raise ArgumentError("Chain {0!s} not found in native structure".format(ce)) 80 81 self.log('Superimposing fragments...\n') 82 si = LibrarySuperimposer(native, library, self.args.pdb, self.args.output, 83 save=self.args.save_structures, cutoff=self.args.rmsd) 84 matches = si.run(self.args.cpu) 85 86 info = si.plot(matches) 87 self.log(' RMSD Cutoff {0:>6.2f} A'.format(self.args.rmsd)) 88 self.log(' AVG Precision {0.precision:>6.2f} %'.format(info)) 89 self.log(' Coverage {0.coverage:>6.2f} %'.format(info)) 90 self.log(' RW Precision {0.figure}'.format(si)) 91 92 93 94 except ArgumentIOError as ioe: 95 PrecisionApp.exit(str(ioe), code=ExitCodes.INVALID_DATA) 96 97 except ArgumentError as ae: 98 PrecisionApp.exit(str(ae), code=ExitCodes.INVALID_DATA) 99 100 self.log('\nDone.')
101
102 -class ArgumentError(ValueError):
103 pass
104
105 -class ArgumentIOError(ArgumentError):
106 pass
107
108 -class GlobalInfo(object):
109
110 - def __init__(self, precision, coverage):
111 self.precision = precision 112 self.coverage = coverage
113
114 - def __str__(self):
115 return '{0.precision:6.2f} {0.coverage:6.2f}'.format(self)
116
117 - def __sub__(self, rhs):
118 return GlobalInfo(self.precision - rhs.precision, self.coverage - rhs.coverage)
119
120 -class LibrarySuperimposer(object):
121
122 - def __init__(self, native, library, pdb, output, save=False, cutoff=1.5):
123 124 if not isinstance(native, structure.Chain): 125 raise TypeError(native) 126 elif native.length < 1: 127 raise ArgumentError('The native chain has no residues') 128 129 if not isinstance(library, rosetta.RosettaFragmentMap): 130 raise TypeError(library) 131 132 for i in [pdb, output]: 133 if not os.path.isdir(i): 134 raise ArgumentIOError("{0} is not a valid directory".format(i)) 135 136 self._pdb = pdb 137 self._native = native 138 self._library = library 139 self._output = os.path.abspath(output) 140 self._tab = os.path.join(self._output, native.entry_id + '.fragments.tab') 141 self._figure = os.path.join(self._output, native.entry_id + '.precision.png') 142 self._out = open(self._tab, 'w') 143 self._save = bool(save) 144 self._cutoff = float(cutoff)
145
146 - def __del__(self):
147 try: 148 if self._out and not self._out.closed: 149 self._out.close() 150 except: 151 pass
152 153 @property
154 - def table(self):
155 return self._tab
156 157 @property
158 - def figure(self):
159 return self._figure
160
161 - def run(self, cpu=1):
162 163 tasks = [] 164 matches = [] 165 save = None 166 if self._save: 167 save = self._output 168 169 pool = multiprocessing.Pool(cpu) 170 171 for source in self._library.sources: 172 fragments = self._library.fromsource(source) 173 task = pool.apply_async(rmsd, [self._native, source, fragments, self._pdb, save]) 174 tasks.append(task) 175 176 for task in tasks: 177 for match in task.get(): 178 if isinstance(match, csb.core.string): # error 179 self._out.write(' [E] ') 180 self._out.write(match.rstrip()) 181 self._out.write('\n') 182 else: 183 line = '{0.id}\t{0.qstart}\t{0.qend}\t{0.length}\t{0.rmsd}\n'.format(match) 184 self._out.write(line) 185 matches.append(match) 186 187 return matches
188
189 - def plot(self, matches):
190 191 residues = range(1, self._native.length + 1) 192 precision = [] 193 precision2 = [] 194 background = [] 195 covered = set() 196 197 all = {} 198 positive = {} 199 200 for rank in residues: 201 all[rank] = 0 202 positive[rank] = 0 203 204 for match in matches: 205 for rank in range(match.qstart, match.qend + 1): 206 all[rank] += 1 207 208 if match.rmsd <= self._cutoff: 209 positive[rank] += 1 210 covered.add(rank) 211 212 assert sorted(all.keys()) == sorted(positive.keys()) == residues 213 214 for rank in residues: 215 if all[rank] == 0: 216 precision2.append(0) 217 background.append(0) 218 else: 219 p = positive[rank] * 100.0 / all[rank] 220 precision.append(p) 221 precision2.append(p) 222 background.append(100) 223 224 coverage = len(covered) * 100.0 / len(residues) 225 if len(precision) > 0: 226 avg_precision = sum(precision) / len(precision) 227 else: 228 avg_precision = 0 229 230 with csb.io.plots.Chart() as chart: 231 232 chart.plot.bar(residues, background, color='#FFB0B0', linewidth=None, edgecolor='#FFB0B0') 233 chart.plot.bar(residues, precision2, color='#50A6DA', linewidth=None, edgecolor='#50A6DA') 234 235 chart.plot.set_title(self._native.entry_id) 236 chart.plot.set_xlabel('Residue') 237 chart.plot.set_xlim(1, len(residues)) 238 chart.plot.set_ylabel('Precision, %') 239 chart.plot.set_ylim(0, 100) 240 241 xaxis = chart.plot.axes.xaxis 242 xaxis.set_minor_locator(matplotlib.ticker.IndexLocator(1, 0)) 243 xaxis.set_major_locator(matplotlib.ticker.IndexLocator(5, 0)) 244 245 try: 246 chart.width = 15 247 chart.height = 5.5 248 249 chart.save(self._figure) 250 251 except IOError as io: 252 raise ArgumentIOError("Can't save figure: " + str(io)) 253 254 return GlobalInfo(avg_precision, coverage)
255
256 -def rmsd(target, source, fragments, pdb, save=None):
257 258 matches = [] 259 260 try: 261 src_file = wwpdb.find(source, [pdb]) 262 if src_file is None: 263 raise IOError("Can't find structure {0} in {1}".format(source, pdb)) 264 265 src_structure = wwpdb.RegularStructureParser(src_file).parse_structure() 266 267 except (IOError, structure.Broken3DStructureError, wwpdb.PDBParseError) as ex: 268 matches.append('Error parsing {0:5}: {1!s}'.format(source, ex)) 269 return matches 270 271 for fn, fragment in enumerate(fragments): 272 try: 273 if fragment.chain not in ('_', '', None): 274 src_chain = src_structure.chains[fragment.chain] 275 else: 276 src_chain = src_structure.first_chain 277 278 try: 279 query = target.subregion(fragment.qstart, fragment.qend) 280 subject = src_chain.subregion(fragment.start, fragment.end, clone=True) 281 282 except IndexError: 283 matches.append('Fragment {1.source_id:>5} {0.start:>4}-{0.end:>4} is out of range'.format(fragment)) 284 continue 285 286 si = query.align(subject) 287 match = csb.bio.fragments.FragmentMatch(fragment.id, qstart=fragment.qstart, qend=fragment.qend, 288 probability=None, rmsd=si.rmsd, tm_score=None, qlength=target.length) 289 matches.append(match) 290 291 if save: 292 dummy = structure.Structure(subject.entry_id) 293 dummy.chains.append(subject) 294 filename = '{0.qstart:}-{0.qend}-{0.source_id}.{1.entry_id}{2}.frag'.format(fragment, query, fn or '') 295 dummy.to_pdb(os.path.join(save, filename)) 296 297 except (structure.Broken3DStructureError, IOError) as ex: 298 matches.append("Can't superimpose fragment {0}: {1!s}".format(fragment.id, ex)) 299 continue 300 301 return matches
302 303 304 305 if __name__ == '__main__': 306 307 AppRunner().run() 308