ProteoWizard
Serializer_pepXML_Test.cpp
Go to the documentation of this file.
1 //
2 // $Id: Serializer_pepXML_Test.cpp 4129 2012-11-20 00:05:37Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2010 Vanderbilt University - Nashville, TN 37232
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
21 
22 
23 #include "Serializer_pepXML.hpp"
24 #include "Diff.hpp"
25 #include "References.hpp"
26 #include "examples.hpp"
31 #include "TextWriter.hpp"
32 #include "boost/range/adaptor/transformed.hpp"
33 #include "boost/range/algorithm/max_element.hpp"
34 #include "boost/range/algorithm/min_element.hpp"
35 #include <cstring>
36 
37 
38 using namespace pwiz::identdata;
39 using namespace pwiz::identdata::examples;
40 using namespace pwiz::util;
41 namespace proteome = pwiz::proteome;
42 
// Optional diagnostic stream. Null by default; main() points it at cout when the first argument is "-v".
// Code in this file tests it for null before writing to it.
43 ostream* os_ = 0;
44 
46 {
// Functor body; the declaring line is not present in this listing. Going by its use with
// boost::adaptors::transformed further down, this is presumably EnzymePtr_specificity.
// result_type names the call operator's return type (presumably for the adaptor's benefit - confirm).
47  typedef int result_type;
// Returns the pointed-to enzyme's terminalSpecificity as an int.
48  int operator()(const EnzymePtr& x) const {return x->terminalSpecificity;}
49 };
50 
52 {
// Functor body; the declaring line is not present in this listing. Going by its use with
// boost::adaptors::transformed further down, this is presumably EnzymePtr_missedCleavages.
// result_type names the call operator's return type (presumably for the adaptor's benefit - confirm).
53  typedef int result_type;
// Returns the pointed-to enzyme's missedCleavages value.
54  int operator()(const EnzymePtr& x) const {return x->missedCleavages;}
55 };
56 
58 {
// Body of what the listing's index calls stripUnmappedMetadata(IdentData& mzid); the signature line
// itself is not present here. It edits mzid in place, clearing or normalizing the fields that the
// inline comments below say pepXML cannot represent.
59  mzid.bibliographicReference.clear();
60  mzid.analysisSampleCollection.samples.clear();
61  mzid.auditCollection.clear();
62  mzid.provider = Provider();
63  mzid.dataCollection.inputs.sourceFile.clear();
64 
// Drop the URI, customizations and contact role from every analysis software entry.
65  BOOST_FOREACH(AnalysisSoftwarePtr& as, mzid.analysisSoftwareList)
66  {
67  as->URI.clear();
68  as->customizations.clear();
69  as->contactRolePtr.reset();
70  }
71 
// NOTE(review): `sip` is used below, but its declaration is not visible in this listing
// (listing line 72 is missing) - confirm what it refers to.
73 
74  // pepXML only provides a single min_number_termini and max_num_internal_cleavages for all enzymes
// NOTE(review): dereferencing the min_element/max_element results assumes the enzyme list is non-empty - confirm.
75  int minSpecificity = *boost::range::min_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_specificity()));
76  int maxMissedCleavages = *boost::range::max_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_missedCleavages()));
// Give every enzyme the lowest specificity and the highest missed-cleavage count found above.
77  BOOST_FOREACH(const EnzymePtr& ez, sip.enzymes.enzymes)
78  {
79  ez->terminalSpecificity = (proteome::Digestion::Specificity) minSpecificity;
80  ez->missedCleavages = maxMissedCleavages;
81  }
82 
83  // pepXML doesn't map these elements
84  sip.massTable.clear();
85  sip.threshold.clear();
86  sip.databaseFilters.clear();
87  sip.databaseTranslation.reset();
88 
89  // pepXML doesn't map these attributes
90  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->name.clear();
91  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->version.clear();
92  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->releaseDate.clear();
93  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->databaseName.clear();
94 
95  // pepXML doesn't reliably store location or file format
// `location` is a reference, so the assignment below rewrites the stored value: the path is reduced
// to its filename with the extension replaced by an empty string.
96  string& location = mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->location;
97  location = BFS_STRING(bfs::path(location).replace_extension("").filename());
98  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->fileFormat = CVParam();
99 
// Same reduction for the search database location.
100  string& location2 = mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->location;
101  location2 = BFS_STRING(bfs::path(location2).replace_extension("").filename());
102 
103  // pepXML doesn't support protein sequences
// Besides clearing the sequence and length, the id is rebuilt as "DBSeq_" + accession.
104  BOOST_FOREACH(DBSequencePtr& dbSequence, mzid.sequenceCollection.dbSequences)
105  {
106  dbSequence->seq.clear();
107  dbSequence->length = 0;
108  dbSequence->id = "DBSeq_" + dbSequence->accession;
109  }
110 
111  // pepXML can only support one mass type (we pick the max mass in case one of them is 0)
// Both deltas of every modification end up equal to the larger of the two.
112  BOOST_FOREACH(PeptidePtr& peptide, mzid.sequenceCollection.peptides)
113  BOOST_FOREACH(ModificationPtr& mod, peptide->modification)
114  mod->monoisotopicMassDelta = mod->avgMassDelta = max(mod->monoisotopicMassDelta, mod->avgMassDelta);
115 
116  // pepXML doesn't support fragment metadata
117  mzid.dataCollection.analysisData.spectrumIdentificationList[0]->fragmentationTable.clear();
118 
// Visit every identification item of every result in the first spectrum identification list.
119  BOOST_FOREACH(SpectrumIdentificationResultPtr& sir, mzid.dataCollection.analysisData.spectrumIdentificationList[0]->spectrumIdentificationResult)
120  BOOST_FOREACH(SpectrumIdentificationItemPtr& sii, sir->spectrumIdentificationItem)
121  {
122  // pepXML doesn't support fragment metadata or mass tables
123  sii->fragmentation.clear();
124  sii->massTablePtr.reset();
125 
126  for (size_t i=0; i < sii->peptideEvidencePtr.size(); ++i)
127  {
128  PeptideEvidence& pe = *sii->peptideEvidencePtr[i];
129 
130  // pepXML does not store peptide start and end offsets
131  pe.start = pe.end = 0;
132 
133  // pepXML's alternative_proteins do not store prev/next AA or missed cleavages
// Only entries after the first (i > 0) get their pre/post residues replaced with '?'.
134  if (i > 0)
135  pe.pre = pe.post = '?';
136  }
137  }
138 
139  // pepXML doesn't have protein assembly
// NOTE(review): listing lines 140-141 are missing; the statements that follow the comment above are not shown.
142 }
143 
// Checks serialized pepXML text for expected substrings: the search engine name, five score names,
// and the spectrumNativeID attribute. Each check is a unit_assert on bal::contains.
// str: the pepXML document text to inspect (the caller in this file passes the serializer's output).
144 void testTranslation(const string& str)
145 {
146  // test that search engine name is written using preferred name
147  unit_assert(bal::contains(str, "search_engine=\"Mascot\""));
148 
149  // test that score names are written using preferred name
150  unit_assert(bal::contains(str, "name=\"ionscore\""));
151  unit_assert(bal::contains(str, "name=\"homologyscore\""));
152  unit_assert(bal::contains(str, "name=\"identityscore\""));
153  unit_assert(bal::contains(str, "name=\"expect\""));
154  unit_assert(bal::contains(str, "name=\"an extra score\""));
155 
156  // test that nativeID is preserved
157  unit_assert(bal::contains(str, "spectrumNativeID=\"controllerType=0 controllerNumber=1 scan=420\""));
158 }
159 
161 {
// Body of what the listing's index calls
// testSerializeReally(IdentData& mzid, const Serializer_pepXML::Config& config); the signature line is
// not present here. Round trip: write mzid as pepXML into a string stream, read it back into a second
// IdentData, and assert that Diff reports no difference between the two.
162  if (os_) *os_ << "begin testSerialize" << endl;
163 
164  Serializer_pepXML serializer(config);
165  ostringstream oss;
166  serializer.write(oss, mzid, "tiny.pepXML");
167 
168  if (os_) *os_ << "oss:\n" << oss.str() << endl;
// The substring checks are only run when config.readSpectrumQueries is set.
169  if (config.readSpectrumQueries)
170  testTranslation(oss.str());
171 
// The serialized text is handed to read() through a shared_ptr-owned istringstream.
172  shared_ptr<istringstream> iss(new istringstream(oss.str()));
173  IdentData mzid2;
174  serializer.read(iss, mzid2);
175 
// References in the freshly read document are resolved before it is compared with the original.
176  References::resolve(mzid2);
177 
178  Diff<IdentData, DiffConfig> diff(mzid, mzid2);
// In verbose mode, print the differences before asserting that there are none.
179  if (os_ && diff) *os_ << diff << endl;
180  unit_assert(!diff);
181 }
182 
184 {
// Body of what the listing's index calls testSerialize(); the signature line is not present here.
// It prepares an IdentData and then varies its enzyme list and contents for a series of scenarios.
// NOTE(review): listing lines 186, 188, 203, 218, 226, 239-240 and 242 are missing, so the calls that
// populate mzid and run each scenario are not visible; presumably they invoke
// initializeBasicSpectrumIdentification and testSerializeReally - confirm against the real file.
185  IdentData mzid;
187  stripUnmappedMetadata(mzid);
189 
190 
191  // test non-specific enzyme
// Replace the protocol's enzymes with a single non-specific enzyme named with the Trypsin/P CV term.
192  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
193  EnzymePtr noEnzyme(new Enzyme);
194  noEnzyme->id = "ENZ_1";
195  noEnzyme->cTermGain = "OH";
196  noEnzyme->nTermGain = "H";
197  noEnzyme->missedCleavages = 2;
198  noEnzyme->minDistance = 1;
199  noEnzyme->terminalSpecificity = proteome::Digestion::NonSpecific;
200  noEnzyme->siteRegexp = "(?<=[KR])";
201  noEnzyme->enzymeName.set(MS_Trypsin_P);
202  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(noEnzyme);
204 
205 
206  // test sense="N" enzymes
// Replace the enzymes again, this time with a fully specific Asp-N enzyme whose regex is a lookahead.
207  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
208  EnzymePtr aspN(new Enzyme);
209  aspN->id = "ENZ_1";
210  aspN->cTermGain = "OH";
211  aspN->nTermGain = "H";
212  aspN->missedCleavages = 2;
213  aspN->minDistance = 1;
214  aspN->terminalSpecificity = proteome::Digestion::FullySpecific;
215  aspN->siteRegexp = "(?=[BD])";
216  aspN->enzymeName.set(MS_Asp_N);
217  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(aspN);
219 
// aspN was already pushed into the enzyme list above; presumably EnzymePtr shares ownership, so these
// edits also change the entry held by mzid - confirm. The CV name is replaced by a "custom" user param.
220  aspN->missedCleavages = 4;
221  aspN->minDistance = 2;
222  aspN->terminalSpecificity = proteome::Digestion::SemiSpecific;
223  aspN->siteRegexp = "(?=[BND])";
224  aspN->enzymeName.clear();
225  aspN->enzymeName.userParams.push_back(UserParam("custom"));
227 
228 
229  // test with readSpectrumQueries == false
230 
231  // clear the original SequenceCollection
232  mzid.sequenceCollection.dbSequences.clear();
233  mzid.sequenceCollection.peptides.clear();
234  mzid.sequenceCollection.peptideEvidence.clear();
235 
236  // clear the original analysis data
237  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->spectrumIDFormat = CVParam();
238  mzid.analysisCollection.spectrumIdentification[0]->spectrumIdentificationListPtr.reset();
241 
243 }
244 
246 {
// Body of what the listing's index calls testPepXMLSpecificity(); the signature line is not present here.
// It calls pepXMLSpecificity(ez) repeatedly and checks the sense / cut / no_cut fields of the result.
// NOTE(review): many listing lines are missing (250, 257, 261, 268, 275, 282, 285, 291, 295-297, 301,
// 307, ...). The statements that configure `ez` before several calls, and probably further assertions,
// are therefore not visible - confirm against the real file.
247  PepXMLSpecificity result;
248  Enzyme ez;
249 
// The setup for this first case (listing line 250) is not shown.
251  result = pepXMLSpecificity(ez);
252  unit_assert_operator_equal("C", result.sense);
253  unit_assert_operator_equal("KR", result.cut);
254  unit_assert_operator_equal("P", result.no_cut);
255 
256  ez.enzymeName.clear();
258  result = pepXMLSpecificity(ez);
259  unit_assert_operator_equal("C", result.sense);
260  unit_assert_operator_equal("KR", result.cut);
262 
// Enzyme identified only by a user param named "trypsin/p".
263  ez.enzymeName.clear();
264  ez.enzymeName.userParams.push_back(UserParam("trypsin/p"));
265  result = pepXMLSpecificity(ez);
266  unit_assert_operator_equal("C", result.sense);
267  unit_assert_operator_equal("KR", result.cut);
269 
// Enzyme identified only by its name attribute.
270  ez.enzymeName.clear();
271  ez.name = "trypsin/p";
272  result = pepXMLSpecificity(ez);
273  unit_assert_operator_equal("C", result.sense);
274  unit_assert_operator_equal("KR", result.cut);
276 
// Enzyme identified by the Asp-N CV term: expected sense is "N" with cut residues "BD".
277  ez.name.clear();
278  ez.enzymeName.set(MS_Asp_N);
279  result = pepXMLSpecificity(ez);
280  unit_assert_operator_equal("N", result.sense);
281  unit_assert_operator_equal("BD", result.cut);
283 
// The setup lines for the next three cases (listing lines 285, 291, 297) are not shown.
284  ez.enzymeName.clear();
286  result = pepXMLSpecificity(ez);
287  unit_assert_operator_equal("C", result.sense);
288  unit_assert_operator_equal("KR", result.cut);
289  unit_assert_operator_equal("P", result.no_cut);
290 
292  result = pepXMLSpecificity(ez);
293  unit_assert_operator_equal("C", result.sense);
294  unit_assert_operator_equal("KR", result.cut);
296 
298  result = pepXMLSpecificity(ez);
299  unit_assert_operator_equal("N", result.sense);
300  unit_assert_operator_equal("BD", result.cut);
302 
303 
304  // REMEMBER: update the pepXMLSpecificity function when new CV enzymes are added
// NOTE(review): `cleavageAgent` is used below without a visible declaration; the missing listing line
// 307 is presumably a loop header over the predefined cleavage agents - confirm.
// An exception from pepXMLSpecificity is printed to cerr and recorded, then asserted on afterwards.
305  bool allCleavageAgentsHandled = true;
306  ez.siteRegexp.clear();
308  try
309  {
310  ez.enzymeName.clear();
311  ez.enzymeName.set(cleavageAgent);
312  result = pepXMLSpecificity(ez);
313  }
314  catch (exception& e)
315  {
316  cerr << e.what() << endl;
317  allCleavageAgentsHandled = false;
318  }
319  unit_assert(allCleavageAgentsHandled);
320 
321 
// The remaining cases set siteRegexp directly. Per the assertions: a lookbehind "(?<" form yields
// sense "C" and a lookahead-only form yields sense "N"; negated classes yield the complementary
// residue list "ABCDFGHIJKLMNOPSUVZ".
322  ez.siteRegexp = "(?<=[QWERTY])(?=[QWERTY])";
323  result = pepXMLSpecificity(ez);
324  unit_assert_operator_equal("C", result.sense);
325  unit_assert_operator_equal("QWERTY", result.cut);
326  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.no_cut);
327 
328  ez.siteRegexp = "(?<![QWERTY])(?![QWERTY])";
329  result = pepXMLSpecificity(ez);
330  unit_assert_operator_equal("C", result.sense);
331  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
332  unit_assert_operator_equal("QWERTY", result.no_cut);
333 
334  ez.siteRegexp = "(?<=[QWERTY])";
335  result = pepXMLSpecificity(ez);
336  unit_assert_operator_equal("C", result.sense);
337  unit_assert_operator_equal("QWERTY", result.cut);
339 
340  ez.siteRegexp = "(?=[QWERTY])";
341  result = pepXMLSpecificity(ez);
342  unit_assert_operator_equal("N", result.sense);
343  unit_assert_operator_equal("QWERTY", result.cut);
345 
346  ez.siteRegexp = "(?<![QWERTY])";
347  result = pepXMLSpecificity(ez);
348  unit_assert_operator_equal("C", result.sense);
349  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
351 
352  ez.siteRegexp = "(?![QWERTY])";
353  result = pepXMLSpecificity(ez);
354  unit_assert_operator_equal("N", result.sense);
355  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
357 }
358 
359 
361 {
// Body of what the listing's index calls testStripChargeFromConventionalSpectrumId(); the signature line
// is not present here. Each assertion gives the expected result first and the call second.
// Per these cases, a trailing ".<charge>" after two matching scan numbers is removed, while ids that
// end right after the scan numbers (or have only one numeric field) come back unchanged.
362  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123.2"));
363  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123.12"));
364  unit_assert_operator_equal("basename.2.2", stripChargeFromConventionalSpectrumId("basename.2.2.2"));
365  unit_assert_operator_equal("basename.ext.3.3", stripChargeFromConventionalSpectrumId("basename.ext.3.3.3"));
366  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123"));
367  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123"));
// A different id convention ("locus:...") also loses its final ".2".
368  unit_assert_operator_equal("locus:1.1.1.123", stripChargeFromConventionalSpectrumId("locus:1.1.1.123.2"));
369  unit_assert_operator_equal("basename.123", stripChargeFromConventionalSpectrumId("basename.123"));
// NOTE(review): listing line 370 is missing; there may be one more case that is not shown.
371 }
372 
373 
// Test entry point. Passing "-v" as the first argument points os_ at cout, enabling diagnostic output.
// Exceptions escaping the tests are reported through TEST_FAILED, with a fixed message for non-std ones.
// NOTE(review): listing lines 381-382 and 394 are missing; presumably they call the other test functions
// and TEST_EPILOG (both appear in the listing's index) - confirm against the real file.
374 int main(int argc, char** argv)
375 {
376  TEST_PROLOG(argc, argv)
377 
378  try
379  {
380  if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
383  testSerialize();
384  }
385  catch (exception& e)
386  {
387  TEST_FAILED(e.what())
388  }
389  catch (...)
390  {
391  TEST_FAILED("Caught unknown exception.")
392  }
393 
395 }
AnalysisProtocolCollection analysisProtocolCollection
Definition: IdentData.hpp:1018
int operator()(const EnzymePtr &x) const
MS_Asp_N
Asp-N:
Definition: cv.hpp:4157
void testStripChargeFromConventionalSpectrumId()
std::vector< PeptideEvidencePtr > peptideEvidence
Definition: IdentData.hpp:656
Implementation of PeptideEvidenceType from the mzIdentML schema.
Definition: IdentData.hpp:626
AnalysisCollection analysisCollection
Definition: IdentData.hpp:1016
void stripUnmappedMetadata(IdentData &mzid)
Implementation of EnzymeType from the mzIdentML schema.
Definition: IdentData.hpp:408
MS_Trypsin
Trypsin:
Definition: cv.hpp:4019
PWIZ_API_DECL proteome::Peptide peptide(const Peptide &peptide)
creates a proteome::Peptide from an identdata::Peptide
void read(boost::shared_ptr< std::istream > is, IdentData &mzid, const pwiz::util::IterationListenerRegistry *=0) const
read in MZIDData object from a pepXML istream
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition: diff_std.hpp:142
std::vector< SpectrumIdentificationListPtr > spectrumIdentificationList
Definition: IdentData.hpp:962
#define TEST_EPILOG
Definition: unit.hpp:166
void testPepXMLSpecificity()
NonSpecific
Definition: Digestion.hpp:120
std::vector< SpectrumIdentificationProtocolPtr > spectrumIdentificationProtocol
Definition: IdentData.hpp:911
PWIZ_API_DECL void initializeBasicSpectrumIdentification(IdentData &mzid)
PWIZ_API_DECL PepXMLSpecificity pepXMLSpecificity(const Enzyme &ez)
converts an identdata::Enzyme into a pepXML cut/no_cut/sense tuple
void write(std::ostream &os, const IdentData &mzid, const std::string &filepath, const pwiz::util::IterationListenerRegistry *=0) const
write MZIDData object to ostream as pepXML
MZIDData <-> pepXML stream serialization.
void clear()
clears the collections
DataCollection dataCollection
Definition: IdentData.hpp:1020
std::vector< EnzymePtr > enzymes
Definition: IdentData.hpp:435
void diff(const string &filename1, const string &filename2)
Serializer_pepXML configuration.
Implementation of ProviderType from the mzIdentML schema.
Definition: IdentData.hpp:234
std::vector< PeptidePtr > peptides
Definition: IdentData.hpp:655
Uncontrolled user parameters (essentially allowing free text). Before using these, one should verify whether there is an appropriate CV term available, and if so, use the CV term instead.
Definition: ParamTypes.hpp:185
void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)
void testTranslation(const string &str)
void testSerialize()
SemiSpecific
either or both termini must match digestion motif(s)
Definition: Digestion.hpp:121
std::vector< UserParam > userParams
a collection of uncontrolled user terms
Definition: ParamTypes.hpp:250
#define unit_assert_operator_equal(expected, actual)
Definition: unit.hpp:86
AnalysisSampleCollection analysisSampleCollection
Definition: IdentData.hpp:1012
Implementation of the MzIdentMLType from the mzIdentML schema.
Definition: IdentData.hpp:993
MS_Trypsin_P
Trypsin/P:
Definition: cv.hpp:4184
boost::shared_ptr< Peptide > PeptidePtr
Definition: TraData.hpp:236
ParamContainer enzymeName
Definition: IdentData.hpp:420
#define BFS_STRING(p)
Definition: Filesystem.hpp:53
static const std::set< CVID > & getCleavageAgents()
returns the set of predefined cleavage agents defined in the PSI-MS CV
std::vector< SourceFilePtr > sourceFile
Definition: IdentData.hpp:946
int operator()(const EnzymePtr &x) const
std::vector< SpectrumIdentificationPtr > spectrumIdentification
Definition: IdentData.hpp:897
PWIZ_API_DECL void resolve(ContactRole &cr, IdentData &mzid)
std::vector< BibliographicReferencePtr > bibliographicReference
Definition: IdentData.hpp:1022
Implementation of SpectrumIdentificationProtocolType from the mzIdentML schema.
Definition: IdentData.hpp:545
std::vector< ContactPtr > auditCollection
Definition: IdentData.hpp:1010
#define TEST_FAILED(x)
Definition: unit.hpp:160
std::vector< DBSequencePtr > dbSequences
Definition: IdentData.hpp:654
ostream * os_
SequenceCollection sequenceCollection
Definition: IdentData.hpp:1014
void set(CVID cvid, const std::string &value="", CVID units=CVID_Unknown)
set/add a CVParam (not recursive)
#define TEST_PROLOG(argc, argv)
Definition: unit.hpp:158
PWIZ_API_DECL CVID cleavageAgent(const Enzyme &ez)
returns a cleavage agent CVID for an identdata::Enzyme
KernelTraitsBase< Kernel >::space_type::abscissa_type x
std::vector< AnalysisSoftwarePtr > analysisSoftwareList
Definition: IdentData.hpp:1006
ProteinDetectionListPtr proteinDetectionListPtr
Definition: IdentData.hpp:963
#define unit_assert(x)
Definition: unit.hpp:82
static const std::string & getCleavageAgentRegex(CVID agentCvid)
returns the official PSI Perl regular expression defining the places in a polypeptide or protein that...
PWIZ_API_DECL std::string stripChargeFromConventionalSpectrumId(const std::string &id)
strips charge state from known conventions of the pepXML spectrum attribute; used to find a unique id...
represents a tag-value pair, where the tag comes from the controlled vocabulary
Definition: ParamTypes.hpp:44
int main(int argc, char **argv)
Implementation of ProteinDetectionType from the mzIdentML schema.
Definition: IdentData.hpp:872