ProteoWizard
Serializer_pepXML_Test.cpp
Go to the documentation of this file.
1 //
2 // $Id: Serializer_pepXML_Test.cpp 5156 2013-11-14 23:07:56Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2010 Vanderbilt University - Nashville, TN 37232
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
21 
22 
23 #include "Serializer_pepXML.hpp"
24 #include "Diff.hpp"
25 #include "References.hpp"
26 #include "examples.hpp"
31 #include "TextWriter.hpp"
32 #include "boost/range/adaptor/transformed.hpp"
33 #include "boost/range/algorithm/max_element.hpp"
34 #include "boost/range/algorithm/min_element.hpp"
35 #include <cstring>
36 
37 
38 using namespace pwiz::identdata;
39 using namespace pwiz::identdata::examples;
40 using namespace pwiz::util;
41 namespace proteome = pwiz::proteome;
42 
// Optional verbose-output stream. Null by default; main() points it at cout when
// the first command-line argument is "-v". Every use is guarded by `if (os_)`.
43 ostream* os_ = 0;
44 
// Unary functor mapping an EnzymePtr to its terminalSpecificity, returned as int.
// NOTE(review): the declaring line (listing line 45) is missing from this extract;
// presumably `struct EnzymePtr_specificity`, the name instantiated with
// boost::adaptors::transformed where the minimum specificity is computed.
46 {
47  typedef int result_type; // presumably the result type the transformed adaptor looks up -- confirm
48  int operator()(const EnzymePtr& x) const {return x->terminalSpecificity;}
49 };
50 
// Unary functor mapping an EnzymePtr to its missedCleavages count.
// NOTE(review): the declaring line (listing line 51) is missing from this extract;
// presumably `struct EnzymePtr_missedCleavages`, the name instantiated with
// boost::adaptors::transformed where the maximum missed-cleavage count is computed.
52 {
53  typedef int result_type; // presumably the result type the transformed adaptor looks up -- confirm
54  int operator()(const EnzymePtr& x) const {return x->missedCleavages;}
55 };
56 
// Clears or normalizes, in place, the parts of `mzid` that the comments below
// describe as not representable in pepXML.
// NOTE(review): the declaring line (listing line 57) is missing from this extract;
// the tooltip index gives it as `void stripUnmappedMetadata(IdentData &mzid)`.
// Presumably this exists so that a write/read round trip can be diffed against the
// original object -- confirm against the caller.
58 {
59  mzid.bibliographicReference.clear();
60  mzid.analysisSampleCollection.samples.clear();
61  mzid.auditCollection.clear();
62  mzid.provider = Provider();
63  mzid.dataCollection.inputs.sourceFile.clear();
64 
// drop per-software details (URI, customizations, contact role)
65  BOOST_FOREACH(AnalysisSoftwarePtr& as, mzid.analysisSoftwareList)
66  {
67  as->URI.clear();
68  as->customizations.clear();
69  as->contactRolePtr.reset();
70  }
71 
// NOTE(review): listing line 72 is missing from this extract. It must declare `sip`,
// used below; presumably a reference to
// *mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0] -- confirm.
73 
74  // pepXML only provides a single min_number_termini and max_num_internal_cleavages for all enzymes
// so every enzyme is rewritten to the minimum specificity and the maximum missed
// cleavages found across the list
75  int minSpecificity = *boost::range::min_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_specificity()));
76  int maxMissedCleavages = *boost::range::max_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_missedCleavages()));
77  BOOST_FOREACH(const EnzymePtr& ez, sip.enzymes.enzymes)
78  {
79  ez->terminalSpecificity = (proteome::Digestion::Specificity) minSpecificity;
80  ez->missedCleavages = maxMissedCleavages;
81  }
82 
83  // pepXML doesn't map these elements
84  sip.massTable.clear();
85  sip.threshold.clear();
86  sip.databaseFilters.clear();
87  sip.databaseTranslation.reset();
88 
89  // pepXML doesn't map these attributes
90  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->name.clear();
91  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->version.clear();
92  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->releaseDate.clear();
93  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->databaseName.clear();
94 
95  // pepXML doesn't reliably store location or file format
// so the input-spectra location is reduced to its filename with the extension removed
96  string& location = mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->location;
97  location = BFS_STRING(bfs::path(location).replace_extension("").filename());
98  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->fileFormat = CVParam();
99 
// same filename-without-extension reduction for the search database location
100  string& location2 = mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->location;
101  location2 = BFS_STRING(bfs::path(location2).replace_extension("").filename());
102 
103  // pepXML doesn't support protein sequences
// the id is rebuilt from the accession as "DBSeq_<accession>"
104  BOOST_FOREACH(DBSequencePtr& dbSequence, mzid.sequenceCollection.dbSequences)
105  {
106  dbSequence->seq.clear();
107  dbSequence->length = 0;
108  dbSequence->id = "DBSeq_" + dbSequence->accession;
109  }
110 
111  // pepXML can only support one mass type (we pick the max mass in case one of them is 0)
112  BOOST_FOREACH(PeptidePtr& peptide, mzid.sequenceCollection.peptides)
113  BOOST_FOREACH(ModificationPtr& mod, peptide->modification)
114  mod->monoisotopicMassDelta = mod->avgMassDelta = max(mod->monoisotopicMassDelta, mod->avgMassDelta);
115 
116  // pepXML doesn't support fragment metadata
117  mzid.dataCollection.analysisData.spectrumIdentificationList[0]->fragmentationTable.clear();
118 
// visit every identification item of every result in the first identification list
119  BOOST_FOREACH(SpectrumIdentificationResultPtr& sir, mzid.dataCollection.analysisData.spectrumIdentificationList[0]->spectrumIdentificationResult)
120  BOOST_FOREACH(SpectrumIdentificationItemPtr& sii, sir->spectrumIdentificationItem)
121  {
122  // pepXML doesn't support fragment metadata or mass tables
123  sii->fragmentation.clear();
124  sii->massTablePtr.reset();
125 
126  for (size_t i=0; i < sii->peptideEvidencePtr.size(); ++i)
127  {
128  PeptideEvidence& pe = *sii->peptideEvidencePtr[i];
129 
130  // pepXML does not store peptide start and end offsets
131  pe.start = pe.end = 0;
132 
133  // pepXML's alternative_proteins do not store prev/next AA or missed cleavages
// only the first evidence (i == 0) keeps its pre/post residues
134  if (i > 0)
135  pe.pre = pe.post = '?';
136  }
137  }
138 
139  // pepXML doesn't have protein assembly
// NOTE(review): listing lines 140-141, which presumably act on the comment above,
// are missing from this extract.
142 
143  // pepXML expects the residues to be '.' or an amino acid list
144  BOOST_FOREACH(SearchModificationPtr& sm, mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->modificationParams)
145  if (sm->residues.empty())
146  sm->residues.push_back('.');
147 }
148 
// Asserts that the serialized pepXML text `str` contains the expected translated
// names: the search engine attribute, five score names, and the spectrum nativeID.
// Each check is a plain substring match, so attribute order and surrounding markup
// are not verified.
149 void testTranslation(const string& str)
150 {
151  // test that search engine name is written using preferred name
152  unit_assert(bal::contains(str, "search_engine=\"Mascot\""));
153 
154  // test that score names are written using preferred name
155  unit_assert(bal::contains(str, "name=\"ionscore\""));
156  unit_assert(bal::contains(str, "name=\"homologyscore\""));
157  unit_assert(bal::contains(str, "name=\"identityscore\""));
158  unit_assert(bal::contains(str, "name=\"expect\""));
159  unit_assert(bal::contains(str, "name=\"an extra score\""));
160 
161  // test that nativeID is preserved
162  unit_assert(bal::contains(str, "spectrumNativeID=\"controllerType=0 controllerNumber=1 scan=420\""));
163 }
164 
// Round-trips `mzid` through pepXML in memory and asserts that nothing changed:
// write to a string stream, read back into a fresh IdentData, resolve references,
// then diff against the original.
// NOTE(review): the declaring line (listing line 165) is missing from this extract;
// the tooltip index gives it as
// `void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)`.
166 {
167  if (os_) *os_ << "begin testSerialize" << endl;
168 
169  Serializer_pepXML serializer(config);
170  ostringstream oss;
171  serializer.write(oss, mzid, "tiny.pepXML");
172 
173  if (os_) *os_ << "oss:\n" << oss.str() << endl;
// the name-translation checks run only when spectrum queries are being read
174  if (config.readSpectrumQueries)
175  testTranslation(oss.str());
176 
// read() takes a shared_ptr to the stream, so the serialized text is wrapped in one
177  shared_ptr<istringstream> iss(new istringstream(oss.str()));
178  IdentData mzid2;
179  serializer.read(iss, mzid2);
180 
181  References::resolve(mzid2);
182 
// any difference is printed in verbose mode and then fails the test
183  Diff<IdentData, DiffConfig> diff(mzid, mzid2);
184  if (os_ && diff) *os_ << diff << endl;
185  unit_assert(!diff);
186 }
187 
// Builds an IdentData, strips the metadata pepXML cannot carry, then sets up a
// series of enzyme and data configurations for round-trip testing.
// NOTE(review): the declaring line (listing line 188, `void testSerialize()` per the
// tooltip index) and listing lines 191, 193, 208, 223, 231, 243-245 and 247 are
// missing from this extract. Presumably they populate `mzid` (the tooltip index
// lists initializeBasicSpectrumIdentification) and call testSerializeReally after
// each configuration -- confirm against the full source.
189 {
190  IdentData mzid;
192  stripUnmappedMetadata(mzid);
194 
195 
196  // test non-specific enzyme
// replaces the enzyme list with one Trypsin/P entry marked NonSpecific
197  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
198  EnzymePtr noEnzyme(new Enzyme);
199  noEnzyme->id = "ENZ_1";
200  noEnzyme->cTermGain = "OH";
201  noEnzyme->nTermGain = "H";
202  noEnzyme->missedCleavages = 2;
203  noEnzyme->minDistance = 1;
204  noEnzyme->terminalSpecificity = proteome::Digestion::NonSpecific;
205  noEnzyme->siteRegexp = "(?<=[KR])";
206  noEnzyme->enzymeName.set(MS_Trypsin_P);
207  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(noEnzyme);
209 
210 
211  // test sense="N" enzymes
// replaces the enzyme list with one fully specific Asp-N entry
212  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
213  EnzymePtr aspN(new Enzyme);
214  aspN->id = "ENZ_1";
215  aspN->cTermGain = "OH";
216  aspN->nTermGain = "H";
217  aspN->missedCleavages = 2;
218  aspN->minDistance = 1;
219  aspN->terminalSpecificity = proteome::Digestion::FullySpecific;
220  aspN->siteRegexp = "(?=[BD])";
221  aspN->enzymeName.set(MS_Asp_N);
222  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(aspN);
224 
// aspN is a shared pointer already stored in mzid, so these edits change the enzyme
// in place: a semi-specific custom enzyme named only by a UserParam
225  aspN->missedCleavages = 4;
226  aspN->minDistance = 2;
227  aspN->terminalSpecificity = proteome::Digestion::SemiSpecific;
228  aspN->siteRegexp = "(?=[BND])";
229  aspN->enzymeName.clear();
230  aspN->enzymeName.userParams.push_back(UserParam("custom"));
232 
233 
234  // test with readSpectrumQueries == false
235 
236  // clear the original SequenceCollection
237  mzid.sequenceCollection.dbSequences.clear();
238  mzid.sequenceCollection.peptides.clear();
239  mzid.sequenceCollection.peptideEvidence.clear();
240 
241  // clear the original analysis data
242  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->spectrumIDFormat = CVParam();
243  mzid.analysisCollection.spectrumIdentification[0]->spectrumIdentificationListPtr.reset();
246 
248 }
249 
// Checks the sense/cut/no_cut values that pepXMLSpecificity() derives from an Enzyme
// described by a CV term, a UserParam, its name, or its siteRegexp.
// NOTE(review): the declaring line (listing line 250, `void testPepXMLSpecificity()`
// per the tooltip index) and several setup and assertion lines (listing lines 255,
// 262, 266, 273, 280, 287, 290, 296, 300, 302, 306, 312, 343, 349, 355, 361) are
// missing from this extract. Where a group below calls pepXMLSpecificity without
// visible setup, the missing line presumably configured `ez` -- confirm against the
// full source.
251 {
252  PepXMLSpecificity result;
253  Enzyme ez;
254 
// NOTE(review): the setup for this first case (listing line 255) is missing
256  result = pepXMLSpecificity(ez);
257  unit_assert_operator_equal("C", result.sense);
258  unit_assert_operator_equal("KR", result.cut);
259  unit_assert_operator_equal("P", result.no_cut);
260 
261  ez.enzymeName.clear();
263  result = pepXMLSpecificity(ez);
264  unit_assert_operator_equal("C", result.sense);
265  unit_assert_operator_equal("KR", result.cut);
267 
// enzyme identified only by a UserParam named "trypsin/p"
268  ez.enzymeName.clear();
269  ez.enzymeName.userParams.push_back(UserParam("trypsin/p"));
270  result = pepXMLSpecificity(ez);
271  unit_assert_operator_equal("C", result.sense);
272  unit_assert_operator_equal("KR", result.cut);
274 
// enzyme identified only by its name attribute
275  ez.enzymeName.clear();
276  ez.name = "trypsin/p";
277  result = pepXMLSpecificity(ez);
278  unit_assert_operator_equal("C", result.sense);
279  unit_assert_operator_equal("KR", result.cut);
281 
// enzyme identified by the Asp-N CV term: N-terminal sense, cuts at B or D
282  ez.name.clear();
283  ez.enzymeName.set(MS_Asp_N);
284  result = pepXMLSpecificity(ez);
285  unit_assert_operator_equal("N", result.sense);
286  unit_assert_operator_equal("BD", result.cut);
288 
// NOTE(review): the setup lines for the next three cases (listing lines 290, 296,
// 302) are missing; only the expected results are visible
289  ez.enzymeName.clear();
291  result = pepXMLSpecificity(ez);
292  unit_assert_operator_equal("C", result.sense);
293  unit_assert_operator_equal("KR", result.cut);
294  unit_assert_operator_equal("P", result.no_cut);
295 
297  result = pepXMLSpecificity(ez);
298  unit_assert_operator_equal("C", result.sense);
299  unit_assert_operator_equal("KR", result.cut);
301 
303  result = pepXMLSpecificity(ez);
304  unit_assert_operator_equal("N", result.sense);
305  unit_assert_operator_equal("BD", result.cut);
307 
308 
309  // REMEMBER: update the pepXMLSpecificity function when new CV enzymes are added
310  bool allCleavageAgentsHandled = true;
311  ez.siteRegexp.clear();
// NOTE(review): the loop header (listing line 312) is missing. It must declare
// `cleavageAgent`; presumably it iterates the set returned by getCleavageAgents() --
// confirm. Any exception from pepXMLSpecificity is reported on cerr and fails the test.
313  try
314  {
315  ez.enzymeName.clear();
316  ez.enzymeName.set(cleavageAgent);
317  result = pepXMLSpecificity(ez);
318  }
319  catch (exception& e)
320  {
321  cerr << e.what() << endl;
322  allCleavageAgentsHandled = false;
323  }
324  unit_assert(allCleavageAgentsHandled);
325 
326 
// From here on the enzyme is described by siteRegexp.
// positive lookbehind + positive lookahead: sense C; the lookbehind set is `cut`
// and no_cut is the complementary residue list
327  ez.siteRegexp = "(?<=[QWERTY])(?=[QWERTY])";
328  result = pepXMLSpecificity(ez);
329  unit_assert_operator_equal("C", result.sense);
330  unit_assert_operator_equal("QWERTY", result.cut);
331  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.no_cut);
332 
// negated forms of both: cut and no_cut swap
333  ez.siteRegexp = "(?<![QWERTY])(?![QWERTY])";
334  result = pepXMLSpecificity(ez);
335  unit_assert_operator_equal("C", result.sense);
336  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
337  unit_assert_operator_equal("QWERTY", result.no_cut);
338 
// lookbehind only: sense C
339  ez.siteRegexp = "(?<=[QWERTY])";
340  result = pepXMLSpecificity(ez);
341  unit_assert_operator_equal("C", result.sense);
342  unit_assert_operator_equal("QWERTY", result.cut);
344 
// lookahead only: sense N
345  ez.siteRegexp = "(?=[QWERTY])";
346  result = pepXMLSpecificity(ez);
347  unit_assert_operator_equal("N", result.sense);
348  unit_assert_operator_equal("QWERTY", result.cut);
350 
// negative lookbehind only: sense C, cut is the complementary residue list
351  ez.siteRegexp = "(?<![QWERTY])";
352  result = pepXMLSpecificity(ez);
353  unit_assert_operator_equal("C", result.sense);
354  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
356 
// negative lookahead only: sense N, cut is the complementary residue list
357  ez.siteRegexp = "(?![QWERTY])";
358  result = pepXMLSpecificity(ez);
359  unit_assert_operator_equal("N", result.sense);
360  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
362 }
363 
364 
// Checks stripChargeFromConventionalSpectrumId() on conventional pepXML spectrum ids:
// a trailing ".<charge>" component is removed, and ids without one come back unchanged.
// NOTE(review): the declaring line (listing line 365,
// `void testStripChargeFromConventionalSpectrumId()` per the tooltip index) and
// listing line 375 are missing from this extract.
366 {
// ids with a trailing charge, including cases where the charge equals the scan number
367  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123.2"));
368  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123.12"));
369  unit_assert_operator_equal("basename.2.2", stripChargeFromConventionalSpectrumId("basename.2.2.2"));
370  unit_assert_operator_equal("basename.ext.3.3", stripChargeFromConventionalSpectrumId("basename.ext.3.3.3"));
// ids with no charge component are returned as-is
371  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123"));
372  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123"));
// other id shapes: a "locus:" prefixed id and a single-number id
373  unit_assert_operator_equal("locus:1.1.1.123", stripChargeFromConventionalSpectrumId("locus:1.1.1.123.2"));
374  unit_assert_operator_equal("basename.123", stripChargeFromConventionalSpectrumId("basename.123"));
376 }
377 
378 
// Test driver: enables verbose output when the first argument is "-v", runs the
// tests, and reports any exception through TEST_FAILED.
// NOTE(review): listing lines 386, 387 and 399 are missing from this extract;
// presumably the calls to testPepXMLSpecificity() and
// testStripChargeFromConventionalSpectrumId(), and the closing TEST_EPILOG, all of
// which appear in the tooltip index -- confirm against the full source.
379 int main(int argc, char** argv)
380 {
381  TEST_PROLOG(argc, argv)
382 
383  try
384  {
// "-v" routes diagnostic output to cout through the global os_
385  if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
388  testSerialize();
389  }
390  catch (exception& e)
391  {
392  TEST_FAILED(e.what())
393  }
394  catch (...)
395  {
396  TEST_FAILED("Caught unknown exception.")
397  }
398 
400 }
AnalysisProtocolCollection analysisProtocolCollection
Definition: IdentData.hpp:1018
int operator()(const EnzymePtr &x) const
MS_Asp_N
Asp-N: Endoproteinase Asp-N.
Definition: cv.hpp:4164
void testStripChargeFromConventionalSpectrumId()
std::vector< PeptideEvidencePtr > peptideEvidence
Definition: IdentData.hpp:656
Implementation of PeptideEvidenceType from the mzIdentML schema.
Definition: IdentData.hpp:626
AnalysisCollection analysisCollection
Definition: IdentData.hpp:1016
void stripUnmappedMetadata(IdentData &mzid)
Implementation of EnzymeType from the mzIdentML schema.
Definition: IdentData.hpp:408
MS_Trypsin
Trypsin: Enzyme trypsin.
Definition: cv.hpp:4026
PWIZ_API_DECL proteome::Peptide peptide(const Peptide &peptide)
creates a proteome::Peptide from an identdata::Peptide
void read(boost::shared_ptr< std::istream > is, IdentData &mzid, const pwiz::util::IterationListenerRegistry *=0) const
read in an IdentData object from a pepXML istream
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition: diff_std.hpp:142
std::vector< SpectrumIdentificationListPtr > spectrumIdentificationList
Definition: IdentData.hpp:962
#define TEST_EPILOG
Definition: unit.hpp:173
void testPepXMLSpecificity()
NonSpecific
Definition: Digestion.hpp:120
std::vector< SpectrumIdentificationProtocolPtr > spectrumIdentificationProtocol
Definition: IdentData.hpp:911
PWIZ_API_DECL void initializeBasicSpectrumIdentification(IdentData &mzid)
PWIZ_API_DECL PepXMLSpecificity pepXMLSpecificity(const Enzyme &ez)
converts an identdata::Enzyme into a pepXML cut/no_cut/sense tuple
void write(std::ostream &os, const IdentData &mzid, const std::string &filepath, const pwiz::util::IterationListenerRegistry *=0) const
write an IdentData object to an ostream as pepXML
IdentData <-> pepXML stream serialization.
void clear()
clears the collections
DataCollection dataCollection
Definition: IdentData.hpp:1020
std::vector< EnzymePtr > enzymes
Definition: IdentData.hpp:435
void diff(const string &filename1, const string &filename2)
Serializer_pepXML configuration.
Implementation of ProviderType from the mzIdentML schema.
Definition: IdentData.hpp:234
std::vector< PeptidePtr > peptides
Definition: IdentData.hpp:655
Uncontrolled user parameters (essentially allowing free text). Before using these, one should verify whether there is an appropriate CV term available, and if so, use the CV term instead.
Definition: ParamTypes.hpp:185
void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)
void testTranslation(const string &str)
void testSerialize()
SemiSpecific
either or both termini must match digestion motif(s)
Definition: Digestion.hpp:121
std::vector< UserParam > userParams
a collection of uncontrolled user terms
Definition: ParamTypes.hpp:253
#define unit_assert_operator_equal(expected, actual)
Definition: unit.hpp:89
AnalysisSampleCollection analysisSampleCollection
Definition: IdentData.hpp:1012
Implementation of the MzIdentMLType from the mzIdentML schema.
Definition: IdentData.hpp:993
MS_Trypsin_P
Trypsin/P: Cleavage agent Trypsin/P.
Definition: cv.hpp:4191
boost::shared_ptr< Peptide > PeptidePtr
Definition: TraData.hpp:236
ParamContainer enzymeName
Definition: IdentData.hpp:420
#define BFS_STRING(p)
Definition: Filesystem.hpp:53
static const std::set< CVID > & getCleavageAgents()
returns the set of predefined cleavage agents defined in the PSI-MS CV
std::vector< SourceFilePtr > sourceFile
Definition: IdentData.hpp:946
int operator()(const EnzymePtr &x) const
std::vector< SpectrumIdentificationPtr > spectrumIdentification
Definition: IdentData.hpp:897
PWIZ_API_DECL void resolve(ContactRole &cr, IdentData &mzid)
std::vector< BibliographicReferencePtr > bibliographicReference
Definition: IdentData.hpp:1022
Implementation of SpectrumIdentificationProtocolType from the mzIdentML schema.
Definition: IdentData.hpp:545
std::vector< ContactPtr > auditCollection
Definition: IdentData.hpp:1010
#define TEST_FAILED(x)
Definition: unit.hpp:167
std::vector< DBSequencePtr > dbSequences
Definition: IdentData.hpp:654
ostream * os_
SequenceCollection sequenceCollection
Definition: IdentData.hpp:1014
void set(CVID cvid, const std::string &value="", CVID units=CVID_Unknown)
set/add a CVParam (not recursive)
#define TEST_PROLOG(argc, argv)
Definition: unit.hpp:165
PWIZ_API_DECL CVID cleavageAgent(const Enzyme &ez)
returns a cleavage agent CVID for an identdata::Enzyme
KernelTraitsBase< Kernel >::space_type::abscissa_type x
std::vector< AnalysisSoftwarePtr > analysisSoftwareList
Definition: IdentData.hpp:1006
ProteinDetectionListPtr proteinDetectionListPtr
Definition: IdentData.hpp:963
#define unit_assert(x)
Definition: unit.hpp:85
static const std::string & getCleavageAgentRegex(CVID agentCvid)
returns the official PSI Perl regular expression defining the places in a polypeptide or protein that...
PWIZ_API_DECL std::string stripChargeFromConventionalSpectrumId(const std::string &id)
strips charge state from known conventions of the pepXML spectrum attribute; used to find a unique id...
represents a tag-value pair, where the tag comes from the controlled vocabulary
Definition: ParamTypes.hpp:44
int main(int argc, char **argv)
Implementation of ProteinDetectionType from the mzIdentML schema.
Definition: IdentData.hpp:872