libpappsomspp
Library for mass spectrometry
enzyme.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * Copyright (c) 2015 Olivier Langella <Olivier.Langella@moulon.inra.fr>.
3  *
4  * This file is part of the PAPPSOms++ library.
5  *
6  * PAPPSOms++ is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * PAPPSOms++ is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PAPPSOms++. If not, see <http://www.gnu.org/licenses/>.
18  *
19  * Contributors:
20  * Olivier Langella <Olivier.Langella@moulon.inra.fr> - initial API and
21  *implementation
22  ******************************************************************************/
23 
24 #include "enzyme.h"
25 #include <QStringList>
26 #include <QDebug>
27 #include "../exception/exceptionnotpossible.h"
28 //#include <iostream>
29 
30 namespace pappso
31 {
33 {
34  m_recognitionSite.setPattern("([KR])([^P])");
35  m_miscleavage = 0;
36 
37 
38  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41 
42  char vv2[] = {'N', 'D'};
43  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44 
45  char vv3[] = {'Q', 'E'};
46  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47 }
48 
49 Enzyme::Enzyme(const QString &recognition_site)
50 {
51  m_recognitionSite.setPattern(recognition_site);
52  m_miscleavage = 0;
53 
54 
55  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58 
59  char vv2[] = {'N', 'D'};
60  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61 
62  char vv3[] = {'Q', 'E'};
63  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64 }
65 
67 {
68 }
69 
70 void
71 Enzyme::setMiscleavage(unsigned int miscleavage)
72 {
73  m_miscleavage = miscleavage;
74 }
75 unsigned int
77 {
78  return m_miscleavage;
79 }
80 void
81 Enzyme::setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
82 {
83  m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84 }
85 
86 void
87 Enzyme::eat(std::int8_t sequence_database_id,
88  const ProteinSp &protein_sp,
89  bool is_decoy,
90  EnzymeProductInterface &enzyme_product) const
91 {
92  /*
93  * for aa in self.aa_to_cut:
94  seq = seq.replace(aa, aa + ' ')
95  seq_stack = []
96  for s in seq.strip().split(' '):
97  seq_stack.append(s)
98  if len(seq_stack) > self.misscleavage + 1:
99  seq_stack.pop(0)
100  s2 = ""
101  for s_miss in seq_stack[::-1]:
102  s2 = s_miss + s2
103  yield s2
104  */
105  qDebug() << "Enzyme::eat begin ";
106  const QString sequence = protein_sp.get()->getSequence();
107  qDebug() << sequence;
108  QStringList peptide_list;
109  int pos = 0;
110  int peptide_start = 0;
111  int peptide_size = sequence.size();
112  while((pos = m_recognitionSite.indexIn(sequence, pos)) != -1)
113  {
114  peptide_size = pos + m_recognitionSite.cap(1).length() - peptide_start;
115  // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
116  // peptide_size=" << peptide_size << " " <<
117  // sequence.mid(peptide_start,peptide_size);
118  if(peptide_size > 0)
119  {
120  peptide_list.append(sequence.mid(peptide_start, peptide_size));
121  }
122  peptide_start += peptide_size;
123  pos = peptide_start; // all peptides MUST be consecutive
124  }
125  peptide_size = sequence.size() - peptide_start;
126  if(peptide_size > 0)
127  {
128  peptide_list.append(sequence.mid(peptide_start, peptide_size));
129  }
130 
131  unsigned int start = 1;
132  bool is_nter = true;
133  foreach(const QString &peptide, peptide_list)
134  {
135  // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
136  // peptide, start,is_nter,0, false);
137  sanityCheck(enzyme_product,
138  sequence_database_id,
139  protein_sp,
140  is_decoy,
141  peptide,
142  start,
143  is_nter,
144  0,
145  false);
146  is_nter = false;
147  start += peptide.size();
148  }
149 
150  unsigned int miscleavage_i = 0;
151  while(miscleavage_i < m_miscleavage)
152  {
153  miscleavage_i++;
154  qDebug() << "miscleavage_i=" << miscleavage_i;
155  int chunk_number = miscleavage_i + 1;
156  unsigned int start = 1;
157  bool is_nter = true;
158 
159  for(auto i = 0; i < peptide_list.size(); ++i)
160  {
161  qDebug() << "start=" << start;
162  QStringList peptide_mis_list;
163  for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size());
164  j++)
165  {
166  peptide_mis_list << peptide_list.at(i + j);
167  }
168  if(peptide_mis_list.size() == chunk_number)
169  {
170  // enzyme_product.setPeptide(sequence_database_id,
171  // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
172  // miscleavage_i, false);
173  sanityCheck(enzyme_product,
174  sequence_database_id,
175  protein_sp,
176  is_decoy,
177  peptide_mis_list.join(""),
178  start,
179  is_nter,
180  miscleavage_i,
181  false);
182  }
183  is_nter = false;
184  start += peptide_list.at(i).size();
185  }
186  }
187 }
188 
189 void
190 Enzyme::replaceWildcards(std::vector<std::string> *p_peptide_variant_list) const
191 {
192  std::string new_peptide = p_peptide_variant_list->at(0);
193  qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
194  std::vector<std::string> old_peptide_variant_list;
195  old_peptide_variant_list.assign(p_peptide_variant_list->begin(),
196  p_peptide_variant_list->end());
197 
198 
199  for(char wildcard : {'X', 'B', 'Z'})
200  {
201 
202  std::size_t position = new_peptide.find(wildcard);
203  if(position == std::string::npos)
204  {
205  continue;
206  }
207  else
208  {
209  p_peptide_variant_list->clear();
210  /*
211  new_peptide[position] = 'A';
212  p_peptide_variant_list->push_back(new_peptide);
213  break;
214  */
215 
216  const std::vector<char> *p_x_replace_wildcard = nullptr;
217  if(wildcard == 'X')
218  {
219  p_x_replace_wildcard = &m_wildCardX;
220  }
221  else if(wildcard == 'B')
222  {
223  p_x_replace_wildcard = &m_wildCardB;
224  }
225  else if(wildcard == 'Z')
226  {
227  p_x_replace_wildcard = &m_wildCardZ;
228  }
229 
230  if(p_x_replace_wildcard != nullptr)
231  {
232  for(std::string orig_peptide : old_peptide_variant_list)
233  {
234  for(char replace : *p_x_replace_wildcard)
235  {
236  orig_peptide[position] = replace;
237  p_peptide_variant_list->push_back(orig_peptide);
238  }
239  }
240  }
241  else
242  {
243  throw ExceptionNotPossible(
244  QObject::tr("x_replace_wildcard is empty"));
245  }
246  // new_peptide[position] = 'A';
247  // p_peptide_variant_list->push_back(new_peptide);
248  // p_peptide_variant_list->resize(1);
249  // std::cerr << "Enzyme::replaceWildcards begin
250  // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
251  // <<
252  // endl;
253  break;
254  }
255  }
256  std::vector<std::string>().swap(
257  old_peptide_variant_list); // clear old_peptide_variant_list reallocating
258 
259 
260  qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
261 }
262 
263 void
264 Enzyme::setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
265 {
266  m_takeOnlyFirstWildcard = take_only_first_wildcard;
267 }
268 
269 
270 void
272  std::int8_t sequence_database_id,
273  const ProteinSp &protein_sp,
274  bool is_decoy,
275  const PeptideStr &peptide,
276  unsigned int start,
277  bool is_nter,
278  unsigned int missed_cleavage_number,
279  bool semi_enzyme) const
280 {
281  if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
282  {
283 
284  std::vector<std::string> peptide_variant_list;
285  peptide_variant_list.push_back(peptide.toStdString());
286 
287  while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
288  (peptide_variant_list.at(0).find('B') != std::string::npos) ||
289  (peptide_variant_list.at(0).find('Z') != std::string::npos))
290  {
291  replaceWildcards(&peptide_variant_list);
292  if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
293  {
294  peptide_variant_list.resize(m_maxPeptideVariantListSize);
295  peptide_variant_list.shrink_to_fit();
296  }
297  }
298 
299  // peptide_variant_list.resize(2);
301  {
302  enzyme_product.setPeptide(sequence_database_id,
303  protein_sp,
304  is_decoy,
305  QString(peptide_variant_list.at(0).c_str()),
306  start,
307  is_nter,
308  missed_cleavage_number,
309  semi_enzyme);
310  }
311  else
312  {
313  std::string peptide_variant = peptide_variant_list.back();
314  while(peptide_variant_list.size() > 0)
315  {
316  enzyme_product.setPeptide(sequence_database_id,
317  protein_sp,
318  is_decoy,
319  QString(peptide_variant.c_str()),
320  start,
321  is_nter,
322  missed_cleavage_number,
323  semi_enzyme);
324  peptide_variant_list.pop_back();
325  if(peptide_variant_list.size() > 0)
326  {
327  peptide_variant = peptide_variant_list.back();
328  }
329  }
330  }
331  std::vector<std::string>().swap(
332  peptide_variant_list); // clear peptide_variant_list reallocating
333  }
334  else
335  {
336  enzyme_product.setPeptide(sequence_database_id,
337  protein_sp,
338  is_decoy,
339  peptide,
340  start,
341  is_nter,
342  missed_cleavage_number,
343  semi_enzyme);
344  }
345 }
346 
347 const QRegExp &
349 {
350  return m_recognitionSite;
351 }
352 } // namespace pappso
virtual void setPeptide(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme)=0
function to give the products of a protein digestion by an enzyme
std::size_t m_maxPeptideVariantListSize
Definition: enzyme.h:93
unsigned int getMiscleavage() const
get the maximum number of missed cleavages allowed in the digestion
Definition: enzyme.cpp:76
Enzyme()
build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"
Definition: enzyme.cpp:32
void setMiscleavage(unsigned int miscleavage)
set the maximum number of missed cleavages allowed in the digestion
Definition: enzyme.cpp:71
std::vector< char > m_wildCardB
Definition: enzyme.h:97
std::vector< char > m_wildCardZ
Definition: enzyme.h:98
std::vector< char > m_wildCardX
Definition: enzyme.h:96
QRegExp m_recognitionSite
example with a kinase == [K,R]
Definition: enzyme.h:89
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition: enzyme.cpp:271
const QRegExp & getQRegExpRecognitionSite() const
Definition: enzyme.cpp:348
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition: enzyme.cpp:190
void setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
if true, only the first peptide variant obtained from wildcard (X, B, Z) expansion is reported
Definition: enzyme.cpp:264
void eat(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
digest a protein into enzyme products
Definition: enzyme.cpp:87
unsigned int m_miscleavage
Definition: enzyme.h:90
bool m_takeOnlyFirstWildcard
Definition: enzyme.h:91
void setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
if there are wildcards in the protein sequence : restrict the number of possible peptide sequences
Definition: enzyme.cpp:81
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition: aa.cpp:39
QString PeptideStr
A type definition for PeptideStr.
Definition: types.h:43
std::shared_ptr< const Protein > ProteinSp
shared pointer on a Protein object
Definition: protein.h:43