File: sequence.cpp

package info (click to toggle)
massxpert 2.3.6-1squeeze1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 20,736 kB
  • ctags: 3,541
  • sloc: cpp: 44,108; xml: 7,381; sh: 604; makefile: 108; ansic: 7
file content (824 lines) | stat: -rw-r--r-- 21,121 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
/* massXpert - the true massist's program.
   --------------------------------------
   Copyright(C) 2006,2007 Filippo Rusconi

   http://www.massxpert.org/massXpert

   This file is part of the massXpert project.

   The massxpert project is the successor to the "GNU polyxmass"
   project that is an official GNU project package(see
   www.gnu.org). The massXpert project is not endorsed by the GNU
   project, although it is released ---in its entirety--- under the
   GNU General Public License. A huge part of the code in massXpert
   is actually a C++ rewrite of code in GNU polyxmass. As such
   massXpert was started at the Centre National de la Recherche
   Scientifique(FRANCE), that granted me the formal authorization to
   publish it under this Free Software License.

   This software is free software; you can redistribute it and/or
   modify it under the terms of the GNU  General Public
   License version 3, as published by the Free Software Foundation.
   

   This software is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this software; if not, write to the

   Free Software Foundation, Inc.,

   51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/


/////////////////////// Local includes
#include "sequence.hpp"
#include "polChemDef.hpp"


namespace massXpert
{

  //! Builds a sequence from its textual representation.
  /*! The text is stored verbatim as the string of concatenated monomer
      codes. It is neither checked nor converted into a list of
      monomers by this constructor.

    \param text sequence in the form of concatenated monomer codes.
  */
  Sequence::Sequence(const QString &text)
    : m_monomerText(text)
  {
    // Only the text member is initialized explicitly; the list of
    // monomers is left default-constructed.
  }


  //! Destroys the sequence.
  /*! Every monomer still referenced by the list of monomers is taken
    out of the list, front first, and deleted.
  */
  Sequence::~Sequence()
  {
    while(m_monomerList.size() > 0)
      {
        const Monomer *first = m_monomerList.takeFirst();
        delete first;
      }
  }


  //! Sets the sequence text.
  /*!  \param text Monomer sequence as a string of monomer codes.
   */
  void
  Sequence::setMonomerText(const QString &text)
  {
    m_monomerText = text;
  }


  //! Appends text to the end of the sequence text.
  /*! An empty \p text leaves the sequence text unchanged. The list of
    monomers is not touched by this function.

    \param text Monomer sequence as a string of monomer codes.
   */
  void
  Sequence::appendMonomerText(const QString &text)
  {
    if (!text.isEmpty())
      m_monomerText.append(text);
  }


  //! Gives access to the sequence text.
  /*! The returned pointer refers to the member string itself, not to
    a copy.

    \return Pointer to the sequence as a string of monomer codes.
   */
  const QString *
  Sequence::monomerText()
  {
    const QString *textPtr = &m_monomerText;

    return textPtr;
  }


  //! Gives read-only access to the list of monomers.
  /*! The returned reference refers to the member list itself, not to
    a copy.

    \return Constant reference to the list of monomers.
   */
  const QList<const Monomer *> &
  Sequence::monomerList() const
  {
    return this->m_monomerList;
  }


  //! Gives modifiable access to the list of monomers.
  /*! The returned pointer refers to the member list itself, so any
    change made through it changes this sequence.

    \return Pointer to the list of monomers.
   */
  QList<const Monomer *> *
  Sequence::monomerListPtr()
  {
    QList<const Monomer *> *listPtr = &m_monomerList;

    return listPtr;
  }


  //! Returns the size of the sequence.
  /*! The size is counted on the list of monomers, not on the sequence
    text: text that has not been converted into monomers does not
    count.
  
    \return The number of items in the list of monomers.
  */
  int 
  Sequence::size() const
  {
    const int monomerCount = m_monomerList.size();

    return monomerCount;
  }


  //! Removes all spaces, carriage returns and linefeeds.
  void 
  Sequence::unspacifyMonomerText()
  {
    // Removal of all spaces, carriage returns and linefeeds:
  
    for (int iter = m_monomerText.length() -1; iter >= 0 ; --iter)
      {
	QChar curChar = m_monomerText.at(iter);
      
	QChar::Category category = curChar.category();

	if(category == QChar::Separator_Space)
	  m_monomerText.remove(iter, 1);

	else if (curChar == '\n')
	  m_monomerText.remove(iter, 1);
	else if (curChar == '\r')
	  m_monomerText.remove(iter, 1);
      }
  }


  //! Creates the string representation of the sequence.
  /*! The string representation of the sequence is created by iterating
    in the list of monomers and concatenating into one single string the
    monomer code of each iterated monomer. The generated string is
    stored in the member datum.

    \return The number of codes concatenated into the string.

    \sa makeMonomerList()
  */
  int 
  Sequence::makeMonomerText()
  {
    int iter = 0;
  
    m_monomerText.clear();
  
    for (iter = 0; iter < m_monomerList.size(); ++iter)
      m_monomerText.append(m_monomerList.at(iter)->code());

    return iter;
  }


  //! Returns the text of a region of the sequence.
  /*! The text is built from the list of monomers (not from the member
    text) by concatenating the code of each monomer in the region.

    The two bounds may be given in either order. A lower bound less
    than 0 is replaced by 0; an upper bound that is less than 0 or
    not less than size() is replaced by size() - 1.

    \param start index of one end of the region (inclusive).

    \param end index of the other end of the region (inclusive).

    \param withModif if true, the code of a modified monomer is
    followed by the name of each of its modifications, each enclosed
    in angle brackets, like "Code<Name>".

    \return a newly allocated string that the caller must delete. The
    string is empty if the list of monomers is empty.
  */
  QString *
  Sequence::monomerText(int start, int end, bool withModif) const
  {
    QString *p_text = new QString();

    if (size() == 0)
      return p_text;

    // Accept the bounds in either order.
    int localStart = start;
    int localEnd = end;

    if (start > end)
      {
        localStart = end;
        localEnd = start;
      }

    if (localStart < 0)
      localStart = 0;

    if (localEnd < 0 || localEnd >= size())
      localEnd = size() - 1;

    for (int iter = localStart; iter <= localEnd; ++iter)
      {
        const Monomer *monomer = m_monomerList.at(iter);

        // The code is always emitted exactly once per monomer, so a
        // monomer can never inherit the text of the previous one.
        p_text->append(monomer->code());

        if (!withModif || !monomer->isModified())
          continue;

        // Emit every modification, not only the last one of the list.
        for (int jter = 0; jter < monomer->modifList()->size(); ++jter)
          {
            p_text->append('<');
            p_text->append(monomer->modifList()->at(jter)->name());
            p_text->append('>');
          }
      }

    return p_text;
  }


  //! Returns the text of a set of regions of the sequence.
  /*! The text of each region of \p coordinateList is obtained from
    monomerText(int, int, bool) and appended, in list order, to the
    returned string. A linefeed is appended after the last region,
    even if the list is empty.

    \param coordinateList regions of the sequence to convert to text.

    \param withModif passed as is to monomerText(int, int, bool).

    \param delimitedRegions if true, each region is written on its own
    line as "Region <positions>: <text>"; otherwise the region texts
    are concatenated without any separator.

    \return a newly allocated string that the caller must delete.
  */
  QString *
  Sequence::monomerText(const CoordinateList &coordinateList, 
                        bool withModif, bool delimitedRegions) const
  {
    QString *p_text = new QString();

    const int regionCount = coordinateList.size();

    for (int idx = 0; idx < regionCount; ++idx)
      {
        Coordinates *region = coordinateList.at(idx);

        // The per-region text is heap-allocated by the callee; it is
        // released at the bottom of this iteration.
        QString *regionText = monomerText(region->start(),
                                          region->end(),
                                          withModif);

        if (!delimitedRegions)
          {
            p_text->append(*regionText);
          }
        else
          {
            p_text->append(QString("Region %1: %2\n")
                           .arg(region->positionsAsText())
                           .arg(*regionText));
          }

        delete regionText;
      }

    p_text->append(QString("\n"));

    return p_text;
  }
  

  //! Creates a list of monomers from the string sequence.
  /*! The sequence text is first stripped of its spaces and line
    breaks(see unspacifyMonomerText()) and is then parsed one code at
    a time with nextCode(). For each code parsed, a new monomer is
    allocated and Monomer::isCodeInList() is called with the list of
    reference monomers of \p polChemDef. If that call does not return
    -1, the monomer is appended to the list of monomers of this
    sequence; otherwise the monomer is deleted and the failure is
    handled as described for \p errorList.

    \param polChemDef Polymer chemistry definition to be used to craft
    the fully qualified monomers using their code in the text
    representation of the sequence. If 0, the function returns -1
    right away.

    \param reset If true, the monomers currently in the list are
    deleted and the list is emptied before parsing. If false, the
    newly created monomers are appended to the existing list.

    \param errorList list of int where to store the indices where errors
    are encountered. If non-0, the list must be empty upon entry(this
    is asserted) and parsing goes on after each error. If 0, no
    storing of the indices occurs and parsing stops at the first
    error.

    \return The size of the monomer list or -1 if an error occurred.
  */
  int 
  Sequence::makeMonomerList(const PolChemDef *polChemDef, bool reset,
			     QList<int> *errorList)
  {
    if (!polChemDef)
      return -1;
    
    int index = 0;
    int ret = -1;
    QString err;
    QString code;
  
    // If error indices are to be stored, the list MUST be empty.
    if (errorList)
      Q_ASSERT(errorList->size() == 0);
  
    if (reset)
      {
	// The list holds the monomers by pointer: delete each of them.
	while(!m_monomerList.isEmpty())
	  delete m_monomerList.takeFirst();
      }
  
    unspacifyMonomerText();

//     qDebug() << __FILE__ << __LINE__
// 	     << "Sequence:" << m_monomerText;
      
    // Prime the loop below with the first code of the sequence.
    ret = nextCode(&code, &index, &err, polChemDef->codeLength());
  
    const QList<Monomer*> &refList = polChemDef->monomerList();

    while(1)
      {
	if(ret < 0)
	  {
	    // There was an error in the parsed code. Store the index.
	    if (errorList)
	      {
		errorList->append(index);
		// Step over the position reported by nextCode() and
		// resume parsing from there.
		++index;
		ret = nextCode(&code, &index, &err, polChemDef->codeLength());
		continue;
	      }
	    else
	      {
		// No error list: stop at the first error. ret is still
		// negative, so -1 gets returned after the loop.
		break;
	      }
	  }
          
	// nextCode() returned an empty code: nothing is left to parse.
	if(ret == 0)
	  break;
      
	Monomer *monomer = new Monomer(polChemDef, "NOT_SET");

	// NOTE(review): isCodeInList() presumably fills 'monomer' with
	// the data of the reference monomer having code 'code' --
	// confirm against the Monomer class.
	if(Monomer::isCodeInList(code, refList, monomer) == -1)
	  {
	    delete monomer;

	    if (errorList)
	      {
		errorList->append(index);
		++index;
		ret = nextCode(&code, &index, &err, polChemDef->codeLength());
		continue;
	      }
	    else
	      {
		// Monomers appended so far are left in the list.
		return -1;
	      }
	  }
      
	// The list of monomers now holds the pointer to the monomer.
	m_monomerList.append(monomer);

// 	qDebug() << __FILE__ << __LINE__
// 		 << "New monomer:" << monomer->name();
	      
	// nextCode() left index on the last character of the code just
	// parsed: move to the first character of the next code.
	++index;
	
// 	qDebug() <<  __FILE__ << __LINE__ << "index:" << index;
	
	ret = nextCode(&code, &index, &err, polChemDef->codeLength());
      }
    // End of 
    // while(1)
  
    // Any error recorded along the way makes the whole call fail, even
    // though parsing went on after it.
    if (errorList)
      {
	if(errorList->size())
	  return -1;
      }
  
    if (ret == -1)
      return -1;
  
    return m_monomerList.size();
  }


  //! Returns the next code occurring in the sequence.
  /*! Returns the code occurring in the sequence text starting at index
    \p index. A code is made of a first character that is a letter and
    is not lowercase, followed by lowercase letters, with at most \p
    codeLength characters in total.
  
    \param code Location where to store the code to return to caller.
    It is cleared upon entry and is only set when no error occurs.
  
    \param index Index at which parsing for a new code in the sequence has
    to start. Upon successful return, it holds the index of the last
    character parsed for the code: the caller has to increment it by
    one before asking for the next code. When a non-letter character
    is encountered, it holds the index of that character. When the
    first character is lowercase, it is left unchanged.
  
    \param err Location where to store the erroneous characters that
    might be encountered during parsing of the sequence. It is
    cleared upon entry and upon successful return.
  
    \param codeLength Number of authorized characters to qualify a
    monomer code.

    \return the length(in characters) of the returned code or -1 if an
    error occurred. The length is 0 when no character could be
    parsed, which is the case when \p index is at or past the end of
    the sequence text.
  */
  int
  Sequence::nextCode(QString *code, int *index, QString *err, int codeLength)
  {
    QString newCode;
    int iter = 0;
  
    // We get a sequence of monomer codes(like "LysArgGlu" for example)
    // and we have to return the next code starting from *index. Note
    // that the sequence must not contain invalid characters. The
    // invalid characters might be placed in err for further scrutiny by
    // the caller.

    // Returns the count of actually parsed characters in the string
    // newCode(copied to 'code' param). If an error occurs -1 is
    // returned and the faulty character is copied in 'err'. 'index' is
    // updated with the index of the last valid character parsed for
    // current code.

    Q_ASSERT(code);
    Q_ASSERT(index);
    Q_ASSERT(err);

    code->clear();
    err->clear();
  
    int length = m_monomerText.length();
  
    // 'iter' is the offset, relative to *index, of the character
    // currently looked at.
    while(1)
      {
	if(iter >= codeLength)
	  {
	    // Because we have progressed farther than authorized by
	    // the number of characters allowed in the monomer codes
	    // of this polymer chemistry definition, we decrement iter
	    // and break the loop... Later in this function, we'll set
	    // the proper index in the sequence where next parsing run
	    // should occurs (the calling function will increment
	    // *index by one).

	    --iter;
	    break;
	  }
      
	// End of the sequence text reached: return what was parsed so
	// far(possibly nothing).
	if(iter + *index >= length)
	  break;
      
	QChar curChar = m_monomerText.at(iter + *index);
      
	if(!curChar.isLetter())
	  {
// 	    qDebug() << __FILE__ << __LINE__
// 		     << "The character is not a letter:"
// 		     << curChar;
	  
	    *err = curChar;

	    // The non-Letter character might be '/', which would be
	    // perfectly fine, as we use it to symbolize the actual
	    // cleavage site. Which means that we will continue
	    // parsing the rest of the string : we have to give the
	    // current position back to the caller in the *index
	    // variable for the next call to this function to start at
	    // next character (not falling back to '/', which would
	    // make us enter in an infinite loop).

	    *index = *index + iter;

	    return -1;
	  }
      
	bool isLower =(curChar.category() == QChar::Letter_Lowercase);
      
	if(iter == 0)
	  {
	    if (isLower)
	      {
		qDebug() << __FILE__ << __LINE__ 
			 << "First character of monomer code might not be"
			 << "lower case; sequence is"
			 << m_monomerText.toAscii();
		
		*err = curChar;
	      
		// *index is left untouched here: it already is the
		// index of the faulty character.
		return -1;
	      }
	    else
	      {
		// Good, first char is uppercase.
		newCode += curChar;
	      }
	  }
	else //(iter != 0)
	  {
	    // We are not in our first iteration. So either the current
	    // character is lowercase and we are just continuing to
	    // iterate into a multi-char monomer code, or the current
	    // character is uppercase, in which case we are starting to
	    // iterate in a new monomer code.

	    if (isLower)
	      newCode += curChar;
	    else
	      {
		// Decrement iter, because this round was for nothing:
		// we had "invaded" the next monomer code in sequence,
		// which we must not do.

		--iter;
		break;
	      }
	  }

	++iter;
      }
  
    // We finished parsing at most codeLength characters out of
    // 'm_monomerText', so we have a valid code in the 'code' variable. We
    // can also compute a new index position in the sequence and return
    // the number of characters that we effectively parsed. Note that
    // the caller will be responsible for incrementing the 'index' value
    // by one character unit so as not to reparse the last characters of
    // the sent 'code' object.

    *index = *index + iter;
    *code = newCode;
    err->clear();
  
    return code->length();
  }
  


  //! Searches this sequence forward for the sequence \p motif.
  /*! The search is performed on the lists of monomers and starts at
    index \p *index of this sequence. Monomers are compared by code
    only: operator == is not used because it would also compare the
    modifications of the monomers, which would not work here.

    The list of monomers of \p motif is rebuilt from its text with
    makeMonomerList() during the call.

    \param motif sequence to search for. Cannot be 0.

    \param polChemDef polymer chemistry definition used to rebuild the
    list of monomers of \p motif. Cannot be 0.

    \param index location of the index at which the search starts.
    Cannot be 0. If a match is found, it is set to the index of the
    first monomer of the match; otherwise it is left unchanged.

    \return true if a match was found, false if no match could be
    found or if an error was encountered(\p *index out of the bounds
    of this sequence, or failure to make the list of monomers of \p
    motif). Because the function returns a bool, errors cannot be
    told apart from the absence of a match.
  */
  bool
  Sequence::findForwardMotif(Sequence *motif, 
                             const PolChemDef *polChemDef,
                             int *index)
  {
    Q_ASSERT(motif);
    Q_ASSERT(polChemDef);
    Q_ASSERT(index);

    // Errors must be reported as false: returning -1 from a function
    // that returns bool would be converted to true, that is, to "a
    // match was found".
    if (*index < 0 || *index >= size())
      return false;

    // If motif's length is 0, then nothing to search for, return with
    // 'index' unmodified.
    // NOTE(review): the size is that of the list of monomers of the
    // motif, as it stands before that list is rebuilt below.
    if (!motif->size())
      return false;

    // Simple optimization: if the motif cannot fit between index and
    // the end of the sequence, return right away. A motif ending
    // exactly on the last monomer does fit, hence '>' and not '>='.
    if (*index + motif->size() > size())
      return false;

    // First, make a monomerList.
    if (motif->makeMonomerList(polChemDef) == -1)
      return false;

    // The list was just rebuilt: read its size again and make sure
    // there is at least one monomer to compare with.
    const int motifSize = motif->size();

    if (!motifSize)
      return false;

    // Compare *this sequence with the one in 'motif', trying each
    // position from 'index' on as the anchoring point of the motif. We
    // are in forward mode, so as soon as the remaining monomers are
    // fewer than the monomers of the motif, there is no chance to find
    // the motif downstream and the search can stop. This also
    // guarantees that the inner loop never steps out of the borders of
    // either sequence.
    for (int iter = *index; iter + motifSize <= size(); ++iter)
      {
        bool matched = true;

        for (int kter = 0; kter < motifSize; ++kter)
          {
            const Monomer *monomer = at(iter + kter);
            const Monomer *motifMonomer = motif->at(kter);

            if (monomer->code() != motifMonomer->code())
              {
                matched = false;
                break;
              }
          }

        if (matched)
          {
            // The match indeed occurred.
            *index = iter;
            return true;
          }
      }

    // No match could be achieved.
    return false;
  }



  //! Returns the monomer at index \p index.
  /*! The index is checked against the boundaries of the list of
    monomers and the program is terminated with qFatal() if it is out
    of bounds.
  
    \param index index of the monomer to return in the list of
    monomers. Must comply with the boundaries of the monomer list(that is
    be >= 0 and < list.size()).

    \return a pointer to the monomer.
  */
  const Monomer *
  Sequence::at(int index) const
  {
    if (index < 0)
      qFatal("%s@%d -- Index cannot be less than 0.",
             __FILE__, __LINE__);

    // The last valid index is size() - 1, so an index equal to the
    // size is out of bounds too.
    if (index >= m_monomerList.size())
      qFatal("%s@%d -- Index cannot be greater than polymer size.",
             __FILE__, __LINE__);

    return m_monomerList.at(index);
  }


  //! Returns the index of \p monomer in the list of monomers.
  /*! The monomer is searched for by pointer value, from the beginning
    of the list.

    \param monomer pointer to look for in the list of monomers.

    \return the index of the first item of the list equal to \p
    monomer, or -1 if there is none.
  */
  int
  Sequence::monomerIndex(const Monomer *monomer)
  {
    return m_monomerList.indexOf(monomer);
  }


  //! Inserts the monomer at index \p index.
  /*! Assertions ensure that \p index is not less than 0 and not greater
    than sequence size as reported by size().

    This means that a monomer can only be inserted from a sequence if
    the sequence is at least in the form of a list of monomers.
  
    \param monomer dynamically allocated monomer. Assertion insures that
    this pointer is non-0.
  
    \param index Index of monomer to insert.
  
    \return Always true.
  */
  bool 
  Sequence::insertMonomerAt(const Monomer *monomer, int index)
  {
    Q_ASSERT(monomer);
    Q_ASSERT(index > -1 && index <= size());
    
    m_monomerList.insert(index, monomer);
  
    return true;
  }


  //! Called by removeMonomerAt() before a monomer is removed.
  /*! This implementation does nothing with \p monomer.

    \param monomer monomer about to be removed.

    \return Always true, which lets removeMonomerAt() proceed.
  */
  bool
  Sequence::prepareMonomerRemoval(const Monomer *monomer)
  {
    // The parameter is deliberately not used here.
    static_cast<void>(monomer);

    return true;
  }


  //! Removes the monomer at index \p index.
  /*! Assertions ensure that \p index is not less than 0 and not equal
    or greater than sequence size as reported by size().  

    This means that a monomer can only be removed from a sequence if the
    sequence is at least in the form of a list of monomers.
  
    \param index Index of monomer to remove.
  
    \return Always true.
  */
  bool 
  Sequence::removeMonomerAt(int index)
  {
    Q_ASSERT(index > -1);
    Q_ASSERT(index < size());
  
    const Monomer *monomer = at(index);
  
    if (!prepareMonomerRemoval(monomer))
      return false;
  
    m_monomerList.removeAt(index);
  
    delete monomer;
  
    return true;
  }


  //! Validates the sequence text against \p polChemDef.
  /*! The validation consists in calling makeMonomerList() with \p
    polChemDef, which means that the list of monomers of this
    sequence is modified by the call.

    \param polChemDef polymer chemistry definition to validate
    against. An assertion ensures that this pointer is non-0.

    \return true if makeMonomerList() did not report an error, false
    otherwise.
  */
  bool
  Sequence::validate(const PolChemDef *polChemDef)
  {
    Q_ASSERT(polChemDef);

    const int monomerCount = makeMonomerList(polChemDef);

    return monomerCount > -1;
  }

  //! Computes a checksum of a region of the sequence.
  /*! The text of the region is obtained from monomerText(int, int,
    bool), converted to UTF-8 and passed to qChecksum().

    \param startIdx index of one end of the region, passed as is to
    monomerText().

    \param endIdx index of the other end of the region, passed as is
    to monomerText().

    \param withModifs if true, the modifications of the monomers are
    part of the text that is checksummed.

    \return the checksum, or 0 if the list of monomers is empty.
  */
  quint16 
  Sequence::checksum(int startIdx, int endIdx, bool withModifs) const
  {
    if (!size())
      return 0;

    // monomerText() returns a heap-allocated string: take the bytes
    // needed and release it right away so that it does not leak.
    QString *text = monomerText(startIdx, endIdx, withModifs);

    const QByteArray bytes = text->toUtf8();

    delete text;

    quint16 checksum = qChecksum(bytes.data(), bytes.size());

//     qDebug() << __FILE__ << __LINE__
// 	     << "checksum:" << checksum;

    return checksum;
  }
  

} // namespace massXpert