File: tgTigSizeAnalysis.C

package info (click to toggle)
canu 1.7.1+dfsg-1~bpo9+1
  • links: PTS, VCS
  • area: main
  • in suites: stretch-backports
  • size: 7,680 kB
  • sloc: cpp: 66,708; perl: 13,682; ansic: 4,020; makefile: 627; sh: 472; python: 39
file content (140 lines) | stat: -rw-r--r-- 4,053 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' (http://kmer.sourceforge.net)
 *  both originally distributed by Applera Corporation under the GNU General
 *  Public License, version 2.
 *
 *  Canu branched from Celera Assembler at its revision 4587.
 *  Canu branched from the kmer project at its revision 1994.
 *
 *  This file is derived from:
 *
 *    src/AS_CNS/MultiAlignSizeAnalysis.C
 *
 *  Modifications by:
 *
 *    Brian P. Walenz from 2012-MAR-26 to 2013-OCT-24
 *      are Copyright 2012-2013 J. Craig Venter Institute, and
 *      are subject to the GNU General Public License version 2
 *
 *    Brian P. Walenz on 2014-DEC-22
 *      are Copyright 2014 Battelle National Biodefense Institute, and
 *      are subject to the BSD 3-Clause License
 *
 *    Brian P. Walenz beginning on 2015-OCT-30
 *      are a 'United States Government Work', and
 *      are released in the public domain
 *
 *  File 'README.licenses' in the root directory of this distribution contains
 *  full conditions and disclaimers for each license.
 */

#include "tgTigSizeAnalysis.H"

#include <math.h>

#include <map>
#include <set>
#include <vector>
#include <algorithm>

using namespace std;


tgTigSizeAnalysis::tgTigSizeAnalysis(uint64 genomeSize_) {
  genomeSize = genomeSize_;
}

tgTigSizeAnalysis::~tgTigSizeAnalysis() {
}

void
tgTigSizeAnalysis::evaluateTig(tgTig *tig, bool useGapped) {

  //  Try to get the ungapped length.
  //  But revert to the gapped length if that doesn't exist.  This should
  //  only occur for pre-consensus unitigs.

  uint32  length = tig->length(useGapped);

  if (tig->_suggestRepeat)
    lenSuggestRepeat.push_back(length);

  if (tig->_suggestCircular)
    lenSuggestCircular.push_back(length);

  switch (tig->_class) {
    case tgTig_unassembled:   lenUnassembled.push_back(length);  break;
    case tgTig_bubble:        lenBubble.push_back(length);       break;
    case tgTig_contig:        lenContig.push_back(length);       break;
    default:                                                     break;
  }
}

void
tgTigSizeAnalysis::finalize(void) {

  sort(lenSuggestRepeat.begin(),   lenSuggestRepeat.end(),   greater<uint32>());
  sort(lenSuggestCircular.begin(), lenSuggestCircular.end(), greater<uint32>());

  sort(lenUnassembled.begin(), lenUnassembled.end(), greater<uint32>());
  sort(lenBubble.begin(),      lenBubble.end(),      greater<uint32>());
  sort(lenContig.begin(),      lenContig.end(),      greater<uint32>());
}

void
tgTigSizeAnalysis::printSummary(FILE *out, char *description, vector<uint32> &data) {
  uint64  cnt = data.size();
  uint64  sum = 0;
  uint64  tot = 0;
  uint64  nnn = 10;
  uint64  siz = 0;

  //  Duplicates AS_BAT_Instrumentation.C reportN50().

  if (cnt == 0)
    return;

  for (uint64 i=0; i<cnt; i++)
    tot += data[i];

  if (genomeSize > 0)
    siz = genomeSize;
  else
    siz = tot;

  for (uint64 i=0; i<cnt; i++) {
    sum += data[i];

    while (siz * nnn / 100 < sum) {
      fprintf(out, "%s ng%-3" F_U64P " %10" F_U32P " bp   lg%-3" F_U64P " %6" F_U64P "   sum %10" F_U64P " bp\n",
              description,
              nnn, data[i],
              nnn, i+1,
              sum);

      nnn += 10;
    }
  }

  fprintf(out, "%s sum %10" F_U64P " (genomeSize " F_U64 ")\n", description, tot, genomeSize);
  fprintf(out, "%s num %10" F_U64P "\n", description, cnt);
  fprintf(out, "%s ave %10" F_U64P "\n", description, tot / cnt);
}


void
tgTigSizeAnalysis::printSummary(FILE *out) {
  printSummary(out, "lenSuggestRepeat",   lenSuggestRepeat);
  printSummary(out, "lenSuggestCircular", lenSuggestCircular);

  printSummary(out, "lenUnassembled",     lenUnassembled);
  printSummary(out, "lenBubble",          lenBubble);
  printSummary(out, "lenContig",          lenContig);
}