File: genomeScanFastaFiles.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (92 lines) | stat: -rw-r--r-- 4,171 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#include "genomeScanFastaFiles.h"
#include "ErrorWarning.h"
#include "SequenceFuns.h"

uint genomeScanFastaFiles (Parameters &P, char* G, bool flagRun, Genome &mapGen) {//scans fasta files. flagRun=false: check and find full size, flaRun=true: collect all the data

    uint N=0;//total number of bases in the genome, including chr "spacers"
    if (!flagRun && mapGen.chrLength.size()>0) {//previous chr records exist
       mapGen.chrStart.pop_back();//remove last record, it will be recorded again
       N =  mapGen.chrStart.back()+mapGen.chrLength.back();
       mapGen.chrLength.pop_back();//remove last record, it will be recorded again
    };

    ifstream fileIn;
    for (uint ii=0;ii<mapGen.pGe.gFastaFiles.size();ii++) {//all the input files
        fileIn.open(mapGen.pGe.gFastaFiles.at(ii).c_str());
        if ( !fileIn.good() ) {//
            ostringstream errOut;
            errOut << "EXITING because of INPUT ERROR: could not open genomeFastaFile: " <<mapGen.pGe.gFastaFiles.at(ii) <<"\n";
            exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
        };
        char cc=fileIn.peek();
        if ( !fileIn.good() ) {//
            ostringstream errOut;
            errOut << "EXITING because of INPUT ERROR: could not read from genomeFastaFile: " <<mapGen.pGe.gFastaFiles.at(ii) <<"\n";
            exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
        };
        if (cc!='>') {
            ostringstream errOut;
            errOut << "EXITING because of INPUT ERROR: the file format of the genomeFastaFile: " <<mapGen.pGe.gFastaFiles.at(ii) << " is not fasta:";
            errOut << " the first character is '" <<cc<<"' ("<< (cc+0) << "), not '>'.\n";
            errOut << " Solution: check formatting of the fasta file. Make sure the file is uncompressed (unzipped).\n";
            exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
        };         
        while(!fileIn.eof()) {//read each file until eof
            string lineIn (4096,'.');
            getline(fileIn,lineIn);
            if (lineIn[0]=='>') {//new chromosome
                if (!flagRun) {
                    istringstream lineInStream (lineIn);
                    lineInStream.ignore(1,' ');
                    string chrName1;
                    lineInStream >> chrName1;
                    mapGen.chrName.push_back(chrName1);
                };

                if (!flagRun && mapGen.chrStart.size()>0) mapGen.chrLength.push_back(N-mapGen.chrStart.back()); //true length of the chr


                if (N>0) {//pad the chromosomes to bins boudnaries
                    N = ( (N+1)/mapGen.genomeChrBinNbases+1 )*mapGen.genomeChrBinNbases;
                };

                if (!flagRun) {
                    mapGen.chrStart.push_back(N);
                    P.inOut->logMain << mapGen.pGe.gFastaFiles.at(ii)<<" : chr # " << mapGen.chrStart.size()-1   \
                    		         << "  \""<< mapGen.chrName.back() <<"\""       \
									 << " chrStart: "<<N<<"\n"<<flush;
                };
            } else {//char lines
                if (flagRun) {
                    N += convertNucleotidesToNumbersRemoveControls(lineIn.c_str(),G+N,lineIn.size());
                } else {
                    for (uint ii=0; ii<lineIn.size();ii++) {
                        if (int(lineIn.at(ii))>=32)
                            ++N;
                    };
                };

            };
        };
        fileIn.close();
    };

    if (!flagRun)
        mapGen.chrLength.push_back(N-mapGen.chrStart.back()); //true length of the last chr

    N = ( (N+1)/mapGen.genomeChrBinNbases+1)*mapGen.genomeChrBinNbases;

    if (!flagRun) {
        mapGen.nChrReal=mapGen.chrStart.size();
        mapGen.chrStart.push_back(N); //last chromosome end+1
        P.inOut->logMain << "Chromosome sequence lengths: \n";
        for (uint ii=0;ii<mapGen.nChrReal;ii++) {
            mapGen.chrNameIndex[mapGen.chrName[ii]]=ii;
            P.inOut->logMain <<  mapGen.chrName[ii] <<"\t"<< mapGen.chrLength[ii] << "\n";
        };
    };

    return N;
};