File: nexusFormat.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (152 lines) | stat: -rw-r--r-- 5,275 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// $Id: nexusFormat.cpp 5987 2009-03-18 18:13:53Z itaymay $

#include "nexusFormat.h"
#include "someUtil.h"
#include "errorMsg.h"
#include <map>

sequenceContainer nexusFormat::read(istream &infile, const alphabet* pAlph) {
	sequenceContainer mySeqData = readUnAligned(infile, pAlph);
	mySeqData.makeSureAllSeqAreSameLengthAndGetLen();
	return mySeqData;
}

sequenceContainer nexusFormat::readUnAligned(istream &infile, const alphabet* pAlph) {
	if (!infile) {
		errorMsg::reportError("unable to read mase format, could not open file");
	}
	sequenceContainer mySeqData;;

	vector<string> seqFileData;
	putFileIntoVectorStringArray(infile,seqFileData);

	vector<string>::const_iterator it1 = seqFileData.begin();
	// make sure that the first 6 chars in the first line is #NEXUS
	if (it1->size()<6) errorMsg::reportError("first word in a nexus sequence file format must be #NEXUS",1);
	if (  ((*it1)[0] != '#') 
		|| (((*it1)[1] != 'N') && ((*it1)[1] != 'n'))
		|| (((*it1)[2] != 'E') && ((*it1)[2] != 'e'))
		|| (((*it1)[3] != 'X') && ((*it1)[3] != 'x'))
		|| (((*it1)[4] != 'U') && ((*it1)[4] != 'u'))
		|| (((*it1)[5] != 'S') && ((*it1)[5] != 's')) ) {
		errorMsg::reportError("first word in a nexus sequence file format must be #NEXUS",1);
	}
	it1++;

	while ( ( (*it1).find("matrix")  == -1) && ( (*it1).find("MATRIX")  == -1) && (it1!= seqFileData.end()))
	{ //check for the word matrix
		++it1;
	}
	
	int localid=0;
	//int x1 = ((*it1).find("matrix") != -1);
	//int x2 = ((*it1).find("MATRIX") != -1);
	if (((*it1).find("matrix") != -1) || ((*it1).find("MATRIX") != -1))
	{
		//taken from clustalFormat:
        //In case of codon alpahabet we cannot add a seqeunce that is not dividable by 3.
		//In this case the last nucleotides in each line (zero, one or two) 
		//should be saved. The next time the same sequence name appears - 
		//these saveed nucleotidea and are added to the begining of the line.
		map<string ,string> stringsToAdd;   


		for (++it1; it1 != seqFileData.end() ; ++it1)
		{
			if (((*it1).find("end;") != -1) || ((*it1).find("END;") != -1))
				break;
			if (it1->empty() || ((*it1).find(';') != -1)) 
			{ // empty line constinue
				continue;
			}
			sequence seq(pAlph);
			
			string taxonName;
			string remark;
			string stringSeq;
			bool beforeName = true;
			string::const_iterator stringIt = (it1)->begin();
			for (; stringIt != (it1)->end(); ++stringIt)
			{ //first loop finds the taxon name
				if ( ((*stringIt) == ' ') || ((*stringIt) == '\t'))
					if (beforeName == true)
						continue; //spaces before taxon name are legal
					else
						break; //A space marks the end of the taxon name 
				else 
				{
					taxonName += (*stringIt);
					beforeName = false;
				}
			}

			//check if a new sequence.
			//if the name already exists then init stringSeq with the nucleotide from the previous line of the same sequence
			if (stringsToAdd.find(taxonName)!=stringsToAdd.end()) 
				stringSeq = stringsToAdd[taxonName]; 

			for (; stringIt != (it1)->end(); ++stringIt) 
			{//second loop finds the sequecne
				if ( ((*stringIt)==' ') || 	((*stringIt) == '\t'))
					continue;
				else stringSeq += (*stringIt);
			}
			
			//when alphabet is codon stringSeq must be dividable by 3.
			// 1. save the reminder (0,1 or 2 last nucleotides) in stringToAdd
			// 2. substr the reminder from the sequence line.
			// 3. keep stringToAdd in map (according the name) to be added later.
			string stringToAdd="";
			if (pAlph->size()>=60){	// codon?
				if ((stringSeq.size()%3)==1){  //add the last nucleotide to the next line
					stringToAdd += stringSeq[stringSeq.size()-1];
					stringSeq = stringSeq.substr(0,stringSeq.size()-1);
				}
				if ((stringSeq.size() % 3) == 2){ //add the 2 last nucleotide to the next line
					stringToAdd+=stringSeq[stringSeq.size()-2];
					stringToAdd+=stringSeq[stringSeq.size()-1];
					stringSeq = stringSeq.substr(0, stringSeq.size() - 2);
				}
			}
			stringsToAdd[taxonName] = stringToAdd; //update the map with the new stringToAdd 
			//add sequence to container
			int id = mySeqData.getId(taxonName, false);
            if (id==-1) { // new sequence.
                mySeqData.add(sequence(stringSeq, taxonName,remark,localid, pAlph));
                localid++;
			}
			else {// the sequence is already there...
				sequence tmp(stringSeq,taxonName, remark, id, pAlph);
				mySeqData[id].operator += (tmp);
			}
		}
	}
	else
	{
		errorMsg::reportError("no sequence data in nexus file - no matrix keyword found");
	}
	
	return mySeqData;
}

void nexusFormat::write(ostream &out, const sequenceContainer& sc) {
	//vector<string> gfr = sd.getGeneralFileRemarks();
	//if (gfr.empty()) out<<";;\n;;\n";
	//for (vector<string>::const_iterator k=gfr.begin() ; k !=  gfr.end() ; ++k )
	//	out<<(*k)<<endl;
	out<<"#NEXUS"<<endl;
	out<<"begin data;"<<endl;
	out<<"dimensions ntax="<<sc.numberOfSeqs()<<" nchar="<<sc.seqLen() <<";"<<endl;
	if (sc.alphabetSize() == 4) 
		out<<"format datatype=dna gap=-;"<<endl;
	else
		out<<"format datatype=protein gap=-;"<<endl;
	out<<"matrix"<<endl;

	for (sequenceContainer::constTaxaIterator itSeq=sc.constTaxaBegin();itSeq!=sc.constTaxaEnd();++itSeq) {
		out<<"\t"<<itSeq->name()<<"\t"<<itSeq->toString()<<endl;
	}
	out<<";"<<endl;
	out<<"end;"<<endl;
}