File: phylipFormat.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (138 lines) | stat: -rw-r--r-- 4,448 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// $Id: phylipFormat.cpp 962 2006-11-07 15:13:34Z privmane $

#include "phylipFormat.h"
#include "someUtil.h"
#include "errorMsg.h"
#include "logFile.h"

// Reads PHYLIP-formatted sequences from infile, interpreting them with alph.
// The parsing itself is delegated to readUnAligned(); afterwards
// makeSureAllSeqAreSameLengthAndGetLen() is invoked on the parsed container
// before it is handed back to the caller.
sequenceContainer phylipFormat::read(istream &infile, const alphabet* alph){
	sequenceContainer parsedSeqs(readUnAligned(infile, alph));
	parsedSeqs.makeSureAllSeqAreSameLengthAndGetLen();
	return parsedSeqs;
}
// Parses PHYLIP-formatted sequences from infile without checking that the
// resulting sequences all have the same length.
//
// Layout expected by this parser:
//   * The first stored line holds two integers: the number of sequences and
//     the sequence length.
//   * Each of the following non-empty lines, until that many sequences have
//     been added, supplies a name (all characters up to the first space)
//     followed by sequence characters (spaces are skipped).
//   * Every later non-empty line is a continuation block. Continuation lines
//     are assigned to the already-read sequences in order, by taking the
//     running line counter modulo the number of sequences. A continuation
//     line contributes no characters once its target sequence has already
//     reached the declared length.
//
// infile : stream to read from (consumed via putFileIntoVectorStringArray).
// alph   : alphabet handed to every sequence that is constructed.
// Returns the container of parsed sequences. Problems are reported through
// errorMsg::reportError; if that call returns, the sequences gathered so far
// are returned.
sequenceContainer phylipFormat::readUnAligned(istream &infile, const alphabet* alph){
	sequenceContainer mySeqData;

	vector<string> seqFileData;
	putFileIntoVectorStringArray(infile,seqFileData);

	// Without at least one line there is no header to parse, and
	// dereferencing begin() of an empty vector would be undefined behaviour.
	if (seqFileData.empty()) {
		errorMsg::reportError("Error: empty input while reading PHYLIP sequence format");
		return mySeqData;
	}

	vector<string>::const_iterator currentLinePosition = seqFileData.begin();
	string::const_iterator itStr = currentLinePosition->begin();
	string::const_iterator itStrEnd = currentLinePosition->end();

	// Both header values are zero-initialised so they stay well defined even
	// if parsing fails and reportError() returns instead of terminating.
	int f_numSeq = 0;
	bool readSeqNum= fromStringIterToInt(itStr,itStrEnd,f_numSeq);
	if (readSeqNum == false) errorMsg::reportError("Error reading number of sequences while reading PHYLIP sequence format");
	int f_seqLength = 0;
	bool readSeqLen= fromStringIterToInt(itStr,itStrEnd,f_seqLength);
	if (readSeqLen == false) errorMsg::reportError("Error reading the sequences length while reading PHYLIP sequence format");
	++currentLinePosition; // we read the first line.

	int localid=0; // counts the non-empty data lines processed so far
	while (currentLinePosition != seqFileData.end()) {
		if (currentLinePosition->empty()) {++currentLinePosition;continue;} // skip empty lines

		// A data line together with a non-positive sequence count would lead
		// to a modulo by zero (or to indexing an empty container) below.
		if (f_numSeq <= 0) {
			errorMsg::reportError("Error: the number of sequences must be positive while reading PHYLIP sequence format");
			return mySeqData;
		}

		const string& line = *currentLinePosition;
		string::const_iterator it2 = line.begin();

		if (mySeqData.numberOfSeqs() < f_numSeq ) {
			// First block: the line carries a name and the start of a sequence.
			string name1;
			string stringSeq1;
			string remark; // always empty for this format
			for (; it2 != line.end() && (*it2) != ' '; ++it2) {
				name1+=(*it2);
			}
			for (; it2 != line.end(); ++it2) {
				if ((*it2) != ' ') stringSeq1+=(*it2);
			}
			mySeqData.add(sequence(stringSeq1,name1,remark,localid,alph));
		}
		else {
			// Continuation block: append the line to an existing sequence.
			const int sequenceId=localid%f_numSeq;
			string stringSeq1;
			// The length test does not change while the line is scanned (the
			// characters are appended only afterwards), so it is done once.
			if (mySeqData[sequenceId].seqLen() < f_seqLength) {
				for (; it2 != line.end(); ++it2) {
					if ((*it2) != ' ') stringSeq1+=(*it2);
				}
			}
			sequence tmp(stringSeq1,"","",sequenceId,alph);
			mySeqData[sequenceId] += tmp;
		}
		++currentLinePosition;
		++localid;
	}
	return mySeqData;
}

// Writes the sequences of sd to out in interleaved PHYLIP format.
//
// out                 : destination stream.
// sd                  : sequences to write.
// numOfPositionInLine : number of sequence positions written per line of a
//                       block; must be positive when there is anything to
//                       write.
// spaceEvery          : a space is inserted after every spaceEvery positions
//                       (except at the end of a line); 0 disables the spacing.
//
// Output layout: a header line "<number of sequences>   <sequence length>",
// then one block per numOfPositionInLine positions. Every line starts with an
// 11-character name column. In the first block it holds the first 10
// characters of the name padded with spaces; in later blocks it is blank.
// A sequence shorter than numOfPositionInLine is written whole, without
// spacing, via toString().
//
// If any name is longer than 10 characters a warning is logged, since names
// are truncated in the output.
void phylipFormat::write(ostream &out, const sequenceContainer& sd,
						 const int numOfPositionInLine,
						 const int spaceEvery) {
	// Width of the PHYLIP name column (excluding the separating space).
	const string::size_type maxLengthOfSeqName = 10;

	bool someNameTooLong = false;
	for (sequenceContainer::constTaxaIterator itName=sd.constTaxaBegin();
		 itName!=sd.constTaxaEnd(); ++itName) {
		if (itName->name().size() > maxLengthOfSeqName) {
			someNameTooLong = true;
			break;
		}
	}
	if (someNameTooLong) {
		LOG(1,<<"you asked to print in phylip format\n");
		LOG(1,<<"however, the names in phylip format\n");
		LOG(1,<<"must be no more than 10 characters.\n");
		LOG(1,<<"Names are hence trancated to ten   \n");
		LOG(1,<<"characters. Notice, that this might\n");
		LOG(1,<<"result in a two or more sequences  \n");
		LOG(1,<<"having the same name               \n");
	}

	out<<sd.numberOfSeqs()<<"   "<<sd.seqLen();
	if (sd.constTaxaBegin()==sd.constTaxaEnd()) return;

	// With a non-positive line width the block loop below would never advance
	// and would therefore never terminate.
	if (numOfPositionInLine <= 0 && sd.seqLen() > 0) {
		errorMsg::reportError("Error: the number of positions per line must be positive while writing PHYLIP sequence format");
		return;
	}

	int currentPosition = 0;
	while (currentPosition < sd.seqLen() ) {
		out<<endl; // ends the header line / separates consecutive blocks
		// Names are printed only in the first block.
		const bool printNames = (currentPosition<numOfPositionInLine);
		for (sequenceContainer::constTaxaIterator it5=sd.constTaxaBegin();it5!=sd.constTaxaEnd();++it5) {
			const string& seqName = it5->name();
			for (string::size_type iName = 0 ;iName<maxLengthOfSeqName; ++iName) {
				if (printNames && iName<seqName.size()) out<<seqName[iName];
				else out<<" ";
			}
			out<<" ";

			if (it5->seqLen()<numOfPositionInLine)
				out<<it5->toString()<<endl;
			else {
				for (int k=currentPosition; k < currentPosition+numOfPositionInLine; ++k) {
					if (k>=it5->seqLen()) break;
					out<<it5->toString(k);
					// spaceEvery == 0 would be a modulo by zero; treat it as
					// "no spacing".
					if ((spaceEvery != 0) && ((k+1)%spaceEvery==0) && ((k+1)%numOfPositionInLine!=0)) out<<" ";
				}
				out<<endl;
			}
		}
		currentPosition +=numOfPositionInLine;
	}
	return;
}