1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
|
/*
Copyright (C) 2003 Ronald C Beavis, all rights reserved
X! tandem
This software is a component of the X! proteomics software
development project
Use of this software governed by the Artistic license, as reproduced here:
The Artistic License for all X! software, binaries and documentation
Preamble
The intent of this document is to state the conditions under which a
Package may be copied, such that the Copyright Holder maintains some
semblance of artistic control over the development of the package,
while giving the users of the package the right to use and distribute
the Package in a more-or-less customary fashion, plus the right to
make reasonable modifications.
Definitions
"Package" refers to the collection of files distributed by the Copyright
Holder, and derivatives of that collection of files created through
textual modification.
"Standard Version" refers to such a Package if it has not been modified,
or has been modified in accordance with the wishes of the Copyright
Holder as specified below.
"Copyright Holder" is whoever is named in the copyright or copyrights
for the package.
"You" is you, if you're thinking about copying or distributing this Package.
"Reasonable copying fee" is whatever you can justify on the basis of
media cost, duplication charges, time of people involved, and so on.
(You will not be required to justify it to the Copyright Holder, but
only to the computing community at large as a market that must bear
the fee.)
"Freely Available" means that no fee is charged for the item itself,
though there may be fees involved in handling the item. It also means
that recipients of the item may redistribute it under the same
conditions they received it.
1. You may make and give away verbatim copies of the source form of the
Standard Version of this Package without restriction, provided that
you duplicate all of the original copyright notices and associated
disclaimers.
2. You may apply bug fixes, portability fixes and other modifications
derived from the Public Domain or from the Copyright Holder. A
Package modified in such a way shall still be considered the Standard
Version.
3. You may otherwise modify your copy of this Package in any way, provided
that you insert a prominent notice in each changed file stating how and
when you changed that file, and provided that you do at least ONE of the
following:
a. place your modifications in the Public Domain or otherwise make them
Freely Available, such as by posting said modifications to Usenet
or an equivalent medium, or placing the modifications on a major
archive site such as uunet.uu.net, or by allowing the Copyright Holder
to include your modifications in the Standard Version of the Package.
b. use the modified Package only within your corporation or organization.
c. rename any non-standard executables so the names do not conflict
with standard executables, which must also be provided, and provide
a separate manual page for each non-standard executable that clearly
documents how it differs from the Standard Version.
d. make other distribution arrangements with the Copyright Holder.
4. You may distribute the programs of this Package in object code or
executable form, provided that you do at least ONE of the following:
a. distribute a Standard Version of the executables and library files,
together with instructions (in the manual page or equivalent) on
where to get the Standard Version.
b. accompany the distribution with the machine-readable source of the
Package with your modifications.
c. give non-standard executables non-standard names, and clearly
document the differences in manual pages (or equivalent), together
with instructions on where to get the Standard Version.
d. make other distribution arrangements with the Copyright Holder.
5. You may charge a reasonable copying fee for any distribution of
this Package. You may charge any fee you choose for support of
this Package. You may not charge a fee for this Package itself.
However, you may distribute this Package in aggregate with other
(possibly commercial) programs as part of a larger (possibly
commercial) software distribution provided that you do not
advertise this Package as a product of your own. You may embed this
Package's interpreter within an executable of yours (by linking);
this shall be construed as a mere form of aggregation, provided that
the complete Standard Version of the interpreter is so embedded.
6. The scripts and library files supplied as input to or produced as
output from the programs of this Package do not automatically fall
under the copyright of this Package, but belong to whomever generated
them, and may be sold commercially, and may be aggregated with this
Package. If such scripts or library files are aggregated with this
Package via the so-called "undump" or "unexec" methods of producing
a binary executable image, then distribution of such an image shall
neither be construed as a distribution of this Package nor shall it
fall under the restrictions of Paragraphs 3 and 4, provided that you
do not represent such an executable image as a Standard Version of
this Package.
7. C subroutines (or comparably compiled subroutines in other languages)
supplied by you and linked into this Package in order to emulate
subroutines and variables of the language defined by this Package
shall not be considered part of this Package, but are the equivalent
of input as in Paragraph 6, provided these subroutines do not change
the language in any way that would cause it to fail the regression
tests for the language.
8. Aggregation of this Package with a commercial distribution is always
permitted provided that the use of this Package is embedded; that is,
when no overt attempt is made to make this Package's interfaces visible
to the end user of the commercial distribution. Such use shall not be
construed as a distribution of this Package.
9. The name of the Copyright Holder may not be used to endorse or promote
products derived from this software without specific prior written permission.
10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
The End
*/
// File version: 2003-07-01
// File version: 2004-03-01
/*
* msequenceserver takes a sequence list file (in FASTA format) and uses it to load an
* msequencecontainer object with a set of amino acid sequences and descriptions. the container
* can be loaded repeatedly until the end-of-file is reached. the next list file path in a
* deque of file names is then extracted and the process continues until the last
* sequence from the last file in the deque is read into the msequencecontainer.
*/
#include "stdafx.h"
#include "msequence.h"
#include "msequencecollection.h"
#include "msequenceserver.h"
#include "xmltaxonomy.h"
#include <ctime>
#include "string.h"
/*
 * constructs a server in its idle state: not started, not done, no error, FASTA as the
 * assumed file type and no input stream attached. the sequence collection and the line
 * buffer used while reading sequence list files are allocated here.
 */
msequenceServer::msequenceServer(void)
{
	m_pCol = new msequenceCollection;
	// no file is open yet: give the stream pointer a defined value instead of leaving it indeterminate
	m_pInput = NULL;
	m_bStarted = false;
	m_bDone = false;
	m_bError = false;
	m_strStatus = "msequenceServer initialized\n";
	m_tColMax = 1000;
	m_tStartAt = 1;
	m_dTime = 0.0;
	m_lFileType = FASTA;
	// 2006.11.21 - increased the size from 32*4096 to 512*1024 because of very long lines in nr FASTA files
	m_lSize = 512*1024-1;
	// one byte more than m_lSize so that a terminating '\0' always fits
	m_pLine = new char[m_lSize+1];
}
/*
 * releases the sequence collection and the line buffer owned by the server
 */
msequenceServer::~msequenceServer(void)
{
	// deleting a NULL pointer is a no-op, so no guard is required
	delete m_pCol;
	// the line buffer is an array allocated with new[], so it has to be released with delete[]
	delete[] m_pLine;
}
/*
 * empties the sequence vector of the collection object.
 * returns false if there is no collection object, true otherwise
 */
bool msequenceServer::clear()
{
	const bool bHaveCollection = (m_pCol != NULL);
	if(bHaveCollection) {
		m_pCol->m_vASequences.clear();
	}
	return bHaveCollection;
}
/*
 * reports the done flag: true once the server has nothing further to deliver
 */
bool msequenceServer::done()
{
	return m_bDone;
}
/*
* returns true if the server is in an error conditions
*/
bool msequenceServer::error(void)
{
return m_bError;
}
/*
 * called to finish the sequence loading process: marks the server as done, closes the
 * current input file if one is open and appends a note to the status string.
 * returns the done flag, which is always true at that point
 */
bool msequenceServer::finish(void)
{
	m_bDone = true;
	// start() leaves m_pInput NULL when a file cannot be opened; fclose() must not be given
	// a NULL stream, nor the same stream twice, so close only once and forget the pointer
	if(m_pInput != NULL) {
		fclose(m_pInput);
		m_pInput = NULL;
	}
	m_strStatus += "Server finished properly\n";
	return m_bDone;
}
/*
 * returns the accumulated sequence retrieval time, converted from clock ticks to seconds
 */
double msequenceServer::get_time()
{
	const double dTicksPerSecond = static_cast<double>(CLOCKS_PER_SEC);
	return m_dTime/dTicksPerSecond;
}
/*
 * initializes the sequence collection with _t and records _t as the maximum number of
 * sequences handled per refill. always returns true
 */
bool msequenceServer::initialize(const size_t _t)
{
	m_pCol->initialize(_t);
	m_tColMax = _t;

	return true;
}
/*
* retreives the list of FASTA sequence list files using an XmlTaxonomy object
* and a taxon string. the list of file paths is stored in the m_dstrFasta deque and
* the m_vstrFasta vector
*/
long msequenceServer::load_file(const string &_p,const string &_t)
{
m_strTaxonPath = _p;
m_strTaxon = _t;
XmlTaxonomy xmlTax;
string strType = "peptide";
if(!xmlTax.load(m_strTaxonPath,m_strTaxon,strType))
return 1;
size_t a = 0;
ifstream ifTest;
m_vstrFasta.clear();
long lFailed = 0;
while(a < xmlTax.m_vstrPaths.size()) {
ifTest.open(xmlTax.m_vstrPaths[a].c_str());
if(!ifTest.fail()) {
m_dstrFasta.push_back(xmlTax.m_vstrPaths[a]);
m_vstrFasta.push_back(xmlTax.m_vstrPaths[a]);
ifTest.close();
}
else {
lFailed++;
}
ifTest.clear();
a++;
}
if(m_dstrFasta.empty()) {
if(lFailed != 0) {
return 3;
}
return 2;
}
return 0;
}
/*
 * helper for next(): replaces a line buffer by a larger one. allocates _tNewSize + 1 bytes,
 * copies the first _tUsed bytes of _pOld, terminates the copy with '\0' and releases _pOld,
 * which must have been allocated with new[]. returns the new buffer
 */
static char *msequenceserver_grow_line(char *_pOld,const size_t _tUsed,const size_t _tNewSize)
{
	char *pNew = new char[_tNewSize + 1];
	memcpy(pNew,_pOld,_tUsed);
	pNew[_tUsed] = '\0';
	delete[] _pOld;
	return pNew;
}
/*
 * refill the msequencecontainer object with the next set of sequences and descriptions.
 * _f selects the work done for FASTA files: if true, descriptions and sequences are stored
 * in the collection; if false, the call is handed to next_l(). files detected as XBANG
 * are always handed to next_pro(_f).
 * returns the number of entries handled by this call; 0 if the server is done or
 * could not be started
 */
unsigned long msequenceServer::next(const bool _f)
{
/*
 * exit on completion
 */
	if(done())
		return 0;
/*
 * start the reading process, if it hasn't been started
 */
	if(!started()) {
		if(!start()) {
			m_bDone = true;
			m_bError = true;
			m_strStatus += "Server would not start.\r\n";
			return 0;
		}
	}
	if(m_lFileType == XBANG)
		return next_pro(_f);
	if(!_f)
		return next_l();
/*
 * initialized the time
 */
	double dStart = clock();
	unsigned long iLength = 0;
	char cValue = '\0';
	char *pValue = NULL;
	char *pEol = NULL;
	char *pRead = NULL;
	m_pCol->clear();
	while(!feof(m_pInput) && iLength < m_pCol->m_tMax) {
/*
 * the description of this entry was obtained in the previous read
 */
		m_pCol->m_vASequences[iLength].m_strDes = m_strFirst;
/*
 * read the first line after the description at the start of the buffer
 */
		pValue = m_pLine;
		pRead = fgets(pValue,(int)m_lSize,m_pInput);
/*
 * append sequence lines, stripped of trailing whitespace, until the next description
 * line is encountered or nothing more can be read. testing the fgets result rather than
 * feof keeps a final line that is not terminated by a newline
 */
		while(pRead != NULL && pValue[0] != '>') {
			char *pEnd = pValue + strlen(pValue);
			while(pEnd > m_pLine && isspace((unsigned char)pEnd[-1])) {
				pEnd--;
			}
			*pEnd = '\0';
			pValue = pEnd;
			size_t tUsed = (size_t)(pValue - m_pLine);
/*
 * keep at least 1024 bytes free for the next read, enlarging the buffer if necessary
 */
			if((size_t)m_lSize - tUsed < 1024) {
				m_pLine = msequenceserver_grow_line(m_pLine,tUsed,(size_t)m_lSize + 1024*256);
				m_lSize = m_lSize + 1024*256;
				pValue = m_pLine + tUsed;
			}
/*
 * the read starts at an offset into the buffer, so only the remaining space may be offered
 */
			pRead = fgets(pValue,(int)((size_t)m_lSize - tUsed),m_pInput);
		}
/*
 * temporarily terminate the buffer at the end of the sequence, store it, then restore
 */
		cValue = *pValue;
		*pValue = '\0';
		bz(m_pLine);
		m_pCol->m_vASequences[iLength].m_strSeq = m_pLine;
		m_pCol->m_vASequences[iLength].m_siPath = (short int)(m_vstrPaths.size() - 1);
		*pValue = cValue;
/*
 * store the next description line
 */
		if(pValue[0] == '>') {
			if(pRead != NULL) {
/*
 * a description line longer than the free space arrives in pieces: enlarge the buffer
 * and keep reading until the end of the line has been seen
 */
				while(strchr(pValue,'\n') == NULL && !feof(m_pInput)) {
					const size_t tStart = (size_t)(pValue - m_pLine);
					const size_t tFilled = tStart + strlen(pValue);
					if((size_t)m_lSize - tFilled < 1024) {
						m_pLine = msequenceserver_grow_line(m_pLine,tFilled,(size_t)m_lSize + 1024*256);
						m_lSize = m_lSize + 1024*256;
						pValue = m_pLine + tStart;
					}
					if(fgets(m_pLine + tFilled,(int)((size_t)m_lSize - tFilled),m_pInput) == NULL) {
						break;
					}
				}
			}
/*
 * cut the description at the first 0x01 character if there is one (presumably the
 * separator between merged descriptions - confirm), otherwise strip trailing whitespace
 */
			if(strchr(pValue,0x01)) {
				pEol = strchr(pValue,0x01);
				*pEol = '\0';
			}
			else {
				pEol = pValue + strlen(pValue) - 1;
				while(pEol > pValue && isspace((unsigned char)*pEol)) {
					*pEol = '\0';
					pEol--;
				}
			}
			pEol = strchr(pValue,'\r');
			if(pEol) {
				*pEol = '\0';
			}
			pEol = strchr(pValue,'\n');
			if(pEol) {
				*pEol = '\0';
			}
/*
 * skip the leading '>'
 */
			m_strFirst = pValue + 1;
		}
		m_pCol->m_tLength++;
		iLength++;
	}
/*
 * if the current sequence list file is finished, close it and get the next one, otherwise finish
 */
	if(feof(m_pInput)) {
		if(m_dstrFasta.empty()) {
			finish();
		}
		else {
			fclose(m_pInput);
			start();
		}
	}
/*
 * store the time required to load the msequencecontainer
 */
	m_dTime += (double)(clock() - dStart);
	return iLength;
}
/*
 * rewrites a '\0'-terminated residue string in place, replacing every 'B' by 'N',
 * every 'Z' by 'Q' and every 'J' by 'L'. all other characters, including 'X', are left
 * unchanged (an earlier X -> '*' substitution is disabled).
 * returns false if _p is NULL, true otherwise
 */
bool msequenceServer::bz(char *_p)
{
	if(_p == NULL) {
		return false;
	}
	// one pass over the string; the replacement letters are never themselves replaced,
	// so the result is the same as searching for each letter in turn
	for(char *pScan = _p; *pScan != '\0'; pScan++) {
		switch(*pScan) {
			case 'B':
				*pScan = 'N';
				break;
			case 'Z':
				*pScan = 'Q';
				break;
			case 'J':
				*pScan = 'L';
				break;
			default:
				break;
		}
	}
	return true;
}
/*
 * refill the msequencecontainer object with the next set of sequences and descriptions,
 * reading records as they are laid out in the open file: a 4 byte length followed by
 * that many bytes of description, then a 4 byte length followed by that many bytes of
 * sequence. if _f is false the records are read but descriptions and sequences are
 * not stored.
 * returns the number of records handled by this call; 0 if the server is done or
 * could not be started
 */
unsigned long msequenceServer::next_pro(const bool _f)
{
/*
 * exit on completion
 */
	if(done())
		return 0;
/*
 * start the reading process, if it hasn't been started
 */
	if(!started()) {
		if(!start()) {
			m_bDone = true;
			m_bError = true;
			m_strStatus += "Server would not start.\r\n";
			return 0;
		}
	}
/*
 * initialized the time
 */
	double dStart = clock();
	unsigned long iLength = 0;
	unsigned long lLength = 0;
	size_t tS = 0;
	m_pCol->clear();
	while(!feof(m_pInput) && iLength < m_tColMax) {
/*
 * read the length of the description, then the description itself
 */
		tS = fread(&lLength,4,1,m_pInput);
		if(feof(m_pInput)) {
			break;
		}
#ifdef OSX
		lLength = mac_rev(lLength);
#endif
		if(lLength > m_lSize) {
			// allocate before releasing so that m_pLine never dangles if the allocation fails
			char *pGrown = new char[lLength + 1024 + 1];
			delete[] m_pLine;
			m_pLine = pGrown;
			m_lSize = (unsigned long)(lLength + 1024);
		}
		tS = fread(m_pLine,lLength,1,m_pInput);
		if(feof(m_pInput)) {
			break;
		}
		// the buffer is used as a C string below: make sure it is terminated
		m_pLine[lLength] = '\0';
		if(_f) {
			m_pCol->m_vASequences[iLength].m_strDes = m_pLine;
		}
/*
 * read the length of the sequence, then the sequence itself
 */
		tS = fread(&lLength,4,1,m_pInput);
#ifdef OSX
		lLength = mac_rev(lLength);
#endif
		if(lLength > m_lSize) {
			char *pGrown = new char[lLength + 1024 + 1];
			delete[] m_pLine;
			m_pLine = pGrown;
			m_lSize = (unsigned long)(lLength + 1024);
		}
		tS = fread(m_pLine,lLength,1,m_pInput);
		if(feof(m_pInput)) {
			break;
		}
		m_pLine[lLength] = '\0';
		if(_f) {
			bz(m_pLine);
			m_pCol->m_vASequences[iLength].m_strSeq = m_pLine;
			m_pCol->m_vASequences[iLength].m_siPath = (short int)(m_vstrPaths.size() - 1);
		}
		m_pCol->m_vASequences[iLength].m_mapMods.clear();
		m_pCol->m_tLength++;
		iLength++;
	}
/*
 * if the current sequence list file is finished, close it and get the next one, otherwise finish
 */
	if(feof(m_pInput)) {
		if(m_dstrFasta.empty()) {
			finish();
		}
		else {
			fclose(m_pInput);
			start();
		}
	}
/*
 * store the time required to load the msequencecontainer
 */
	m_dTime += (double)(clock() - dStart);
	return iLength;
}
/*
 * mac_rev was added in version 2004.04.01 so that the OSX version can read files
 * in .pro format that were compiled on a windows or linux box. mac OSX uses the
 * reverse format for reading integers from the disk, so the bytes in the
 * integer have to be reversed.
 * the four low-order bytes of _l are returned in reverse order; any higher bytes of the
 * result are zero. shifts and masks are used so that every bit of the result is defined,
 * whatever the size of unsigned long
 */
unsigned long msequenceServer::mac_rev(const unsigned long _l)
{
	return ((_l & 0x000000FFUL) << 24)
		| ((_l & 0x0000FF00UL) << 8)
		| ((_l & 0x00FF0000UL) >> 8)
		| ((_l & 0xFF000000UL) >> 24);
}
/*
 * steps through the open sequence list file without storing anything in the
 * msequencecontainer: for each entry, lines are read until the next description line
 * (one starting with '>') and that description is kept in m_strFirst.
 * returns the number of entries stepped over by this call; 0 if the server is done or
 * could not be started
 */
unsigned long msequenceServer::next_l(void)
{
/*
 * exit on completion
 */
	if(done())
		return 0;
/*
 * start the reading process, if it hasn't been started
 */
	if(!started()) {
		if(!start()) {
			m_bDone = true;
			m_bError = true;
			m_strStatus += "Server would not start.\r\n";
			return 0;
		}
	}
/*
 * initialized the time
 */
	double dStart = clock();
	unsigned long iLength = 0;
	const int iSize = (10*4096)-1;
	char *pEol = NULL;
	char *pRead = NULL;
/*
 * the buffer is zero-filled so that it holds an empty string if the first read fails
 */
	char *pLine = new char[iSize+1]();
	while(!feof(m_pInput) && iLength < m_pCol->m_tMax) {
/*
 * read lines until the next description line is encountered or nothing more can be read
 */
		pRead = fgets(pLine,iSize,m_pInput);
		while(pRead != NULL && pLine[0] != '>' && !feof(m_pInput)) {
			pRead = fgets(pLine,iSize,m_pInput);
		}
/*
 * store the next description line: cut it at the first 0x01 character if there is one,
 * otherwise strip trailing whitespace
 */
		if(pLine[0] == '>') {
			if(strchr(pLine,0x01)) {
				pEol = strchr(pLine,0x01);
				*pEol = '\0';
			}
			else {
				pEol = pLine + strlen(pLine) - 1;
				while(pEol > pLine && isspace((unsigned char)*pEol)) {
					*pEol = '\0';
					pEol--;
				}
			}
			pEol = strchr(pLine,'\r');
			if(pEol) {
				*pEol = '\0';
			}
			pEol = strchr(pLine,'\n');
			if(pEol) {
				*pEol = '\0';
			}
			m_strFirst = pLine + 1;
		}
		iLength++;
	}
/*
 * the buffer is an array, so it has to be released with delete[]
 */
	delete[] pLine;
/*
 * if the current sequence list file is finished, close it and get the next one, otherwise finish
 */
	if(feof(m_pInput)) {
		if(m_dstrFasta.empty()) {
			finish();
		}
		else {
			fclose(m_pInput);
			start();
		}
	}
/*
 * store the time required to load the msequencecontainer
 */
	m_dTime += (double)(clock() - dStart);
	return iLength;
}
/*
 * start the process of loading a sequence list file: takes the next path from the
 * m_dstrFasta deque, opens it and decides from its first 256 bytes whether it is an
 * XBANG file (contains "xbang-pro-fasta-format") or a FASTA file (first character '>').
 * for FASTA files the file is reopened in text mode and the first description line is
 * stored in m_strFirst.
 * returns true if a file is open and ready for reading. returns false if there are no
 * more paths, or if the file could not be opened or recognized; in the latter cases the
 * error flag is set, a warning is written to cout and no file is left open
 */
bool msequenceServer::start(void)
{
	m_bStarted = false;
/*
 * return false if there are no more sequence list files
 */
	if(m_dstrFasta.empty()) {
		return false;
	}
	m_strPath = m_dstrFasta.front();
	m_dstrFasta.pop_front();
	m_vstrPaths.push_back(m_strPath);
/*
 * open the file
 */
	m_pInput = fopen(m_strPath.c_str(),"rb");
	if(m_pInput == NULL) {
		m_bError = true;
		m_strStatus = "\n*********\nWarning:\n Sequence list path '";
		m_strStatus += m_strPath;
		m_strStatus += "'\n could not be opened and was skipped.\n*********\n\n";
		cout << m_strStatus.c_str();
		return m_bStarted;
	}
/*
 * read up to 256 bytes for format detection. the area is cleared first and the bytes are
 * read one at a time so that a file shorter than 256 bytes still leaves a well defined,
 * '\0' terminated buffer for the string functions below
 */
	const size_t tHeader = 256;
	memset(m_pLine,0,tHeader + 1);
	size_t tS = fread(m_pLine,1,tHeader,m_pInput);
	m_pLine[tS] = '\0';
	string strDesc = "no description";
	if(strstr(m_pLine,"xbang-pro-fasta-format") != NULL) {
		m_lFileType = XBANG;
		// a non-empty string at offset 64 of the header is used as the description
		char *pV = m_pLine+64;
		if(strlen(pV) > 0) {
			strDesc = pV;
		}
	}
	else if(m_pLine[0] == '>') {
		fclose(m_pInput);
		m_lFileType = FASTA;
		m_pInput = fopen(m_strPath.c_str(),"r");
		if(m_pInput == NULL) {
			m_bError = true;
			m_strStatus = "\n*********\nWarning:\n Sequence list path '";
			m_strStatus += m_strPath;
			m_strStatus += "'\n could not be opened and was skipped.\n*********\n\n";
			cout << m_strStatus.c_str();
			return m_bStarted;
		}
	}
	else {
		// the file is skipped: close it so that the stream is not leaked, leaving
		// m_pInput NULL exactly as after a failed fopen
		fclose(m_pInput);
		m_pInput = NULL;
		m_lFileType = UNKNOWN;
		m_bError = true;
		m_strStatus = "\n*********\nWarning:\n Sequence list path '";
		m_strStatus += m_strPath;
		m_strStatus += "'\n was not in a recognized file format and was skipped.\n*********\n\n";
		cout << m_strStatus.c_str();
		return m_bStarted;
	}
	m_vstrDesc.push_back(strDesc);
	m_bStarted = true;
	m_strStatus += "Path '";
	m_strStatus += m_strPath;
	m_strStatus += "' was opened.\n";
/*
 * XBANG files need no further preparation
 */
	if(m_lFileType == XBANG)
		return m_bStarted;
/*
 * read down to the first valid FASTA description line
 */
	char *pS = fgets(m_pLine,(int)m_lSize,m_pInput);
	while(pS != NULL && m_pLine[0] != '>' && !feof(m_pInput)) {
		pS = fgets(m_pLine,(int)m_lSize,m_pInput);
	}
	if(m_pLine[0] == '>') {
		char *pEol = NULL;
		// cut the description at the first 0x01 character if there is one,
		// otherwise strip trailing whitespace
		if(strchr(m_pLine,0x01)) {
			pEol = strchr(m_pLine,0x01);
			*pEol = '\0';
		}
		else {
			pEol = m_pLine + strlen(m_pLine) - 1;
			while(pEol > m_pLine && isspace((unsigned char)*pEol)) {
				*pEol = '\0';
				pEol--;
			}
		}
		pEol = strchr(m_pLine,'\r');
		if(pEol) {
			*pEol = '\0';
		}
		pEol = strchr(m_pLine,'\n');
		if(pEol) {
			*pEol = '\0';
		}
		// skip the leading '>'
		m_strFirst = m_pLine+1;
	}
	return m_bStarted;
}
/*
 * reports the started flag: true while a sequence list file has been opened successfully
 */
bool msequenceServer::started()
{
	return m_bStarted;
}
/*
 * returns true only if the server has been started and is not yet done
 */
bool msequenceServer::working()
{
	if(m_bDone) {
		return false;
	}
	return m_bStarted;
}
|